commit a6bd72a278
parent cae28869c1

2008-05-26  Eric Blake  <ebb9@byu.net>

	Optimize the generic and x86 memset.
	* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
	Pre-align pointer so unaligned stores aren't penalized.
	* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
	Pre-align pointer so unaligned stores aren't penalized.  Prefer
	8-byte over 4-byte alignment.  Reduce register pressure.
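Both rewrites follow the same plan: spend a handful of cheap byte stores up front so that the bulk of the work runs as aligned word-sized stores, then finish the remainder a byte at a time. As a reading aid for the diffs below, here is a minimal C sketch of that plan (memset_sketch and everything in it are illustrative names, not newlib code):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the pre-alignment plan used by both diffs
   below; not newlib's actual code.  */
void *
memset_sketch (void *m, int c, size_t n)
{
  unsigned char *s = m;
  unsigned char d = (unsigned char) c;   /* avoid sign extension */

  /* Pre-align: byte stores until s sits on a word boundary, so the
     wide stores below never pay an unaligned-store penalty.  */
  while (n && ((uintptr_t) s & (sizeof (unsigned long) - 1)))
    {
      *s++ = d;
      n--;
    }

  /* Bulk: replicate d across a word, then store whole words.  */
  unsigned long buffer = d;
  buffer |= buffer << 8;
  buffer |= buffer << 16;
  for (unsigned i = 32; i < sizeof buffer * 8; i <<= 1)
    buffer = (buffer << i) | buffer;     /* widens on 64-bit targets */
  unsigned long *w = (unsigned long *) s;
  while (n >= sizeof (unsigned long))
    {
      *w++ = buffer;
      n -= sizeof (unsigned long);
    }

  /* Tail: whatever bytes remain.  */
  s = (unsigned char *) w;
  while (n--)
    *s++ = d;
  return m;
}

The i386 version below specializes each stage: the pre-align loop is unrolled to at most seven movb stores, and the bulk stage is a rep stosl.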
ChangeLog
@@ -1,3 +1,12 @@
+2008-05-26  Eric Blake  <ebb9@byu.net>
+
+	Optimize the generic and x86 memset.
+	* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
+	Pre-align pointer so unaligned stores aren't penalized.
+	* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
+	Pre-align pointer so unaligned stores aren't penalized.  Prefer
+	8-byte over 4-byte alignment.  Reduce register pressure.
+
 2008-05-26  Eric Blake  <ebb9@byu.net>
 
 	Optimize the generic and x86 strlen.
libc/machine/i386/memset.S
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
 	pushl ebp
 	movl esp,ebp
 	pushl edi
-	pushl ebx
 	movl 8(ebp),edi
 	movl 12(ebp),eax
 	movl 16(ebp),ecx
 	cld
 
 #ifndef __OPTIMIZE_SIZE__
-	andl $255,eax
-	movl ecx,ebx
-	testl $3,edi
-	jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop.  */
 	cmpl $16,ecx
 	jbe .L19
+	cbw
+	testl $7,edi
+	je .L10
 
-	movl eax,edx
-	sall $8,eax
-	orl edx,eax
-
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+   4-byte aligned on some x86 platforms.  */
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+
+/* At this point, ecx>8 and edi%8==0.  */
+.L10:
+	movb al,ah
 	movl eax,edx
 	sall $16,edx
 	orl edx,eax
 
+	movl ecx,edx
 	shrl $2,ecx
-	andl $3,ebx
+	andl $3,edx
 	rep
 	stosl
-	movl ebx,ecx
+	movl edx,ecx
 #endif /* not __OPTIMIZE_SIZE__ */
 
 .L19:
 	rep
 	stosb
 
 	movl 8(ebp),eax
 
-	leal -8(ebp),esp
-	popl ebx
+	leal -4(ebp),esp
 	popl edi
 	leave
 	ret
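Two details of this rewrite are easy to miss. The alignment prologue is a fully unrolled loop: the cmpl $16,ecx guard guarantees more than 16 bytes remain, and at most 7 byte stores are spent reaching an 8-byte boundary, which is why the comment at .L10 can promise ecx>8. The "reduce register pressure" part is the switch from ebx to edx for the saved remainder count: edx is call-clobbered, so the pushl ebx/popl ebx pair disappears and the epilogue shrinks to leal -4(ebp),esp. A loose C rendering of the fast path (names and structure are ours, for illustration only):

#include <stddef.h>
#include <stdint.h>

/* Loose C rendering of the rewritten fast path; not newlib code.
   Entered only when len > 16, mirroring the 'cmpl $16,ecx' guard.  */
static void
fast_path (unsigned char *edi, unsigned eax, size_t ecx)
{
  /* Unrolled in the .S file: at most 7 iterations, so ecx > 8
     still holds afterwards.  */
  while ((uintptr_t) edi % 8 != 0)      /* testl $7,edi */
    {
      *edi++ = (unsigned char) eax;     /* movb al,(edi); incl edi */
      ecx--;                            /* decl ecx */
    }

  /* .L10: replicate the low byte across all 32 bits.  */
  eax &= 0xff;
  eax |= eax << 8;                      /* movb al,ah */
  eax |= eax << 16;                     /* movl eax,edx; sall $16,edx; orl */

  size_t edx = ecx & 3;                 /* movl ecx,edx; andl $3,edx */
  ecx >>= 2;                            /* shrl $2,ecx */
  while (ecx--)                         /* rep stosl */
    {
      *(uint32_t *) edi = eax;
      edi += 4;
    }
  while (edx--)                         /* movl edx,ecx; rep stosb */
    *edi++ = (unsigned char) eax;
}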
libc/string/memset.c
@@ -22,7 +22,7 @@ DESCRIPTION
 	pointed to by <[dst]> to the value.
 
 RETURNS
-	<<memset>> returns the value of <[m]>.
+	<<memset>> returns the value of <[dst]>.
 
 PORTABILITY
 <<memset>> is ANSI C.
@@ -39,48 +39,42 @@ QUICKREF
 #define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
 #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
 
 _PTR
 _DEFUN (memset, (m, c, n),
 	_PTR m _AND
 	int c _AND
 	size_t n)
 {
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   char *s = (char *) m;
 
-  while (n-- != 0)
-    {
-      *s++ = (char) c;
-    }
-
-  return m;
-#else
-  char *s = (char *) m;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
   int i;
   unsigned long buffer;
   unsigned long *aligned_addr;
   unsigned int d = c & 0xff;	/* To avoid sign extension, copy C to an
 				   unsigned variable.  */
 
-  if (!TOO_SMALL (n) && !UNALIGNED (m))
+  while (UNALIGNED (s))
     {
-      /* If we get this far, we know that n is large and m is word-aligned. */
-      aligned_addr = (unsigned long*)m;
+      if (n--)
+	*s++ = (char) c;
+      else
+	return m;
+    }
+
+  if (!TOO_SMALL (n))
+    {
+      /* If we get this far, we know that n is large and s is word-aligned. */
+      aligned_addr = (unsigned long *) s;
 
       /* Store D into each char sized location in BUFFER so that
          we can set large blocks quickly.  */
-      if (LBLOCKSIZE == 4)
-	{
-	  buffer = (d << 8) | d;
-	  buffer |= (buffer << 16);
-	}
-      else
-	{
-	  buffer = 0;
-	  for (i = 0; i < LBLOCKSIZE; i++)
-	    buffer = (buffer << 8) | d;
-	}
+      buffer = (d << 8) | d;
+      buffer |= (buffer << 16);
+      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+	buffer = (buffer << i) | buffer;
 
+      /* Unroll the loop.  */
       while (n >= LBLOCKSIZE*4)
 	{
 	  *aligned_addr++ = buffer;
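The replacement for the old LBLOCKSIZE == 4 special case deserves a worked example. Two fixed shifts build the 32-bit pattern and a doubling loop widens it to whatever width unsigned long has; on a 32-bit target the loop body never runs, so that build keeps the old two-shift code, while a 64-bit target no longer falls back to the byte-at-a-time else branch. A standalone check (our own harness, not part of the patch):

#include <stdio.h>

/* Worked example: how the new replication loop fills BUFFER for
   d = 0xab.  On an LP64 target (LBLOCKSIZE == 8) the loop runs once
   with i == 32; on a 32-bit target it runs zero times.  */
int
main (void)
{
  unsigned long d = 0xab;
  unsigned long buffer = (d << 8) | d;             /* 0xabab             */
  buffer |= (buffer << 16);                        /* 0xabababab         */
  for (unsigned i = 32; i < sizeof buffer * 8; i <<= 1)
    buffer = (buffer << i) | buffer;               /* 0xabababababababab */
  printf ("%#lx\n", buffer);
  return 0;
}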
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
 	  s = (char*)aligned_addr;
 	}
 
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
   while (n--)
-    {
-      *s++ = (char)d;
-    }
+    *s++ = (char) c;
 
   return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
 }
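The point of this last hunk: moving #endif above the byte loop lets a single tail loop serve both builds. Under PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ it is the whole implementation; otherwise it mops up the n < LBLOCKSIZE remainder. Because d is now declared only inside the speed path, the shared loop must store (char) c. Condensed shape (illustrative only, not the real file):

#include <stddef.h>

/* Condensed shape of memset.c after the patch (illustrative only).  */
void *
memset_shape (void *m, int c, size_t n)
{
  char *s = (char *) m;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  /* ... pre-align s, bulk word stores, fall through with a small n ... */
#endif
  while (n--)                /* the whole job, or just the tail */
    *s++ = (char) c;
  return m;
}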