2008-05-26 Eric Blake <ebb9@byu.net>

Optimize the generic and x86 memset.
        * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.
        * libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.  Prefer
        8-byte over 4-byte alignment.  Reduce register pressure.
This commit is contained in:
Jeff Johnston 2008-05-26 23:23:15 +00:00
parent cae28869c1
commit a6bd72a278
3 changed files with 85 additions and 43 deletions

View File

@@ -1,3 +1,12 @@
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized. Prefer
8-byte over 4-byte alignment. Reduce register pressure.
2008-05-26 Eric Blake <ebb9@byu.net> 2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 strlen. Optimize the generic and x86 strlen.

View File

@@ -1,6 +1,6 @@
/* /*
* ==================================================== * ====================================================
* Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved. * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
* *
* Permission to use, copy, modify, and distribute this * Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice * software is freely granted, provided that this notice
@@ -18,33 +18,74 @@ SYM (memset):
pushl ebp pushl ebp
movl esp,ebp movl esp,ebp
pushl edi pushl edi
pushl ebx
movl 8(ebp),edi movl 8(ebp),edi
movl 12(ebp),eax movl 12(ebp),eax
movl 16(ebp),ecx movl 16(ebp),ecx
cld cld
#ifndef __OPTIMIZE_SIZE__ #ifndef __OPTIMIZE_SIZE__
andl $255,eax /* Less than 16 bytes won't benefit from the 'rep stosl' loop. */
movl ecx,ebx
testl $3,edi
jne .L19
cmpl $16,ecx cmpl $16,ecx
jbe .L19 jbe .L19
cbw
testl $7,edi
je .L10
movl eax,edx /* It turns out that 8-byte aligned 'rep stosl' outperforms
sall $8,eax 4-byte aligned on some x86 platforms. */
orl edx,eax movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
/* At this point, ecx>8 and edi%8==0. */
.L10:
movb al,ah
movl eax,edx movl eax,edx
sall $16,edx sall $16,edx
orl edx,eax orl edx,eax
movl ecx,edx
shrl $2,ecx shrl $2,ecx
andl $3,ebx andl $3,edx
rep rep
stosl stosl
movl ebx,ecx movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */ #endif /* not __OPTIMIZE_SIZE__ */
.L19: .L19:
@@ -53,8 +94,7 @@ SYM (memset):
movl 8(ebp),eax movl 8(ebp),eax
leal -8(ebp),esp leal -4(ebp),esp
popl ebx
popl edi popl edi
leave leave
ret ret

View File

@@ -22,7 +22,7 @@ DESCRIPTION
pointed to by <[dst]> to the value. pointed to by <[dst]> to the value.
RETURNS RETURNS
<<memset>> returns the value of <[m]>. <<memset>> returns the value of <[dst]>.
PORTABILITY PORTABILITY
<<memset>> is ANSI C. <<memset>> is ANSI C.
@@ -45,42 +45,36 @@ _DEFUN (memset, (m, c, n),
int c _AND int c _AND
size_t n) size_t n)
{ {
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *s = (char *) m; char *s = (char *) m;
while (n-- != 0) #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
{
*s++ = (char) c;
}
return m;
#else
char *s = (char *) m;
int i; int i;
unsigned long buffer; unsigned long buffer;
unsigned long *aligned_addr; unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */ unsigned variable. */
if (!TOO_SMALL (n) && !UNALIGNED (m)) while (UNALIGNED (s))
{ {
/* If we get this far, we know that n is large and m is word-aligned. */ if (n--)
aligned_addr = (unsigned long*)m; *s++ = (char) c;
else
return m;
}
if (!TOO_SMALL (n))
{
/* If we get this far, we know that n is large and s is word-aligned. */
aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that /* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */ we can set large blocks quickly. */
if (LBLOCKSIZE == 4)
{
buffer = (d << 8) | d; buffer = (d << 8) | d;
buffer |= (buffer << 16); buffer |= (buffer << 16);
} for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
else buffer = (buffer << i) | buffer;
{
buffer = 0;
for (i = 0; i < LBLOCKSIZE; i++)
buffer = (buffer << 8) | d;
}
/* Unroll the loop. */
while (n >= LBLOCKSIZE*4) while (n >= LBLOCKSIZE*4)
{ {
*aligned_addr++ = buffer; *aligned_addr++ = buffer;
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
s = (char*)aligned_addr; s = (char*)aligned_addr;
} }
#endif /* not PREFER_SIZE_OVER_SPEED */
while (n--) while (n--)
{ *s++ = (char) c;
*s++ = (char)d;
}
return m; return m;
#endif /* not PREFER_SIZE_OVER_SPEED */
} }