path: root/libc/sysdeps/i386
Diffstat (limited to 'libc/sysdeps/i386')
-rw-r--r--  libc/sysdeps/i386/bits/select.h | 6
-rwxr-xr-x  libc/sysdeps/i386/configure | 25
-rw-r--r--  libc/sysdeps/i386/configure.in | 11
-rw-r--r--  libc/sysdeps/i386/dl-machine.h | 67
-rwxr-xr-x  libc/sysdeps/i386/elf/configure | 11
-rw-r--r--  libc/sysdeps/i386/elf/configure.in | 6
-rw-r--r--  libc/sysdeps/i386/fpu/e_acos.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_acosf.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_acosh.S | 7
-rw-r--r--  libc/sysdeps/i386/fpu/e_acoshf.S | 7
-rw-r--r--  libc/sysdeps/i386/fpu/e_acoshl.S | 7
-rw-r--r--  libc/sysdeps/i386/fpu/e_acosl.c | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_asin.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_asinf.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_atan2.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_atan2f.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_atan2l.c | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_atanh.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_atanhf.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_atanhl.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp.S | 17
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp10.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp10f.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp10l.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp2.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp2f.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_exp2l.S | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_expf.S | 17
-rw-r--r--  libc/sysdeps/i386/fpu/e_expl.c | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_fmod.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_fmodf.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_fmodl.c | 1
-rw-r--r--  libc/sysdeps/i386/fpu/e_hypot.S | 5
-rw-r--r--  libc/sysdeps/i386/fpu/e_hypotf.S | 5
-rw-r--r--  libc/sysdeps/i386/fpu/e_log.S | 29
-rw-r--r--  libc/sysdeps/i386/fpu/e_log10.S | 11
-rw-r--r--  libc/sysdeps/i386/fpu/e_log10f.S | 11
-rw-r--r--  libc/sysdeps/i386/fpu/e_log10l.S | 11
-rw-r--r--  libc/sysdeps/i386/fpu/e_log2.S | 9
-rw-r--r--  libc/sysdeps/i386/fpu/e_log2f.S | 9
-rw-r--r--  libc/sysdeps/i386/fpu/e_log2l.S | 9
-rw-r--r--  libc/sysdeps/i386/fpu/e_logf.S | 29
-rw-r--r--  libc/sysdeps/i386/fpu/e_logl.S | 29
-rw-r--r--  libc/sysdeps/i386/fpu/e_pow.S | 39
-rw-r--r--  libc/sysdeps/i386/fpu/e_powf.S | 39
-rw-r--r--  libc/sysdeps/i386/fpu/e_powl.S | 39
-rw-r--r--  libc/sysdeps/i386/fpu/e_remainder.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_remainderf.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_remainderl.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_scalb.S | 12
-rw-r--r--  libc/sysdeps/i386/fpu/e_scalbf.S | 12
-rw-r--r--  libc/sysdeps/i386/fpu/e_scalbl.S | 12
-rw-r--r--  libc/sysdeps/i386/fpu/e_sqrt.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_sqrtf.S | 3
-rw-r--r--  libc/sysdeps/i386/fpu/e_sqrtl.c | 2
-rw-r--r--  libc/sysdeps/i386/fpu/libm-test-ulps | 54
-rw-r--r--  libc/sysdeps/i386/i686/fpu/e_log.S | 29
-rw-r--r--  libc/sysdeps/i386/i686/fpu/e_logf.S | 30
-rw-r--r--  libc/sysdeps/i386/i686/fpu/e_logl.S | 81
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/Makefile | 6
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 497
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memchr-sse2.S | 706
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memchr.S | 99
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S | 396
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 927
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memrchr-c.c | 7
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 418
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S | 725
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/memrchr.S | 79
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S | 3
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S | 3
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/rawmemchr.S | 99
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c | 1
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strchr-sse2.S | 2
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S | 3
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strlen-sse2.S | 442
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strnlen-c.c | 8
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S | 3
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strnlen.S | 56
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S | 2
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S | 148
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wcslen-c.c | 5
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S | 194
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wcslen.S | 56
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c | 5
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r--  libc/sysdeps/i386/i686/multiarch/wmemcmp.S | 59
-rw-r--r--  libc/sysdeps/i386/sysdep.h | 4
90 files changed, 5446 insertions, 819 deletions
diff --git a/libc/sysdeps/i386/bits/select.h b/libc/sysdeps/i386/bits/select.h
index ab9aa3d10..9e4c56aa8 100644
--- a/libc/sysdeps/i386/bits/select.h
+++ b/libc/sysdeps/i386/bits/select.h
@@ -48,8 +48,8 @@
#endif /* GNU CC */
#define __FD_SET(d, set) \
- ((void) (__FDS_BITS (set)[__FDELT (d)] |= __FDMASK (d)))
+ ((void) (__FDS_BITS (set)[__FD_ELT (d)] |= __FD_MASK (d)))
#define __FD_CLR(d, set) \
- ((void) (__FDS_BITS (set)[__FDELT (d)] &= ~__FDMASK (d)))
+ ((void) (__FDS_BITS (set)[__FD_ELT (d)] &= ~__FD_MASK (d)))
#define __FD_ISSET(d, set) \
- ((__FDS_BITS (set)[__FDELT (d)] & __FDMASK (d)) != 0)
+ ((__FDS_BITS (set)[__FD_ELT (d)] & __FD_MASK (d)) != 0)
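The rename tracks the new helper names; the semantics are unchanged. A minimal sketch of what the macros compute, assuming the usual glibc fd_set layout (an array of long words), with __FD_ELT picking the array slot and __FD_MASK the bit within it:

    #include <stdio.h>
    #include <sys/select.h>

    int
    main (void)
    {
      fd_set set;
      FD_ZERO (&set);
      FD_SET (42, &set);   /* __FD_SET: word 42 / (8 * sizeof (long)),
                              bit 42 % (8 * sizeof (long)) */
      printf ("fd 42 set: %d\n", FD_ISSET (42, &set));
      return 0;
    }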
diff --git a/libc/sysdeps/i386/configure b/libc/sysdeps/i386/configure
index e8a7970e3..cd4e627ea 100755
--- a/libc/sysdeps/i386/configure
+++ b/libc/sysdeps/i386/configure
@@ -682,6 +682,31 @@ _ACEOF
fi
+{ $as_echo "$as_me:$LINENO: checking for FMA4 support" >&5
+$as_echo_n "checking for FMA4 support... " >&6; }
+if test "${libc_cv_cc_fma4+set}" = set; then
+ $as_echo_n "(cached) " >&6
+else
+ if { ac_try='${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ libc_cv_cc_fma4=yes
+else
+ libc_cv_cc_fma4=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_fma4" >&5
+$as_echo "$libc_cv_cc_fma4" >&6; }
+if test $libc_cv_cc_fma4 = yes; then
+ cat >>confdefs.h <<\_ACEOF
+#define HAVE_FMA4_SUPPORT 1
+_ACEOF
+
+fi
+
{ $as_echo "$as_me:$LINENO: checking for -mno-vzeroupper support" >&5
$as_echo_n "checking for -mno-vzeroupper support... " >&6; }
if test "${libc_cv_cc_novzeroupper+set}" = set; then
diff --git a/libc/sysdeps/i386/configure.in b/libc/sysdeps/i386/configure.in
index 67fd1d7df..5a9840e16 100644
--- a/libc/sysdeps/i386/configure.in
+++ b/libc/sysdeps/i386/configure.in
@@ -67,6 +67,17 @@ if test $libc_cv_cc_avx = yes; then
AC_DEFINE(HAVE_AVX_SUPPORT)
fi
+dnl Check if -mfma4 works.
+AC_CACHE_CHECK(for FMA4 support, libc_cv_cc_fma4, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null]); then
+ libc_cv_cc_fma4=yes
+else
+ libc_cv_cc_fma4=no
+fi])
+if test $libc_cv_cc_fma4 = yes; then
+ AC_DEFINE(HAVE_FMA4_SUPPORT)
+fi
+
dnl Check if -mno-vzeroupper works.
AC_CACHE_CHECK(for -mno-vzeroupper support, libc_cv_cc_novzeroupper, [dnl
if AC_TRY_COMMAND([${CC-cc} -mno-vzeroupper -xc /dev/null -S -o /dev/null]); then
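Both hunks only probe whether the compiler accepts -mfma4; HAVE_FMA4_SUPPORT says nothing about the CPU the binary eventually runs on. A hedged sketch of the kind of code such a flag gates, using the FMA4 intrinsics from <x86intrin.h> (illustrative only, not part of this commit):

    #include <x86intrin.h>

    /* a*b + c fused in one instruction (vfmaddpd); needs -mfma4 to
       compile and an FMA4-capable CPU to execute. */
    __m128d
    fma4_madd (__m128d a, __m128d b, __m128d c)
    {
      return _mm_macc_pd (a, b, c);
    }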
diff --git a/libc/sysdeps/i386/dl-machine.h b/libc/sysdeps/i386/dl-machine.h
index a093d2b15..9469a2b5d 100644
--- a/libc/sysdeps/i386/dl-machine.h
+++ b/libc/sysdeps/i386/dl-machine.h
@@ -1,5 +1,5 @@
/* Machine-dependent ELF dynamic relocation inline functions. i386 version.
- Copyright (C) 1995-2005, 2006, 2009 Free Software Foundation, Inc.
+ Copyright (C) 1995-2005, 2006, 2009, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -243,18 +243,12 @@ _dl_start_user:\n\
define the value.
ELF_RTYPE_CLASS_NOCOPY iff TYPE should not be allowed to resolve to one
of the main executable's symbols, as for a COPY reloc. */
-#if !defined RTLD_BOOTSTRAP || USE___THREAD
# define elf_machine_type_class(type) \
((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32 \
|| (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32 \
|| (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC) \
* ELF_RTYPE_CLASS_PLT) \
| (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY))
-#else
-# define elf_machine_type_class(type) \
- ((((type) == R_386_JMP_SLOT) * ELF_RTYPE_CLASS_PLT) \
- | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY))
-#endif
/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */
#define ELF_MACHINE_JMP_SLOT R_386_JMP_SLOT
@@ -311,7 +305,7 @@ auto inline void
__attribute ((always_inline))
elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
const Elf32_Sym *sym, const struct r_found_version *version,
- void *const reloc_addr_arg)
+ void *const reloc_addr_arg, int skip_ifunc)
{
Elf32_Addr *const reloc_addr = reloc_addr_arg;
const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -347,7 +341,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
if (sym != NULL
&& __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
0)
- && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+ && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+ && __builtin_expect (!skip_ifunc, 1))
value = ((Elf32_Addr (*) (void)) value) ();
switch (r_type)
@@ -357,44 +352,43 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
*reloc_addr = value;
break;
-# if !defined RTLD_BOOTSTRAP || USE___THREAD
case R_386_TLS_DTPMOD32:
-# ifdef RTLD_BOOTSTRAP
+# ifdef RTLD_BOOTSTRAP
/* During startup the dynamic linker is always the module
with index 1.
XXX If this relocation is necessary move before RESOLVE
call. */
*reloc_addr = 1;
-# else
+# else
/* Get the information from the link map returned by the
resolv function. */
if (sym_map != NULL)
*reloc_addr = sym_map->l_tls_modid;
-# endif
+# endif
break;
case R_386_TLS_DTPOFF32:
-# ifndef RTLD_BOOTSTRAP
+# ifndef RTLD_BOOTSTRAP
/* During relocation all TLS symbols are defined and used.
Therefore the offset is already correct. */
if (sym != NULL)
*reloc_addr = sym->st_value;
-# endif
+# endif
break;
case R_386_TLS_DESC:
{
struct tlsdesc volatile *td =
(struct tlsdesc volatile *)reloc_addr;
-# ifndef RTLD_BOOTSTRAP
+# ifndef RTLD_BOOTSTRAP
if (! sym)
td->entry = _dl_tlsdesc_undefweak;
else
-# endif
+# endif
{
-# ifndef RTLD_BOOTSTRAP
-# ifndef SHARED
+# ifndef RTLD_BOOTSTRAP
+# ifndef SHARED
CHECK_STATIC_TLS (map, sym_map);
-# else
+# else
if (!TRY_STATIC_TLS (map, sym_map))
{
td->arg = _dl_make_tlsdesc_dynamic
@@ -402,8 +396,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
td->entry = _dl_tlsdesc_dynamic;
}
else
-# endif
# endif
+# endif
{
td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
+ (ElfW(Word))td->arg);
@@ -426,13 +420,13 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
CHECK_STATIC_TLS (map, sym_map);
*reloc_addr += sym_map->l_tls_offset - sym->st_value;
}
-# endif
+# endif
break;
case R_386_TLS_TPOFF:
/* The offset is negative, forward from the thread pointer. */
-# ifdef RTLD_BOOTSTRAP
+# ifdef RTLD_BOOTSTRAP
*reloc_addr += sym->st_value - map->l_tls_offset;
-# else
+# else
/* We know the offset of object the symbol is contained in.
It is a negative value which will be added to the
thread pointer. */
@@ -441,9 +435,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
CHECK_STATIC_TLS (map, sym_map);
*reloc_addr += sym->st_value - sym_map->l_tls_offset;
}
-# endif
+# endif
break;
-# endif /* use TLS */
# ifndef RTLD_BOOTSTRAP
case R_386_32:
@@ -490,7 +483,7 @@ auto inline void
__attribute__ ((always_inline))
elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
const Elf32_Sym *sym, const struct r_found_version *version,
- void *const reloc_addr_arg)
+ void *const reloc_addr_arg, int skip_ifunc)
{
Elf32_Addr *const reloc_addr = reloc_addr_arg;
const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -507,8 +500,8 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
if (sym != NULL
&& __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
- && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
- 0))
+ && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+ && __builtin_expect (!skip_ifunc, 1))
value = ((Elf32_Addr (*) (void)) value) ();
switch (ELF32_R_TYPE (reloc->r_info))
@@ -655,7 +648,8 @@ elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
auto inline void
__attribute__ ((always_inline))
elf_machine_lazy_rel (struct link_map *map,
- Elf32_Addr l_addr, const Elf32_Rel *reloc)
+ Elf32_Addr l_addr, const Elf32_Rel *reloc,
+ int skip_ifunc)
{
Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -706,19 +700,20 @@ elf_machine_lazy_rel (struct link_map *map,
ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff;
elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)],
&map->l_versions[ndx],
- (void *) (l_addr + r->r_offset));
+ (void *) (l_addr + r->r_offset), skip_ifunc);
}
# ifndef RTLD_BOOTSTRAP
else
elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL,
- (void *) (l_addr + r->r_offset));
+ (void *) (l_addr + r->r_offset), skip_ifunc);
# endif
}
}
else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
{
Elf32_Addr value = map->l_addr + *reloc_addr;
- value = ((Elf32_Addr (*) (void)) value) ();
+ if (__builtin_expect (!skip_ifunc, 1))
+ value = ((Elf32_Addr (*) (void)) value) ();
*reloc_addr = value;
}
else
@@ -730,7 +725,8 @@ elf_machine_lazy_rel (struct link_map *map,
auto inline void
__attribute__ ((always_inline))
elf_machine_lazy_rela (struct link_map *map,
- Elf32_Addr l_addr, const Elf32_Rela *reloc)
+ Elf32_Addr l_addr, const Elf32_Rela *reloc,
+ int skip_ifunc)
{
Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -747,7 +743,8 @@ elf_machine_lazy_rela (struct link_map *map,
else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
{
Elf32_Addr value = map->l_addr + reloc->r_addend;
- value = ((Elf32_Addr (*) (void)) value) ();
+ if (__builtin_expect (!skip_ifunc, 1))
+ value = ((Elf32_Addr (*) (void)) value) ();
*reloc_addr = value;
}
else
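The new skip_ifunc argument is threaded through all the relocation helpers so the dynamic linker can apply relocations without executing STT_GNU_IFUNC resolvers — the indirect calls guarded by __builtin_expect (!skip_ifunc, 1) above — for example when merely tracing dependencies, where running code from the loaded object is undesirable. For reference, a minimal IFUNC in C looks like the sketch below; the resolver is exactly the call skip_ifunc suppresses. The names and the CPU probe are illustrative, not glibc internals:

    extern int cpu_has_sse2 (void);   /* hypothetical feature probe */
    extern void *memchr_sse2 (const void *, int, unsigned long);
    extern void *memchr_ia32 (const void *, int, unsigned long);

    static void *
    memchr_resolver (void)
    {
      /* Runs at relocation time -- unless skip_ifunc is set.  */
      return cpu_has_sse2 () ? (void *) memchr_sse2 : (void *) memchr_ia32;
    }

    void *my_memchr (const void *, int, unsigned long)
         __attribute__ ((ifunc ("memchr_resolver")));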
diff --git a/libc/sysdeps/i386/elf/configure b/libc/sysdeps/i386/elf/configure
index 7a909d9a5..88edda0a1 100755
--- a/libc/sysdeps/i386/elf/configure
+++ b/libc/sysdeps/i386/elf/configure
@@ -1,7 +1,6 @@
# This file is generated from configure.in by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/i386/elf.
-if test "$usetls" != no; then
# Check for support of thread-local storage handling in assembler and
# linker.
{ $as_echo "$as_me:$LINENO: checking for i386 TLS support" >&5
@@ -39,12 +38,10 @@ rm -f conftest*
fi
{ $as_echo "$as_me:$LINENO: result: $libc_cv_386_tls" >&5
$as_echo "$libc_cv_386_tls" >&6; }
-if test $libc_cv_386_tls = yes; then
- cat >>confdefs.h <<\_ACEOF
-#define HAVE_TLS_SUPPORT 1
-_ACEOF
-
-fi
+if test $libc_cv_386_tls = no; then
+ { { $as_echo "$as_me:$LINENO: error: the assembler must support TLS" >&5
+$as_echo "$as_me: error: the assembler must support TLS" >&2;}
+ { (exit 1); exit 1; }; }
fi
cat >>confdefs.h <<\_ACEOF
diff --git a/libc/sysdeps/i386/elf/configure.in b/libc/sysdeps/i386/elf/configure.in
index ca607adeb..0c436f3f4 100644
--- a/libc/sysdeps/i386/elf/configure.in
+++ b/libc/sysdeps/i386/elf/configure.in
@@ -1,7 +1,6 @@
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
# Local configure fragment for sysdeps/i386/elf.
-if test "$usetls" != no; then
# Check for support of thread-local storage handling in assembler and
# linker.
AC_CACHE_CHECK(for i386 TLS support, libc_cv_386_tls, [dnl
@@ -28,9 +27,8 @@ else
libc_cv_386_tls=no
fi
rm -f conftest*])
-if test $libc_cv_386_tls = yes; then
- AC_DEFINE(HAVE_TLS_SUPPORT)
-fi
+if test $libc_cv_386_tls = no; then
+ AC_MSG_ERROR([the assembler must support TLS])
fi
dnl It is always possible to access static and hidden symbols in an
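The fragment now fails hard when the assembler lacks TLS support instead of recording HAVE_TLS_SUPPORT. Any __thread variable exercises what the conftest checks, since the compiler emits i386 TLS relocations for it (a sketch, not part of the commit):

    __thread int tls_counter;   /* forces @tlsgd / @ntpoff style relocations */

    int
    bump (void)
    {
      return ++tls_counter;
    }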
diff --git a/libc/sysdeps/i386/fpu/e_acos.S b/libc/sysdeps/i386/fpu/e_acos.S
index b9d07b109..d3505baf0 100644
--- a/libc/sysdeps/i386/fpu/e_acos.S
+++ b/libc/sysdeps/i386/fpu/e_acos.S
@@ -19,3 +19,4 @@ ENTRY(__ieee754_acos)
fpatan /* atan (sqrt(1 - x^2) / x) */
ret
END (__ieee754_acos)
+strong_alias (__ieee754_acos, __acos_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosf.S b/libc/sysdeps/i386/fpu/e_acosf.S
index 50b13fd1b..6a843a51d 100644
--- a/libc/sysdeps/i386/fpu/e_acosf.S
+++ b/libc/sysdeps/i386/fpu/e_acosf.S
@@ -20,3 +20,4 @@ ENTRY(__ieee754_acosf)
fpatan
ret
END (__ieee754_acosf)
+strong_alias (__ieee754_acosf, __acosf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosh.S b/libc/sysdeps/i386/fpu/e_acosh.S
index 62a232471..fc65c295c 100644
--- a/libc/sysdeps/i386/fpu/e_acosh.S
+++ b/libc/sysdeps/i386/fpu/e_acosh.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arcsinh.
- Copyright (C) 1996, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996, 2005, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -21,12 +21,12 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -101,3 +101,4 @@ ENTRY(__ieee754_acosh)
fdiv %st, %st(0)
ret
END(__ieee754_acosh)
+strong_alias (__ieee754_acosh, __acosh_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acoshf.S b/libc/sysdeps/i386/fpu/e_acoshf.S
index 1906c6057..b55004b62 100644
--- a/libc/sysdeps/i386/fpu/e_acoshf.S
+++ b/libc/sysdeps/i386/fpu/e_acoshf.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arcsinh.
- Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -21,12 +21,12 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -101,3 +101,4 @@ ENTRY(__ieee754_acoshf)
fdiv %st, %st(0)
ret
END(__ieee754_acoshf)
+strong_alias (__ieee754_acoshf, __acoshf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acoshl.S b/libc/sysdeps/i386/fpu/e_acoshl.S
index c7b548d25..76bc0d752 100644
--- a/libc/sysdeps/i386/fpu/e_acoshl.S
+++ b/libc/sysdeps/i386/fpu/e_acoshl.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arcsinh.
- Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -21,12 +21,12 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
/* Please note that we use double value for 1.0. This number
has an exact representation and so we don't get accuracy
problems. The advantage is that the code is simpler. */
@@ -108,3 +108,4 @@ ENTRY(__ieee754_acoshl)
fdiv %st, %st(0)
ret
END(__ieee754_acoshl)
+strong_alias (__ieee754_acoshl, __acoshl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosl.c b/libc/sysdeps/i386/fpu/e_acosl.c
index 0c3e03945..ec516ffca 100644
--- a/libc/sysdeps/i386/fpu/e_acosl.c
+++ b/libc/sysdeps/i386/fpu/e_acosl.c
@@ -23,3 +23,4 @@ __ieee754_acosl (long double x)
: "=t" (res) : "0" (x) : "st(1)");
return res;
}
+strong_alias (__ieee754_acosl, __acosl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_asin.S b/libc/sysdeps/i386/fpu/e_asin.S
index 945e30824..a17e922b6 100644
--- a/libc/sysdeps/i386/fpu/e_asin.S
+++ b/libc/sysdeps/i386/fpu/e_asin.S
@@ -18,3 +18,4 @@ ENTRY(__ieee754_asin)
fpatan
ret
END (__ieee754_asin)
+strong_alias (__ieee754_asin, __asin_finite)
diff --git a/libc/sysdeps/i386/fpu/e_asinf.S b/libc/sysdeps/i386/fpu/e_asinf.S
index d450e9a74..5c1065dd4 100644
--- a/libc/sysdeps/i386/fpu/e_asinf.S
+++ b/libc/sysdeps/i386/fpu/e_asinf.S
@@ -19,3 +19,4 @@ ENTRY(__ieee754_asinf)
fpatan
ret
END (__ieee754_asinf)
+strong_alias (__ieee754_asinf, __asinf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2.S b/libc/sysdeps/i386/fpu/e_atan2.S
index 8df04e485..e76f8e2a7 100644
--- a/libc/sysdeps/i386/fpu/e_atan2.S
+++ b/libc/sysdeps/i386/fpu/e_atan2.S
@@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2)
fpatan
ret
END (__ieee754_atan2)
+strong_alias (__ieee754_atan2, __atan2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2f.S b/libc/sysdeps/i386/fpu/e_atan2f.S
index fc6621f18..9ffa6373b 100644
--- a/libc/sysdeps/i386/fpu/e_atan2f.S
+++ b/libc/sysdeps/i386/fpu/e_atan2f.S
@@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2f)
fpatan
ret
END (__ieee754_atan2f)
+strong_alias (__ieee754_atan2f, __atan2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2l.c b/libc/sysdeps/i386/fpu/e_atan2l.c
index 19a2a6062..9f88bfcc0 100644
--- a/libc/sysdeps/i386/fpu/e_atan2l.c
+++ b/libc/sysdeps/i386/fpu/e_atan2l.c
@@ -16,3 +16,4 @@ __ieee754_atan2l (long double y, long double x)
return res;
}
+strong_alias (__ieee754_atan2l, __atan2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanh.S b/libc/sysdeps/i386/fpu/e_atanh.S
index 3566ec6ef..d7e53a288 100644
--- a/libc/sysdeps/i386/fpu/e_atanh.S
+++ b/libc/sysdeps/i386/fpu/e_atanh.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arctanh function.
- Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -114,3 +114,4 @@ ENTRY(__ieee754_atanh)
6: fldl 4(%esp)
ret
END(__ieee754_atanh)
+strong_alias (__ieee754_atanh, __atanh_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanhf.S b/libc/sysdeps/i386/fpu/e_atanhf.S
index 10ce6aed9..00ad9142f 100644
--- a/libc/sysdeps/i386/fpu/e_atanhf.S
+++ b/libc/sysdeps/i386/fpu/e_atanhf.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arctanh function.
- Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -107,3 +107,4 @@ ENTRY(__ieee754_atanhf)
5: flds 4(%esp)
ret
END(__ieee754_atanhf)
+strong_alias (__ieee754_atanhf, __atanhf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanhl.S b/libc/sysdeps/i386/fpu/e_atanhl.S
index 8618c3fb3..cc70e73f4 100644
--- a/libc/sysdeps/i386/fpu/e_atanhl.S
+++ b/libc/sysdeps/i386/fpu/e_atanhl.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of arctanh function.
- Copyright (C) 1996, 1999 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1999, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -118,3 +118,4 @@ ENTRY(__ieee754_atanhl)
6: fldt 4(%esp)
ret
END(__ieee754_atanhl)
+strong_alias (__ieee754_atanhl, __atanhl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp.S b/libc/sysdeps/i386/fpu/e_exp.S
index 4a75fa1d1..2c331d9ed 100644
--- a/libc/sysdeps/i386/fpu/e_exp.S
+++ b/libc/sysdeps/i386/fpu/e_exp.S
@@ -5,7 +5,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_exp.S,v 1.7 1996/07/03 17:31:28 jtc Exp $")
/* e^x = 2^(x * log2(e)) */
ENTRY(__ieee754_exp)
@@ -39,3 +38,19 @@ ENTRY(__ieee754_exp)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)
+ fldl2e
+ fmull 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ ret
+END(__exp_finite)
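__exp_finite is the __ieee754_exp fast path with the Inf/NaN checks stripped. A C sketch of the same decomposition, assuming round-to-nearest mode:

    #include <math.h>

    double
    exp_sketch (double x)
    {
      double t = x * M_LOG2E;            /* fldl2e; fmull : x * log2(e) */
      double i = nearbyint (t);          /* frndint : int(x * log2(e)) */
      double f = t - i;                  /* fract(x * log2(e)) */
      return ldexp (exp2 (f), (int) i);  /* f2xm1/fld1/faddp, then fscale */
    }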
diff --git a/libc/sysdeps/i386/fpu/e_exp10.S b/libc/sysdeps/i386/fpu/e_exp10.S
index 6bfcdbb72..1e32b0784 100644
--- a/libc/sysdeps/i386/fpu/e_exp10.S
+++ b/libc/sysdeps/i386/fpu/e_exp10.S
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp10f.S b/libc/sysdeps/i386/fpu/e_exp10f.S
index 4791b99af..614496415 100644
--- a/libc/sysdeps/i386/fpu/e_exp10f.S
+++ b/libc/sysdeps/i386/fpu/e_exp10f.S
@@ -4,7 +4,7 @@
#include <machine/asm.h>
-/* e^x = 2^(x * log2(10)) */
+/* 10^x = 2^(x * log2(10)) */
ENTRY(__ieee754_exp10f)
flds 4(%esp)
/* I added the following ugly construct because exp(+-Inf) resulted
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10f)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp10l.S b/libc/sysdeps/i386/fpu/e_exp10l.S
index 71f0da792..04ec8001d 100644
--- a/libc/sysdeps/i386/fpu/e_exp10l.S
+++ b/libc/sysdeps/i386/fpu/e_exp10l.S
@@ -4,7 +4,7 @@
#include <machine/asm.h>
-/* e^x = 2^(x * log2l(10)) */
+/* 10^x = 2^(x * log2l(10)) */
ENTRY(__ieee754_exp10l)
fldt 4(%esp)
/* I added the following ugly construct because expl(+-Inf) resulted
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10l)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp10l)
+strong_alias (__ieee754_exp10l, __exp10l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2.S b/libc/sysdeps/i386/fpu/e_exp2.S
index 778c0c0eb..f802cf8b9 100644
--- a/libc/sysdeps/i386/fpu/e_exp2.S
+++ b/libc/sysdeps/i386/fpu/e_exp2.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2f.S b/libc/sysdeps/i386/fpu/e_exp2f.S
index c2d1af1af..f867d0d47 100644
--- a/libc/sysdeps/i386/fpu/e_exp2f.S
+++ b/libc/sysdeps/i386/fpu/e_exp2f.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2f)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2l.S b/libc/sysdeps/i386/fpu/e_exp2l.S
index fa1fdc923..203dd0078 100644
--- a/libc/sysdeps/i386/fpu/e_exp2l.S
+++ b/libc/sysdeps/i386/fpu/e_exp2l.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2l)
fldz /* Set result to 0. */
2: ret
END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_expf.S b/libc/sysdeps/i386/fpu/e_expf.S
index 5fd49b89f..4e4f6a0df 100644
--- a/libc/sysdeps/i386/fpu/e_expf.S
+++ b/libc/sysdeps/i386/fpu/e_expf.S
@@ -6,7 +6,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
/* e^x = 2^(x * log2(e)) */
ENTRY(__ieee754_expf)
@@ -40,3 +39,19 @@ ENTRY(__ieee754_expf)
fldz /* Set result to 0. */
2: ret
END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)
+ fldl2e
+ fmuls 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ ret
+END(__expf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_expl.c b/libc/sysdeps/i386/fpu/e_expl.c
index 2240ceac4..8dc9581f7 100644
--- a/libc/sysdeps/i386/fpu/e_expl.c
+++ b/libc/sysdeps/i386/fpu/e_expl.c
@@ -63,7 +63,7 @@ __ieee754_expl (long double x)
"fld1\n\t" /* 4 1.0 */
"faddp\n\t" /* 3 2^(fract(x * log2(e))) */
"fstp %%st(1)\n\t" /* 2 */
- "fscale\n\t" /* 2 scale factor is st(1); e^x */
+ "fscale\n\t" /* 2 scale factor is st(1); e^x */
"fstp %%st(1)\n\t" /* 1 */
"fstp %%st(1)\n\t" /* 0 */
"jmp 2f\n\t"
@@ -75,3 +75,4 @@ __ieee754_expl (long double x)
: "=t" (res) : "0" (x), "m" (c0), "m" (c1) : "ax", "dx");
return res;
}
+strong_alias (__ieee754_expl, __expl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmod.S b/libc/sysdeps/i386/fpu/e_fmod.S
index 4cf6e9205..26b3acc39 100644
--- a/libc/sysdeps/i386/fpu/e_fmod.S
+++ b/libc/sysdeps/i386/fpu/e_fmod.S
@@ -5,8 +5,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_fmod.S,v 1.4 1995/05/08 23:47:56 jtc Exp $")
-
ENTRY(__ieee754_fmod)
fldl 12(%esp)
fldl 4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_fmod)
fstp %st(1)
ret
END (__ieee754_fmod)
+strong_alias (__ieee754_fmod, __fmod_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmodf.S b/libc/sysdeps/i386/fpu/e_fmodf.S
index bbce40976..ece4d9842 100644
--- a/libc/sysdeps/i386/fpu/e_fmodf.S
+++ b/libc/sysdeps/i386/fpu/e_fmodf.S
@@ -6,8 +6,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
ENTRY(__ieee754_fmodf)
flds 8(%esp)
flds 4(%esp)
@@ -18,3 +16,4 @@ ENTRY(__ieee754_fmodf)
fstp %st(1)
ret
END(__ieee754_fmodf)
+strong_alias (__ieee754_fmodf, __fmodf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmodl.c b/libc/sysdeps/i386/fpu/e_fmodl.c
index c7c9a6045..49700ae8f 100644
--- a/libc/sysdeps/i386/fpu/e_fmodl.c
+++ b/libc/sysdeps/i386/fpu/e_fmodl.c
@@ -20,3 +20,4 @@ __ieee754_fmodl (long double x, long double y)
: "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");
return res;
}
+strong_alias (__ieee754_fmodl, __fmodl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_hypot.S b/libc/sysdeps/i386/fpu/e_hypot.S
index 043585730..0baa011d1 100644
--- a/libc/sysdeps/i386/fpu/e_hypot.S
+++ b/libc/sysdeps/i386/fpu/e_hypot.S
@@ -1,5 +1,5 @@
/* Compute the hypothenuse of X and Y.
- Copyright (C) 1998 Free Software Foundation, Inc.
+ Copyright (C) 1998, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
@@ -58,5 +58,6 @@ ENTRY(__ieee754_hypot)
fxch
5: fstp %st(1)
jmp 2b
-
+
END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite)
diff --git a/libc/sysdeps/i386/fpu/e_hypotf.S b/libc/sysdeps/i386/fpu/e_hypotf.S
index 5967dae21..eb95d6ee9 100644
--- a/libc/sysdeps/i386/fpu/e_hypotf.S
+++ b/libc/sysdeps/i386/fpu/e_hypotf.S
@@ -1,5 +1,5 @@
/* Compute the hypothenuse of X and Y.
- Copyright (C) 1998 Free Software Foundation, Inc.
+ Copyright (C) 1998, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
@@ -58,5 +58,6 @@ ENTRY(__ieee754_hypotf)
fxch
5: fstp %st(1)
jmp 2b
-
+
END(__ieee754_hypotf)
+strong_alias (__ieee754_hypotf, __hypotf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log.S b/libc/sysdeps/i386/fpu/e_log.S
index ce55b7229..a2e4d89a4 100644
--- a/libc/sysdeps/i386/fpu/e_log.S
+++ b/libc/sysdeps/i386/fpu/e_log.S
@@ -7,14 +7,12 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -27,9 +25,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -64,3 +62,22 @@ ENTRY(__ieee754_log)
fstp %st(1)
ret
END (__ieee754_log)
+
+ENTRY(__log_finite)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__log_finite)
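Like __exp_finite, __log_finite drops the special-case handling but keeps the accuracy split on |x-1| against the 0.29 "limit" constant: near 1 the fyl2xp1 path computes log(2)*log2(1+m) directly from m = x-1, which in C terms is log1p. A sketch with the same cutoff:

    #include <math.h>

    double
    log_sketch (double x)
    {
      double m = x - 1.0;
      if (fabs (m) < 0.29)      /* fyl2xp1 path: accurate for x near 1 */
        return log1p (m);
      return M_LN2 * log2 (x);  /* fyl2x path: log(2) * log2(x) */
    }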
diff --git a/libc/sysdeps/i386/fpu/e_log10.S b/libc/sysdeps/i386/fpu/e_log10.S
index 525f08c96..9d24d7402 100644
--- a/libc/sysdeps/i386/fpu/e_log10.S
+++ b/libc/sysdeps/i386/fpu/e_log10.S
@@ -7,14 +7,12 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_log10.S,v 1.4 1995/05/08 23:49:24 jtc Exp $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -27,9 +25,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -64,3 +62,4 @@ ENTRY(__ieee754_log10)
fstp %st(1)
ret
END (__ieee754_log10)
+strong_alias (__ieee754_log10, __log10_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log10f.S b/libc/sysdeps/i386/fpu/e_log10f.S
index da5069d58..38a4833d1 100644
--- a/libc/sysdeps/i386/fpu/e_log10f.S
+++ b/libc/sysdeps/i386/fpu/e_log10f.S
@@ -8,14 +8,12 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -65,3 +63,4 @@ ENTRY(__ieee754_log10f)
fstp %st(1)
ret
END (__ieee754_log10f)
+strong_alias (__ieee754_log10f, __log10f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log10l.S b/libc/sysdeps/i386/fpu/e_log10l.S
index 3811516be..88b309d53 100644
--- a/libc/sysdeps/i386/fpu/e_log10l.S
+++ b/libc/sysdeps/i386/fpu/e_log10l.S
@@ -9,14 +9,12 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -29,9 +27,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -66,3 +64,4 @@ ENTRY(__ieee754_log10l)
fstp %st(1)
ret
END(__ieee754_log10l)
+strong_alias (__ieee754_log10l, __log10l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2.S b/libc/sysdeps/i386/fpu/e_log2.S
index d80bf8023..88aee7f3c 100644
--- a/libc/sysdeps/i386/fpu/e_log2.S
+++ b/libc/sysdeps/i386/fpu/e_log2.S
@@ -9,11 +9,11 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2)
fstp %st(1)
ret
END (__ieee754_log2)
+strong_alias (__ieee754_log2, __log2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2f.S b/libc/sysdeps/i386/fpu/e_log2f.S
index 9eb7b2a82..20144875f 100644
--- a/libc/sysdeps/i386/fpu/e_log2f.S
+++ b/libc/sysdeps/i386/fpu/e_log2f.S
@@ -9,11 +9,11 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2f)
fstp %st(1)
ret
END (__ieee754_log2f)
+strong_alias (__ieee754_log2f, __log2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2l.S b/libc/sysdeps/i386/fpu/e_log2l.S
index 9de08f5de..bc79dea2d 100644
--- a/libc/sysdeps/i386/fpu/e_log2l.S
+++ b/libc/sysdeps/i386/fpu/e_log2l.S
@@ -9,11 +9,11 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2l)
fstp %st(1)
ret
END (__ieee754_log2l)
+strong_alias (__ieee754_log2l, __log2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_logf.S b/libc/sysdeps/i386/fpu/e_logf.S
index cd4538b59..1992cc2f8 100644
--- a/libc/sysdeps/i386/fpu/e_logf.S
+++ b/libc/sysdeps/i386/fpu/e_logf.S
@@ -8,14 +8,12 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -65,3 +63,22 @@ ENTRY(__ieee754_logf)
fstp %st(1)
ret
END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_logl.S b/libc/sysdeps/i386/fpu/e_logl.S
index 551dcf1e4..bfb72a30e 100644
--- a/libc/sysdeps/i386/fpu/e_logl.S
+++ b/libc/sysdeps/i386/fpu/e_logl.S
@@ -7,15 +7,13 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
- .align ALIGNARG(4)
+ .p2align 3
ASM_TYPE_DIRECTIVE(one,@object)
one: .double 1.0
ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit: .double 0.29
#ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
#else
-#define MO(op) op
+# define MO(op) op
#endif
.text
@@ -65,3 +63,22 @@ ENTRY(__ieee754_logl)
fstp %st(1)
ret
END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fcompl MO(limit) // x-1 : x : log(2)
+ fnstsw // x-1 : x : log(2)
+ andb $0x45, %ah
+ jz 2b
+ fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_pow.S b/libc/sysdeps/i386/fpu/e_pow.S
index 792f92690..dccc67752 100644
--- a/libc/sysdeps/i386/fpu/e_pow.S
+++ b/libc/sysdeps/i386/fpu/e_pow.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of pow function.
- Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007
+ Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011
Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
+ .p2align 3
+ ASM_TYPE_DIRECTIVE(one,@object)
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ ASM_TYPE_DIRECTIVE(limit,@object)
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ ASM_TYPE_DIRECTIVE(p63,@object)
+p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+ ASM_SIZE_DIRECTIVE(p63)
- .align ALIGNARG(4)
+#ifdef __ELF__
+ .section .rodata.cst16,"aM",@progbits,16
+#else
+ .text
+#endif
+ .p2align 3
ASM_TYPE_DIRECTIVE(infinity,@object)
inf_zero:
infinity:
@@ -43,22 +58,13 @@ minfinity:
mzero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
ASM_SIZE_DIRECTIVE(minf_mzero)
- ASM_TYPE_DIRECTIVE(one,@object)
-one: .double 1.0
- ASM_SIZE_DIRECTIVE(one)
- ASM_TYPE_DIRECTIVE(limit,@object)
-limit: .double 0.29
- ASM_SIZE_DIRECTIVE(limit)
- ASM_TYPE_DIRECTIVE(p63,@object)
-p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
- ASM_SIZE_DIRECTIVE(p63)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
.text
@@ -360,3 +366,4 @@ ENTRY(__ieee754_pow)
ret
END(__ieee754_pow)
+strong_alias (__ieee754_pow, __pow_finite)
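The constant shuffle above moves the 8-byte constants into a mergeable .rodata.cst8 section and the 16-byte inf/zero pairs into .rodata.cst16; p63 (2^63 as a double) serves the integer-exponent test. The core finite path is the classic x^y = 2^(y * log2(x)). Loosely in C, for finite x > 0 (a sketch only — the assembly additionally special-cases integer y, negative x, and the inf/zero combinations):

    #include <math.h>

    double
    pow_sketch (double x, double y)
    {
      double t = y * log2 (x);               /* fyl2x */
      double i = nearbyint (t);
      return ldexp (exp2 (t - i), (int) i);  /* f2xm1 + fscale, as in exp */
    }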
diff --git a/libc/sysdeps/i386/fpu/e_powf.S b/libc/sysdeps/i386/fpu/e_powf.S
index c91545418..99c95bbdf 100644
--- a/libc/sysdeps/i386/fpu/e_powf.S
+++ b/libc/sysdeps/i386/fpu/e_powf.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of pow function.
- Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007
+ Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007, 2011
Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
+ .p2align 3
+ ASM_TYPE_DIRECTIVE(one,@object)
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ ASM_TYPE_DIRECTIVE(limit,@object)
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ ASM_TYPE_DIRECTIVE(p31,@object)
+p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+ ASM_SIZE_DIRECTIVE(p31)
- .align ALIGNARG(4)
+#ifdef __ELF__
+ .section .rodata.cst16,"aM",@progbits,16
+#else
+ .text
+#endif
+ .p2align 3
ASM_TYPE_DIRECTIVE(infinity,@object)
inf_zero:
infinity:
@@ -43,22 +58,13 @@ minfinity:
mzero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
ASM_SIZE_DIRECTIVE(minf_mzero)
- ASM_TYPE_DIRECTIVE(one,@object)
-one: .double 1.0
- ASM_SIZE_DIRECTIVE(one)
- ASM_TYPE_DIRECTIVE(limit,@object)
-limit: .double 0.29
- ASM_SIZE_DIRECTIVE(limit)
- ASM_TYPE_DIRECTIVE(p31,@object)
-p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
- ASM_SIZE_DIRECTIVE(p31)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
.text
@@ -348,3 +354,4 @@ ENTRY(__ieee754_powf)
ret
END(__ieee754_powf)
+strong_alias (__ieee754_powf, __powf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_powl.S b/libc/sysdeps/i386/fpu/e_powl.S
index 621549620..34ace3576 100644
--- a/libc/sysdeps/i386/fpu/e_powl.S
+++ b/libc/sysdeps/i386/fpu/e_powl.S
@@ -1,5 +1,5 @@
/* ix87 specific implementation of pow function.
- Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007
+ Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011
Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
#include <machine/asm.h>
#ifdef __ELF__
- .section .rodata
+ .section .rodata.cst8,"aM",@progbits,8
#else
.text
#endif
+ .p2align 3
+ ASM_TYPE_DIRECTIVE(one,@object)
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ ASM_TYPE_DIRECTIVE(limit,@object)
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+ ASM_TYPE_DIRECTIVE(p63,@object)
+p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+ ASM_SIZE_DIRECTIVE(p63)
- .align ALIGNARG(4)
+#ifdef __ELF__
+ .section .rodata.cst16,"aM",@progbits,16
+#else
+ .text
+#endif
+ .p2align 3
ASM_TYPE_DIRECTIVE(infinity,@object)
inf_zero:
infinity:
@@ -43,22 +58,13 @@ minfinity:
mzero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
ASM_SIZE_DIRECTIVE(minf_mzero)
- ASM_TYPE_DIRECTIVE(one,@object)
-one: .double 1.0
- ASM_SIZE_DIRECTIVE(one)
- ASM_TYPE_DIRECTIVE(limit,@object)
-limit: .double 0.29
- ASM_SIZE_DIRECTIVE(limit)
- ASM_TYPE_DIRECTIVE(p63,@object)
-p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
- ASM_SIZE_DIRECTIVE(p63)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
.text
@@ -360,3 +366,4 @@ ENTRY(__ieee754_powl)
ret
END(__ieee754_powl)
+strong_alias (__ieee754_powl, __powl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainder.S b/libc/sysdeps/i386/fpu/e_remainder.S
index 2f43cb894..f7867aa90 100644
--- a/libc/sysdeps/i386/fpu/e_remainder.S
+++ b/libc/sysdeps/i386/fpu/e_remainder.S
@@ -5,8 +5,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_remainder.S,v 1.4 1995/05/08 23:49:37 jtc Exp $")
-
ENTRY(__ieee754_remainder)
fldl 12(%esp)
fldl 4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_remainder)
fstp %st(1)
ret
END (__ieee754_remainder)
+strong_alias (__ieee754_remainder, __remainder_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainderf.S b/libc/sysdeps/i386/fpu/e_remainderf.S
index 79f821993..cfd390bc6 100644
--- a/libc/sysdeps/i386/fpu/e_remainderf.S
+++ b/libc/sysdeps/i386/fpu/e_remainderf.S
@@ -5,8 +5,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_remainderf.S,v 1.2 1995/05/08 23:49:47 jtc Exp $")
-
ENTRY(__ieee754_remainderf)
flds 8(%esp)
flds 4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_remainderf)
fstp %st(1)
ret
END (__ieee754_remainderf)
+strong_alias (__ieee754_remainderf, __remainderf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainderl.S b/libc/sysdeps/i386/fpu/e_remainderl.S
index 5f50b626a..5ec23a37a 100644
--- a/libc/sysdeps/i386/fpu/e_remainderl.S
+++ b/libc/sysdeps/i386/fpu/e_remainderl.S
@@ -7,8 +7,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
ENTRY(__ieee754_remainderl)
fldt 16(%esp)
fldt 4(%esp)
@@ -19,3 +17,4 @@ ENTRY(__ieee754_remainderl)
fstp %st(1)
ret
END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_scalb.S b/libc/sysdeps/i386/fpu/e_scalb.S
index 7e334a361..0f3ec9619 100644
--- a/libc/sysdeps/i386/fpu/e_scalb.S
+++ b/libc/sysdeps/i386/fpu/e_scalb.S
@@ -7,8 +7,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $")
-
#ifdef __ELF__
.section .rodata
#else
@@ -20,18 +18,17 @@ RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $")
zero_nan:
.double 0.0
nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
ASM_SIZE_DIRECTIVE(zero_nan)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
.text
@@ -100,3 +97,4 @@ ENTRY(__ieee754_scalb)
fdiv %st
ret
END(__ieee754_scalb)
+strong_alias (__ieee754_scalb, __scalb_finite)
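The minus_zero label dropped above was apparently unused; its bytes remain part of the zero_nan table. Functionally, scalb(x, y) is x * 2^y computed with fscale, and for integral y it matches scalbn, which makes a quick cross-check easy (a sketch; scalb is an obsolescent API):

    #define _DEFAULT_SOURCE
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      printf ("%g\n", scalb (3.0, 4.0));  /* 48 = 3 * 2^4 */
      printf ("%g\n", scalbn (3.0, 4));   /* same result, current API */
      return 0;
    }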
diff --git a/libc/sysdeps/i386/fpu/e_scalbf.S b/libc/sysdeps/i386/fpu/e_scalbf.S
index e99ee92cb..d11ca66d1 100644
--- a/libc/sysdeps/i386/fpu/e_scalbf.S
+++ b/libc/sysdeps/i386/fpu/e_scalbf.S
@@ -8,8 +8,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
#ifdef __ELF__
.section .rodata
#else
@@ -21,18 +19,17 @@ RCSID("$NetBSD: $")
zero_nan:
.double 0.0
nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
ASM_SIZE_DIRECTIVE(zero_nan)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
@@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbf)
fdiv %st
ret
END(__ieee754_scalbf)
+strong_alias (__ieee754_scalbf, __scalbf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_scalbl.S b/libc/sysdeps/i386/fpu/e_scalbl.S
index 3f67d0bef..d8b216971 100644
--- a/libc/sysdeps/i386/fpu/e_scalbl.S
+++ b/libc/sysdeps/i386/fpu/e_scalbl.S
@@ -9,8 +9,6 @@
#include <machine/asm.h>
-RCSID("$NetBSD: $")
-
#ifdef __ELF__
.section .rodata
#else
@@ -22,18 +20,17 @@ RCSID("$NetBSD: $")
zero_nan:
.double 0.0
nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
.byte 0, 0, 0, 0, 0, 0, 0, 0x80
.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
ASM_SIZE_DIRECTIVE(zero_nan)
#ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
#else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
#endif
.text
@@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbl)
fdiv %st
ret
END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrt.S b/libc/sysdeps/i386/fpu/e_sqrt.S
index 6f253d51a..1054ba453 100644
--- a/libc/sysdeps/i386/fpu/e_sqrt.S
+++ b/libc/sysdeps/i386/fpu/e_sqrt.S
@@ -5,10 +5,9 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_sqrt.S,v 1.4 1995/05/08 23:49:57 jtc Exp $")
-
ENTRY(__ieee754_sqrt)
fldl 4(%esp)
fsqrt
ret
END (__ieee754_sqrt)
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrtf.S b/libc/sysdeps/i386/fpu/e_sqrtf.S
index 5ce1ad054..6f7e4b015 100644
--- a/libc/sysdeps/i386/fpu/e_sqrtf.S
+++ b/libc/sysdeps/i386/fpu/e_sqrtf.S
@@ -5,10 +5,9 @@
#include <machine/asm.h>
-RCSID("$NetBSD: e_sqrtf.S,v 1.2 1995/05/08 23:50:14 jtc Exp $")
-
ENTRY(__ieee754_sqrtf)
flds 4(%esp)
fsqrt
ret
END (__ieee754_sqrtf)
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrtl.c b/libc/sysdeps/i386/fpu/e_sqrtl.c
index 85f61bb38..41bcd7eeb 100644
--- a/libc/sysdeps/i386/fpu/e_sqrtl.c
+++ b/libc/sysdeps/i386/fpu/e_sqrtl.c
@@ -7,6 +7,7 @@
#include <math_private.h>
+#undef __ieee754_sqrtl
long double
__ieee754_sqrtl (long double x)
{
@@ -16,3 +17,4 @@ __ieee754_sqrtl (long double x)
return res;
}
+strong_alias (__ieee754_sqrtl, __sqrtl_finite)
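The new #undef matters because math_private.h can define __ieee754_sqrtl as an inlining macro for internal callers; without it, the definition in this file would not produce the real symbol that __sqrtl_finite aliases. Schematically (the macro body here is an illustration, not the actual header):

    /* In math_private.h, roughly: */
    #define __ieee754_sqrtl(x) __builtin_sqrtl (x)   /* inline for callers */

    /* In e_sqrtl.c: */
    #undef __ieee754_sqrtl                  /* restore the plain identifier */
    long double __ieee754_sqrtl (long double x);   /* real, aliasable symbol */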
diff --git a/libc/sysdeps/i386/fpu/libm-test-ulps b/libc/sysdeps/i386/fpu/libm-test-ulps
index 4b1a9e734..ebd46b0df 100644
--- a/libc/sysdeps/i386/fpu/libm-test-ulps
+++ b/libc/sysdeps/i386/fpu/libm-test-ulps
@@ -640,6 +640,52 @@ double: 1
idouble: 1
ildouble: 1
ldouble: 1
+Test "jn (2, 2.4048255576957729) == 0.43175480701968038399746111312430703":
+float: 1
+ifloat: 1
+double: 1
+idouble: 1
+ldouble: 82
+ildouble: 82
+Test "jn (3, 2.4048255576957729) == 0.19899990535769083404042146764530813":
+ldouble: 186
+ildouble: 186
+Test "jn (4, 2.4048255576957729) == 0.647466661641779720084932282551219891E-1":
+ldouble: 185
+ildouble: 185
+Test: "jn (5, 2.4048255576957729) == 0.163892432048058525099230549946147698E-1":
+float: 1
+ifloat: 1
+double: 1
+idouble: 1
+ldouble: 249
+ildouble: 249
+Test "jn (6, 2.4048255576957729) == 0.34048184720278336646673682895929161E-2":
+float: 2
+ifloat: 2
+double: 1
+idouble: 1
+ldouble: 511
+ildouble: 511
+Test "jn (7, 2.4048255576957729) == 0.60068836573295394221291569249883076E-3":
+float: 2
+ifloat: 2
+double: 1
+idouble: 1
+ldouble: 428
+ildouble: 428
+Test "jn (8, 2.4048255576957729) == 0.92165786705344923232879022467054148E-4":
+float: 3
+ifloat: 3
+double: 1
+idouble: 1
+ldouble: 609
+ildouble: 609
+Test "jn (9, 2.4048255576957729) == 0.12517270977961513005428966643852564E-4":
+float: 4
+ifloat: 4
+ldouble: 750
+ildouble: 750
# lgamma
Test "lgamma (-0.5) == log(2*sqrt(pi))":
@@ -1168,11 +1214,11 @@ ldouble: 1
Function: "jn":
double: 5
-float: 2
+float: 4
idouble: 5
-ifloat: 2
-ildouble: 2
-ldouble: 2
+ifloat: 4
+ildouble: 750
+ldouble: 750
Function: "lgamma":
double: 1
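These entries record the maximum accepted error per type in ulps (units in the last place); the "i"-prefixed names are the variants with inline functions enabled, and the jump to 750 for ldouble reflects the new jn tests near the Bessel zero 2.4048..., where jn is known to lose accuracy. One ulp at a given value can be measured directly (sketch):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 0.43175480701968038;   /* jn (2, 2.4048...) reference */
      printf ("1 ulp at x = %g\n", nextafter (x, INFINITY) - x);
      return 0;
    }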
diff --git a/libc/sysdeps/i386/i686/fpu/e_log.S b/libc/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 000000000..73060b088
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_log)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+ fldln2 // log(2)
+ fldl 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__log_finite)
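fucomi is a P6-class instruction, hence the i686-specific file: it compares x with itself and sets PF only on an unordered result, so jp filters NaNs in two instructions before fyl2x, and __log_finite skips even that. The same guard in C (a sketch):

    #include <math.h>

    double
    log_guard_sketch (double x)
    {
      if (isnan (x))             /* fucomi %st; jp 3f */
        return x;                /* pop log(2), hand the NaN through */
      return M_LN2 * log2 (x);   /* the fyl2x path */
    }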
diff --git a/libc/sysdeps/i386/i686/fpu/e_logf.S b/libc/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 000000000..6fd39d50d
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+ .text
+ENTRY(__ieee754_logf)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+ fldln2 // log(2)
+ flds 4(%esp) // x : log(2)
+ fyl2x // log(x)
+ ret
+END(__logf_finite)
diff --git a/libc/sysdeps/i386/i686/fpu/e_logl.S b/libc/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 000000000..4e79a5a4b
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,81 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+#ifdef __ELF__
+ .section .rodata.cst8,"aM",@progbits,8
+#else
+ .text
+#endif
+ .p2align 3
+ ASM_TYPE_DIRECTIVE(one,@object)
+one: .double 1.0
+ ASM_SIZE_DIRECTIVE(one)
+ /* It is not important that this constant is precise. It is only
+ a value which is known to be on the safe side for using the
+ fyl2xp1 instruction. */
+ ASM_TYPE_DIRECTIVE(limit,@object)
+limit: .double 0.29
+ ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_logl)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+ fucomi %st
+ jp 3f
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2f
+ fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+
+2: fstp %st(0) // x : log(2)
+ fyl2x // log(x)
+ ret
+
+3: fstp %st(1)
+ ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+ fldln2 // log(2)
+ fldt 4(%esp) // x : log(2)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fld %st // x : x : log(2)
+ fsubl MO(one) // x-1 : x : log(2)
+ fld %st // x-1 : x-1 : x : log(2)
+ fabs // |x-1| : x-1 : x : log(2)
+ fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2)
+ fcomip %st(1) // |x-1| : x-1 : x : log(2)
+ fstp %st(0) // x-1 : x : log(2)
+ jc 2b
+ fstp %st(1) // x-1 : log(2)
+ fyl2xp1 // log(x)
+ ret
+END(__logl_finite)
diff --git a/libc/sysdeps/i386/i686/multiarch/Makefile b/libc/sysdeps/i386/i686/multiarch/Makefile
index c89ae9247..5f1853877 100644
--- a/libc/sysdeps/i386/i686/multiarch/Makefile
+++ b/libc/sysdeps/i386/i686/multiarch/Makefile
@@ -15,7 +15,11 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
- wcscmp-sse2 wcscmp-c
+ wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
+ memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+ rawmemchr-sse2 rawmemchr-sse2-bsf \
+ strnlen-sse2 strnlen-c wcslen-sse2 wcslen-c \
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 000000000..115a2192a
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,497 @@
+/* Optimized memchr with sse2
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2_bsf
+# endif
+
+ .text
+ENTRY (MEMCHR)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null_1)
+# endif
+ mov %ecx, %eax
+
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+
+ cmp $48, %ecx
+ ja L(crosscache)
+
+ movdqu (%eax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ je L(unaligned_no_match_1)
+/* Check which byte is a match. */
+ bsf %ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %ecx, %edx
+ jbe L(return_null_1)
+# endif
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+ sub $16, %edx
+ jbe L(return_null_1)
+ PUSH (%edi)
+ lea 16(%eax), %edi
+ and $15, %eax
+ and $-16, %edi
+ add %eax, %edx
+# else
+ lea 16(%eax), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(return_null_1):
+ xor %eax, %eax
+ ret
+
+# ifndef USE_AS_RAWMEMCHR
+ CFI_POP (%edi)
+# endif
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+
+# ifndef USE_AS_RAWMEMCHR
+ PUSH (%edi)
+ mov %eax, %edi
+ and $15, %ecx
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ mov %eax, %edx
+ and $15, %ecx
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ sar %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+/* Check which byte is a match. */
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ sub %eax, %edx
+ jbe L(return_null)
+ add %edi, %eax
+ add %ecx, %eax
+ RETURN
+# else
+ add %edx, %eax
+ add %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+ sub $16, %edx
+ add %ecx, %edx
+ jle L(return_null)
+ add $16, %edi
+# else
+ add $16, %edx
+# endif
+
+ .p2align 4
+/* Loop starts at an aligned address.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ test $0x3f, %edi
+# else
+ test $0x3f, %edx
+# endif
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm3
+# else
+ movdqa 48(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ test %eax, %eax
+ jnz L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+
+ pcmpeqb %xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ pmovmskb %xmm1, %eax
+ bsf %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 48(%edi, %eax), %eax
+ RETURN
+# else
+ lea 48(%edx, %eax), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 16(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ xor %eax, %eax
+ RETURN
+# endif
+ .p2align 4
+L(matches0):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea -16(%eax, %edi), %eax
+ RETURN
+# else
+ lea -16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ add %edi, %eax
+ RETURN
+# else
+ add %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches16):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 16(%eax, %edi), %eax
+ RETURN
+# else
+ lea 16(%eax, %edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(matches32):
+ bsf %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ lea 32(%eax, %edi), %eax
+ RETURN
+# else
+ lea 32(%eax, %edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(matches_1):
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ add %edi, %eax
+ RETURN
+
+ .p2align 4
+L(matches16_1):
+ sub $16, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 16(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches32_1):
+ sub $32, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 32(%edi, %eax), %eax
+ RETURN
+
+ .p2align 4
+L(matches48_1):
+ sub $48, %edx
+ bsf %eax, %eax
+ sub %eax, %edx
+ jbe L(return_null)
+
+ lea 48(%edi, %eax), %eax
+ RETURN
+# endif
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
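The probe pattern used throughout this file: two punpcklbw plus pshufd $0 broadcast the search byte into all 16 lanes of %xmm1, pcmpeqb sets matching lanes to 0xff, pmovmskb collapses the lane signs into a 16-bit mask, and bsf gives the first match offset. An equivalent single probe with SSE2 intrinsics (a sketch, not the library code; assumes 16 readable bytes at p, with GCC's __builtin_ctz standing in for bsf):

    #include <emmintrin.h>
    #include <stddef.h>

    static const unsigned char *
    probe16 (const unsigned char *p, int c)
    {
      __m128i needle = _mm_set1_epi8 ((char) c);  /* punpcklbw + pshufd */
      __m128i data = _mm_loadu_si128 ((const __m128i *) p);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (data, needle));
      if (mask == 0)
        return NULL;                              /* no match in block */
      return p + __builtin_ctz (mask);            /* bsf */
    }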
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 000000000..63d1d5d7b
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,706 @@
+/* Optimized memchr with sse2, without bsf
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+# define ENTRANCE PUSH(%edi);
+# define PARMS 8
+# define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+# else
+# define ENTRANCE
+# define PARMS 4
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+# define LEN STR2+4
+# endif
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_sse2
+# endif
+
+ atom_text_section
+ENTRY (MEMCHR)
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null)
+# endif
+
+ punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+ mov %ecx, %edi
+# else
+ mov %ecx, %edx
+# endif
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+ cmp $48, %ecx
+ ja L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqu (%edi), %xmm0
+# else
+ movdqu (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog)
+
+ sub $16, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+ and $15, %ecx
+ and $-16, %edi
+ add %ecx, %edx
+# else
+ jnz L(match_case1_prolog)
+ lea 16(%edx), %edx
+ and $-16, %edx
+# endif
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(crosscache):
+ and $15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+# else
+ and $-16, %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ sar %cl, %eax
+ test %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(match_case2_prolog1)
+ lea -16(%edx), %edx
+ add %ecx, %edx
+ jle L(return_null)
+ lea 16(%edi), %edi
+# else
+ jnz L(match_case1_prolog1)
+ lea 16(%edx), %edx
+# endif
+
+ .p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+# else
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%edi), %xmm0
+# else
+ lea 64(%edx), %edx
+ movdqa (%edx), %xmm0
+# endif
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 16(%edi), %xmm2
+# else
+ movdqa 16(%edx), %xmm2
+# endif
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ lea 64(%edi), %edi
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+# else
+ lea 64(%edx), %edx
+ and $-64, %edx
+# endif
+
+ .p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+# else
+ movdqa (%edx), %xmm0
+ movdqa 16(%edx), %xmm2
+ movdqa 32(%edx), %xmm3
+ movdqa 48(%edx), %xmm4
+# endif
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+ add $64, %edi
+# else
+ add $64, %edx
+# endif
+ pmovmskb %xmm4, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+ sub $64, %edi
+# else
+ sub $64, %edx
+# endif
+
+ pmovmskb %xmm0, %eax
+ xor %ecx, %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ pmovmskb %xmm2, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ movdqa 32(%edi), %xmm3
+# else
+ movdqa 32(%edx), %xmm3
+# endif
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+ pcmpeqb 48(%edi), %xmm1
+# else
+ pcmpeqb 48(%edx), %xmm1
+# endif
+ pmovmskb %xmm1, %eax
+ lea 16(%ecx), %ecx
+
+ .p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+ add %ecx, %edi
+# else
+L(match_case1_prolog1):
+ add %ecx, %edx
+L(match_case1_prolog):
+# endif
+ test %al, %al
+ jz L(match_case1_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case1_8)
+ test $0x01, %al
+ jnz L(ExitCase1_1)
+ test $0x02, %al
+ jnz L(ExitCase1_2)
+ test $0x04, %al
+ jnz L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+ lea 3(%edi), %eax
+ RETURN
+# else
+ lea 3(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_8):
+ test $0x10, %al
+ jnz L(ExitCase1_5)
+ test $0x20, %al
+ jnz L(ExitCase1_6)
+ test $0x40, %al
+ jnz L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+ lea 7(%edi), %eax
+ RETURN
+# else
+ lea 7(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case1_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase1_9)
+ test $0x02, %ah
+ jnz L(ExitCase1_10)
+ test $0x04, %ah
+ jnz L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+ lea 11(%edi), %eax
+ RETURN
+# else
+ lea 11(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(match_case1_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase1_13)
+ test $0x20, %ah
+ jnz L(ExitCase1_14)
+ test $0x40, %ah
+ jnz L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+ lea 15(%edi), %eax
+ RETURN
+# else
+ lea 15(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $32, %edx
+ jbe L(return_null)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+
+ xor %eax, %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+ mov %edi, %eax
+ RETURN
+# else
+ mov %edx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+ lea 1(%edi), %eax
+ RETURN
+# else
+ lea 1(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+ lea 2(%edi), %eax
+ RETURN
+# else
+ lea 2(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+ lea 4(%edi), %eax
+ RETURN
+# else
+ lea 4(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+ lea 5(%edi), %eax
+ RETURN
+# else
+ lea 5(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+ lea 6(%edi), %eax
+ RETURN
+# else
+ lea 6(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+ lea 8(%edi), %eax
+ RETURN
+# else
+ lea 8(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+ lea 9(%edi), %eax
+ RETURN
+# else
+ lea 9(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+ lea 10(%edi), %eax
+ RETURN
+# else
+ lea 10(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+ lea 12(%edi), %eax
+ RETURN
+# else
+ lea 12(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+ lea 13(%edi), %eax
+ RETURN
+# else
+ lea 13(%edx), %eax
+ ret
+# endif
+
+ .p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+ lea 14(%edi), %eax
+ RETURN
+# else
+ lea 14(%edx), %eax
+ ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+ .p2align 4
+L(match_case2):
+ sub %ecx, %edx
+L(match_case2_prolog1):
+ add %ecx, %edi
+L(match_case2_prolog):
+ test %al, %al
+ jz L(match_case2_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case2_8)
+ test $0x01, %al
+ jnz L(ExitCase2_1)
+ test $0x02, %al
+ jnz L(ExitCase2_2)
+ test $0x04, %al
+ jnz L(ExitCase2_3)
+ sub $4, %edx
+ jb L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_8):
+ test $0x10, %al
+ jnz L(ExitCase2_5)
+ test $0x20, %al
+ jnz L(ExitCase2_6)
+ test $0x40, %al
+ jnz L(ExitCase2_7)
+ sub $8, %edx
+ jb L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case2_high_8)
+ test $0x01, %ah
+ jnz L(ExitCase2_9)
+ test $0x02, %ah
+ jnz L(ExitCase2_10)
+ test $0x04, %ah
+ jnz L(ExitCase2_11)
+ sub $12, %edx
+ jb L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high_8):
+ test $0x10, %ah
+ jnz L(ExitCase2_13)
+ test $0x20, %ah
+ jnz L(ExitCase2_14)
+ test $0x40, %ah
+ jnz L(ExitCase2_15)
+ sub $16, %edx
+ jb L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_1):
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_2):
+ sub $2, %edx
+ jb L(return_null)
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_3):
+ sub $3, %edx
+ jb L(return_null)
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_5):
+ sub $5, %edx
+ jb L(return_null)
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_6):
+ sub $6, %edx
+ jb L(return_null)
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_7):
+ sub $7, %edx
+ jb L(return_null)
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_9):
+ sub $9, %edx
+ jb L(return_null)
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_10):
+ sub $10, %edx
+ jb L(return_null)
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_11):
+ sub $11, %edx
+ jb L(return_null)
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_13):
+ sub $13, %edx
+ jb L(return_null)
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_14):
+ sub $14, %edx
+ jb L(return_null)
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(ExitCase2_15):
+ sub $15, %edx
+ jb L(return_null)
+ lea 14(%edi), %eax
+ RETURN
+# endif
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ RETURN
+# else
+ ret
+# endif
+
+END (MEMCHR)
+#endif
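This bsf-free variant exists for CPUs (Atom) where bit scans are slow; the dispatcher in memchr.S below selects it via bit_Slow_BSF. Instead of bsf, the match_case1/match_case2 blocks narrow the pmovmskb result with single-bit tests on %al/%ah. The equivalent decode in C (a sketch of the technique, not the exact branch layout):

    /* Lowest set bit of a non-zero 16-bit pmovmskb mask, found by
       halving tests instead of a bit-scan instruction.  */
    static int
    lowest_match (unsigned int mask)
    {
      int i = 0;
      if ((mask & 0xff) == 0) { mask >>= 8; i += 8; }  /* test %al/%ah */
      if ((mask & 0x0f) == 0) { mask >>= 4; i += 4; }  /* and $15      */
      if (mask & 0x1) return i;                        /* test $0x01   */
      if (mask & 0x2) return i + 1;                    /* test $0x02   */
      if (mask & 0x4) return i + 2;                    /* test $0x04   */
      return i + 3;
    }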
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr.S b/libc/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 000000000..163a83e17
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of memchr
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+ .text
+ENTRY(__memchr)
+ .type __memchr, @gnu_indirect_function
+ pushl %ebx
+ CFI_PUSH (%ebx)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+ jz 3f
+
+ leal __memchr_sse2@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+2: leal __memchr_ia32@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+3: leal __memchr_sse2_bsf@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memchr_ia32, @function; \
+ .globl __memchr_ia32; \
+ .p2align 4; \
+ __memchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they would be called without setting up EBX, which is needed for the
+   PLT that IFUNC uses.  */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index b1ed778f1..1f5dbc15c 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.2
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,84 +20,97 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_2
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_2
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1 + 4
+# define LEN BLK2 + 4
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
-#ifdef SHARED
-# define JMPTBL(I, B) I - B
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table and adjuested EDX/ESI. Go. */ \
- jmp *%ebx
-
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
- ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
- movl (%esp), %ebx
- ret
-#else
-# define JMPTBL(I, B) I
+   jump table with relative offsets. INDEX is a register that contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjusted EDX/ESI. Go. */ \
+ jmp *%ebx
+# else
+# define JMPTBL(I, B) I
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-#endif
+   jump table with relative offsets. INDEX is a register that contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
movl BLK1(%esp), %eax
movl BLK2(%esp), %edx
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+# else
cmp $1, %ecx
jbe L(less1bytes)
+# endif
+
pxor %xmm0, %xmm0
cmp $64, %ecx
ja L(64bytesormore)
cmp $8, %ecx
- PUSH (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+# else
jb L(less8bytes)
+ PUSH (%ebx)
+# endif
+
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less8bytes):
mov (%eax), %bl
cmpb (%edx), %bl
@@ -141,22 +154,49 @@ L(less8bytes):
mov 6(%eax), %bl
cmpb 6(%edx), %bl
je L(0bytes)
+
L(nonzero):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(above)
neg %eax
L(above):
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+ .p2align 4
L(0bytes):
- POP (%ebx)
+ POP (%ebx)
xor %eax, %eax
ret
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* For wmemcmp, the case N == 1.  */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(0bytesend)
movzbl (%eax), %eax
@@ -164,14 +204,14 @@ L(less1bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(0bytesend):
xor %eax, %eax
ret
-
- ALIGN (4)
+# endif
+ .p2align 4
L(64bytesormore):
- PUSH (%ebx)
+ PUSH (%ebx)
mov %ecx, %ebx
mov $64, %ecx
sub $64, %ebx
@@ -208,7 +248,14 @@ L(64bytesormore_loop):
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* Label needed only to fill table_64bytes.  */
+L(unreal_case):
+/* no code here */
+
+# endif
+ .p2align 4
L(find_16diff):
sub $16, %ecx
L(find_32diff):
@@ -218,9 +265,9 @@ L(find_48diff):
L(find_64diff):
add %ecx, %edx
add %ecx, %eax
- jmp L(16bytes)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(16bytes):
mov -16(%eax), %ecx
mov -16(%edx), %ebx
@@ -243,8 +290,30 @@ L(4bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(49bytes):
movdqu -49(%eax), %xmm1
movdqu -49(%edx), %xmm2
@@ -285,7 +354,7 @@ L(5bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(50bytes):
mov $-50, %ebx
movdqu -50(%eax), %xmm1
@@ -330,7 +399,7 @@ L(2bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(51bytes):
mov $-51, %ebx
movdqu -51(%eax), %xmm1
@@ -378,8 +447,8 @@ L(1bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(52bytes):
movdqu -52(%eax), %xmm1
movdqu -52(%edx), %xmm2
@@ -402,13 +471,18 @@ L(20bytes):
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(53bytes):
movdqu -53(%eax), %xmm1
movdqu -53(%edx), %xmm2
@@ -440,7 +514,7 @@ L(21bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(54bytes):
movdqu -54(%eax), %xmm1
movdqu -54(%edx), %xmm2
@@ -476,7 +550,7 @@ L(22bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(55bytes):
movdqu -55(%eax), %xmm1
movdqu -55(%edx), %xmm2
@@ -513,8 +587,8 @@ L(23bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(56bytes):
movdqu -56(%eax), %xmm1
movdqu -56(%edx), %xmm2
@@ -538,18 +612,27 @@ L(24bytes):
jnc L(less16bytes)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(57bytes):
movdqu -57(%eax), %xmm1
movdqu -57(%edx), %xmm2
@@ -585,7 +668,7 @@ L(25bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(58bytes):
movdqu -58(%eax), %xmm1
movdqu -58(%edx), %xmm2
@@ -627,7 +710,7 @@ L(26bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(59bytes):
movdqu -59(%eax), %xmm1
movdqu -59(%edx), %xmm2
@@ -668,8 +751,8 @@ L(27bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(60bytes):
movdqu -60(%eax), %xmm1
movdqu -60(%edx), %xmm2
@@ -691,22 +774,38 @@ L(28bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(61bytes):
movdqu -61(%eax), %xmm1
movdqu -61(%edx), %xmm2
@@ -749,7 +848,7 @@ L(29bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(62bytes):
movdqu -62(%eax), %xmm1
movdqu -62(%edx), %xmm2
@@ -792,7 +891,7 @@ L(30bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(63bytes):
movdqu -63(%eax), %xmm1
movdqu -63(%edx), %xmm2
@@ -838,8 +937,9 @@ L(31bytes):
mov $0, %eax
jne L(end)
RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(64bytes):
movdqu -64(%eax), %xmm1
movdqu -64(%edx), %xmm2
@@ -863,28 +963,45 @@ L(32bytes):
jnc L(less16bytes)
mov -16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -16(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -16(%edx), %ecx
+# endif
jne L(find_diff)
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
@@ -910,9 +1027,35 @@ L(less16bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(find_diff):
+# ifndef USE_AS_WMEMCMP
cmpb %bl, %cl
jne L(end)
cmp %bx, %cx
@@ -923,17 +1066,29 @@ L(find_diff):
jne L(end)
cmp %bx, %cx
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.2,"a",@progbits
- ALIGN (2)
+ .p2align 2
.type L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1000,5 +1155,72 @@ L(table_64bytes):
.int JMPTBL (L(62bytes), L(table_64bytes))
.int JMPTBL (L(63bytes), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
- .size L(table_64bytes), .-L(table_64bytes)
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# endif
#endif
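One source now yields both memcmp and wmemcmp: under USE_AS_WMEMCMP the element count is scaled to bytes (shl $2), only the multiple-of-four jump-table slots are reachable (the rest point at the empty unreal_case label), and the ja/jb ordering branches become jg/jl, because wmemcmp orders by signed wchar_t while memcmp orders by unsigned byte. Scalar reference versions of the two orderings (a sketch of the semantics the "Warning!" comment describes, not the SSE4.2 code):

    #include <stddef.h>

    static int
    memcmp_ref (const void *a, const void *b, size_t n)
    {
      const unsigned char *p = a, *q = b;   /* UNSIGNED elements */
      for (size_t i = 0; i < n; i++)
        if (p[i] != q[i])
          return p[i] < q[i] ? -1 : 1;      /* ja/jb in the asm */
      return 0;
    }

    static int
    wmemcmp_ref (const int *a, const int *b, size_t n)
    {
      /* wchar_t is int on i386, compared SIGNED.  */
      for (size_t i = 0; i < n; i++)
        if (a[i] != b[i])
          return a[i] < b[i] ? -1 : 1;      /* jg/jl in the asm */
      return 0;
    }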
diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15fe5..eab85c1de 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,47 +20,64 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
- .section .text.ssse3,"ax",@progbits
+ atom_text_section
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
- PUSH (%ebx)
+# endif
+
+ PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
- ALIGN (4)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
neg %eax
L(1bytesend):
ret
+# endif
- ALIGN (4)
+ .p2align 4
L(zero):
- mov $0, %eax
+ xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
cfi_remember_state
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
movl %eax, %edi
movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
lea 16(%edi), %edi
- sub $0xffff, %edx
+ sub $0xffff, %edx
lea 16(%esi), %esi
- jnz L(less16bytes)
+ jnz L(less16bytes)
mov %edi, %edx
and $0xf, %edx
xor %edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
jz L(shr_0)
xor %edx, %esi
+# ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
- ALIGN (4)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
+
L(first16bytes):
add %eax, %esi
L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@@ -1492,61 +1531,61 @@ L(less16bytes):
test $0x40, %dl
jnz L(Byte22)
L(Byte23):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte16):
- movzbl -16(%edi), %eax
- movzbl -16(%esi), %edx
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte17):
- movzbl -15(%edi), %eax
- movzbl -15(%esi), %edx
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte18):
- movzbl -14(%edi), %eax
- movzbl -14(%esi), %edx
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte19):
- movzbl -13(%edi), %eax
- movzbl -13(%esi), %edx
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte20):
- movzbl -12(%edi), %eax
- movzbl -12(%esi), %edx
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte21):
- movzbl -11(%edi), %eax
- movzbl -11(%esi), %edx
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte22):
- movzbl -10(%edi), %eax
- movzbl -10(%esi), %edx
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
- ALIGN (4)
+ .p2align 4
L(Byte31):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
+# else
+
+/* Special exit path for wmemcmp.  */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
CFI_PUSH (%ebx)
- ALIGN (4)
+
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
+# ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
+# ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
+# ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
+# ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
+# ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-
- ALIGN (4)
+ .p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
cmp %ebx, %ecx
mov $0, %eax
jne L(find_diff)
- POP (%ebx)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
cmp -1(%edx), %cl
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
cmp %bh, %ch
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
cmpb -1(%edx), %al
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
+
+ .p2align 4
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
-END (MEMCMP)
+/* find_diff variant for wmemcmp.  */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
#endif
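Every shr_N block above relies on the same 16-byte equality idiom: pcmpeqb sets each matching lane to 0xff, pmovmskb packs the 16 lane signs into the low 16 bits of a register, and sub $0xffff leaves zero exactly when all lanes matched, so jnz branches straight to the mismatch decode. The idiom with intrinsics (a sketch; unaligned loads here, where the asm carefully arranges aligned ones):

    #include <emmintrin.h>

    static int
    blocks16_equal (const void *a, const void *b)
    {
      __m128i x = _mm_loadu_si128 ((const __m128i *) a);
      __m128i y = _mm_loadu_si128 ((const __m128i *) b);
      /* pcmpeqb; pmovmskb; sub $0xffff -> zero iff all 16 equal.  */
      return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, y)) - 0xffff) == 0;
    }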
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index f64f8d214..26471fc0e 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -1,5 +1,5 @@
/* memcpy with SSSE3
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -235,7 +235,7 @@ L(shl_0_end):
add %edi, %edx
add %edi, %eax
POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
CFI_PUSH (%edi)
L(shl_0_gobble):
@@ -385,7 +385,7 @@ L(shl_0_mem_less_32bytes):
L(shl_0_mem_less_16bytes):
add %ecx, %edx
add %ecx, %eax
- BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
cfi_restore_state
cfi_remember_state
@@ -1065,38 +1065,48 @@ L(shl_15_end):
ALIGN (4)
L(fwd_write_44bytes):
- movl -44(%eax), %ecx
- movl %ecx, -44(%edx)
-L(fwd_write_40bytes):
- movl -40(%eax), %ecx
- movl %ecx, -40(%edx)
+ movq -44(%eax), %xmm0
+ movq %xmm0, -44(%edx)
L(fwd_write_36bytes):
- movl -36(%eax), %ecx
- movl %ecx, -36(%edx)
-L(fwd_write_32bytes):
- movl -32(%eax), %ecx
- movl %ecx, -32(%edx)
+ movq -36(%eax), %xmm0
+ movq %xmm0, -36(%edx)
L(fwd_write_28bytes):
- movl -28(%eax), %ecx
- movl %ecx, -28(%edx)
-L(fwd_write_24bytes):
- movl -24(%eax), %ecx
- movl %ecx, -24(%edx)
+ movq -28(%eax), %xmm0
+ movq %xmm0, -28(%edx)
L(fwd_write_20bytes):
- movl -20(%eax), %ecx
- movl %ecx, -20(%edx)
-L(fwd_write_16bytes):
- movl -16(%eax), %ecx
- movl %ecx, -16(%edx)
+ movq -20(%eax), %xmm0
+ movq %xmm0, -20(%edx)
L(fwd_write_12bytes):
- movl -12(%eax), %ecx
- movl %ecx, -12(%edx)
-L(fwd_write_8bytes):
- movl -8(%eax), %ecx
- movl %ecx, -8(%edx)
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
L(fwd_write_4bytes):
movl -4(%eax), %ecx
movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_40bytes):
+ movq -40(%eax), %xmm0
+ movq %xmm0, -40(%edx)
+L(fwd_write_32bytes):
+ movq -32(%eax), %xmm0
+ movq %xmm0, -32(%edx)
+L(fwd_write_24bytes):
+ movq -24(%eax), %xmm0
+ movq %xmm0, -24(%edx)
+L(fwd_write_16bytes):
+ movq -16(%eax), %xmm0
+ movq %xmm0, -16(%edx)
+L(fwd_write_8bytes):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
@@ -1124,37 +1134,49 @@ L(fwd_write_5bytes):
ALIGN (4)
L(fwd_write_45bytes):
- movl -45(%eax), %ecx
- movl %ecx, -45(%edx)
-L(fwd_write_41bytes):
- movl -41(%eax), %ecx
- movl %ecx, -41(%edx)
+ movq -45(%eax), %xmm0
+ movq %xmm0, -45(%edx)
L(fwd_write_37bytes):
- movl -37(%eax), %ecx
- movl %ecx, -37(%edx)
-L(fwd_write_33bytes):
- movl -33(%eax), %ecx
- movl %ecx, -33(%edx)
+ movq -37(%eax), %xmm0
+ movq %xmm0, -37(%edx)
L(fwd_write_29bytes):
- movl -29(%eax), %ecx
- movl %ecx, -29(%edx)
-L(fwd_write_25bytes):
- movl -25(%eax), %ecx
- movl %ecx, -25(%edx)
+ movq -29(%eax), %xmm0
+ movq %xmm0, -29(%edx)
L(fwd_write_21bytes):
- movl -21(%eax), %ecx
- movl %ecx, -21(%edx)
-L(fwd_write_17bytes):
- movl -17(%eax), %ecx
- movl %ecx, -17(%edx)
+ movq -21(%eax), %xmm0
+ movq %xmm0, -21(%edx)
L(fwd_write_13bytes):
- movl -13(%eax), %ecx
- movl %ecx, -13(%edx)
-L(fwd_write_9bytes):
- movl -9(%eax), %ecx
- movl %ecx, -9(%edx)
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
movl -5(%eax), %ecx
movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_41bytes):
+ movq -41(%eax), %xmm0
+ movq %xmm0, -41(%edx)
+L(fwd_write_33bytes):
+ movq -33(%eax), %xmm0
+ movq %xmm0, -33(%edx)
+L(fwd_write_25bytes):
+ movq -25(%eax), %xmm0
+ movq %xmm0, -25(%edx)
+L(fwd_write_17bytes):
+ movq -17(%eax), %xmm0
+ movq %xmm0, -17(%edx)
+L(fwd_write_9bytes):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
L(fwd_write_1bytes):
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
@@ -1169,38 +1191,50 @@ L(fwd_write_1bytes):
ALIGN (4)
L(fwd_write_46bytes):
- movl -46(%eax), %ecx
- movl %ecx, -46(%edx)
-L(fwd_write_42bytes):
- movl -42(%eax), %ecx
- movl %ecx, -42(%edx)
+ movq -46(%eax), %xmm0
+ movq %xmm0, -46(%edx)
L(fwd_write_38bytes):
- movl -38(%eax), %ecx
- movl %ecx, -38(%edx)
-L(fwd_write_34bytes):
- movl -34(%eax), %ecx
- movl %ecx, -34(%edx)
+ movq -38(%eax), %xmm0
+ movq %xmm0, -38(%edx)
L(fwd_write_30bytes):
- movl -30(%eax), %ecx
- movl %ecx, -30(%edx)
-L(fwd_write_26bytes):
- movl -26(%eax), %ecx
- movl %ecx, -26(%edx)
+ movq -30(%eax), %xmm0
+ movq %xmm0, -30(%edx)
L(fwd_write_22bytes):
- movl -22(%eax), %ecx
- movl %ecx, -22(%edx)
-L(fwd_write_18bytes):
- movl -18(%eax), %ecx
- movl %ecx, -18(%edx)
+ movq -22(%eax), %xmm0
+ movq %xmm0, -22(%edx)
L(fwd_write_14bytes):
- movl -14(%eax), %ecx
- movl %ecx, -14(%edx)
-L(fwd_write_10bytes):
- movl -10(%eax), %ecx
- movl %ecx, -10(%edx)
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
L(fwd_write_6bytes):
movl -6(%eax), %ecx
movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_42bytes):
+ movq -42(%eax), %xmm0
+ movq %xmm0, -42(%edx)
+L(fwd_write_34bytes):
+ movq -34(%eax), %xmm0
+ movq %xmm0, -34(%edx)
+L(fwd_write_26bytes):
+ movq -26(%eax), %xmm0
+ movq %xmm0, -26(%edx)
+L(fwd_write_18bytes):
+ movq -18(%eax), %xmm0
+ movq %xmm0, -18(%edx)
+L(fwd_write_10bytes):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
L(fwd_write_2bytes):
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
@@ -1215,38 +1249,52 @@ L(fwd_write_2bytes):
ALIGN (4)
L(fwd_write_47bytes):
- movl -47(%eax), %ecx
- movl %ecx, -47(%edx)
-L(fwd_write_43bytes):
- movl -43(%eax), %ecx
- movl %ecx, -43(%edx)
+ movq -47(%eax), %xmm0
+ movq %xmm0, -47(%edx)
L(fwd_write_39bytes):
- movl -39(%eax), %ecx
- movl %ecx, -39(%edx)
-L(fwd_write_35bytes):
- movl -35(%eax), %ecx
- movl %ecx, -35(%edx)
+ movq -39(%eax), %xmm0
+ movq %xmm0, -39(%edx)
L(fwd_write_31bytes):
- movl -31(%eax), %ecx
- movl %ecx, -31(%edx)
-L(fwd_write_27bytes):
- movl -27(%eax), %ecx
- movl %ecx, -27(%edx)
+ movq -31(%eax), %xmm0
+ movq %xmm0, -31(%edx)
L(fwd_write_23bytes):
- movl -23(%eax), %ecx
- movl %ecx, -23(%edx)
-L(fwd_write_19bytes):
- movl -19(%eax), %ecx
- movl %ecx, -19(%edx)
+ movq -23(%eax), %xmm0
+ movq %xmm0, -23(%edx)
L(fwd_write_15bytes):
- movl -15(%eax), %ecx
- movl %ecx, -15(%edx)
-L(fwd_write_11bytes):
- movl -11(%eax), %ecx
- movl %ecx, -11(%edx)
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
L(fwd_write_7bytes):
movl -7(%eax), %ecx
movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_43bytes):
+ movq -43(%eax), %xmm0
+ movq %xmm0, -43(%edx)
+L(fwd_write_35bytes):
+ movq -35(%eax), %xmm0
+ movq %xmm0, -35(%edx)
+L(fwd_write_27bytes):
+ movq -27(%eax), %xmm0
+ movq %xmm0, -27(%edx)
+L(fwd_write_19bytes):
+ movq -19(%eax), %xmm0
+ movq %xmm0, -19(%edx)
+L(fwd_write_11bytes):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
L(fwd_write_3bytes):
movzwl -3(%eax), %ecx
movzbl -1(%eax), %eax
@@ -1259,6 +1307,356 @@ L(fwd_write_3bytes):
movl DEST(%esp), %eax
# endif
#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_40bytes_align):
+ movdqa -40(%eax), %xmm0
+ movdqa %xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+ movdqa -24(%eax), %xmm0
+ movdqa %xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+ movq -8(%eax), %xmm0
+ movq %xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_32bytes_align):
+ movdqa -32(%eax), %xmm0
+ movdqa %xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+ movdqa -16(%eax), %xmm0
+ movdqa %xmm0, -16(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_5bytes_align):
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_45bytes_align):
+ movdqa -45(%eax), %xmm0
+ movdqa %xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+ movdqa -29(%eax), %xmm0
+ movdqa %xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+ movq -13(%eax), %xmm0
+ movq %xmm0, -13(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_37bytes_align):
+ movdqa -37(%eax), %xmm0
+ movdqa %xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+ movdqa -21(%eax), %xmm0
+ movdqa %xmm0, -21(%edx)
+ movl -5(%eax), %ecx
+ movl %ecx, -5(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_41bytes_align):
+ movdqa -41(%eax), %xmm0
+ movdqa %xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+ movdqa -25(%eax), %xmm0
+ movdqa %xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+ movq -9(%eax), %xmm0
+ movq %xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_33bytes_align):
+ movdqa -33(%eax), %xmm0
+ movdqa %xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+ movdqa -17(%eax), %xmm0
+ movdqa %xmm0, -17(%edx)
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_46bytes_align):
+ movdqa -46(%eax), %xmm0
+ movdqa %xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+ movdqa -30(%eax), %xmm0
+ movdqa %xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+ movq -14(%eax), %xmm0
+ movq %xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_38bytes_align):
+ movdqa -38(%eax), %xmm0
+ movdqa %xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+ movdqa -22(%eax), %xmm0
+ movdqa %xmm0, -22(%edx)
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_42bytes_align):
+ movdqa -42(%eax), %xmm0
+ movdqa %xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+ movdqa -26(%eax), %xmm0
+ movdqa %xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+ movq -10(%eax), %xmm0
+ movq %xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_34bytes_align):
+ movdqa -34(%eax), %xmm0
+ movdqa %xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+ movdqa -18(%eax), %xmm0
+ movdqa %xmm0, -18(%edx)
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_47bytes_align):
+ movdqa -47(%eax), %xmm0
+ movdqa %xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+ movdqa -31(%eax), %xmm0
+ movdqa %xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+ movq -15(%eax), %xmm0
+ movq %xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_39bytes_align):
+ movdqa -39(%eax), %xmm0
+ movdqa %xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+ movdqa -23(%eax), %xmm0
+ movdqa %xmm0, -23(%edx)
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_43bytes_align):
+ movdqa -43(%eax), %xmm0
+ movdqa %xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+ movdqa -27(%eax), %xmm0
+ movdqa %xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+ movq -11(%eax), %xmm0
+ movq %xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_35bytes_align):
+ movdqa -35(%eax), %xmm0
+ movdqa %xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+ movdqa -19(%eax), %xmm0
+ movdqa %xmm0, -19(%edx)
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_44bytes_align):
+ movdqa -44(%eax), %xmm0
+ movdqa %xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+ movdqa -28(%eax), %xmm0
+ movdqa %xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+ movq -12(%eax), %xmm0
+ movq %xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_36bytes_align):
+ movdqa -36(%eax), %xmm0
+ movdqa %xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+ movdqa -20(%eax), %xmm0
+ movdqa %xmm0, -20(%edx)
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax
+# else
+ movl DEST(%esp), %eax
+# endif
+#endif
RETURN_END
cfi_restore_state
@@ -1330,35 +1728,20 @@ L(large_page_less_32bytes):
ALIGN (4)
L(bk_write_44bytes):
- movl 40(%eax), %ecx
- movl %ecx, 40(%edx)
-L(bk_write_40bytes):
- movl 36(%eax), %ecx
- movl %ecx, 36(%edx)
+ movq 36(%eax), %xmm0
+ movq %xmm0, 36(%edx)
L(bk_write_36bytes):
- movl 32(%eax), %ecx
- movl %ecx, 32(%edx)
-L(bk_write_32bytes):
- movl 28(%eax), %ecx
- movl %ecx, 28(%edx)
+ movq 28(%eax), %xmm0
+ movq %xmm0, 28(%edx)
L(bk_write_28bytes):
- movl 24(%eax), %ecx
- movl %ecx, 24(%edx)
-L(bk_write_24bytes):
- movl 20(%eax), %ecx
- movl %ecx, 20(%edx)
+ movq 20(%eax), %xmm0
+ movq %xmm0, 20(%edx)
L(bk_write_20bytes):
- movl 16(%eax), %ecx
- movl %ecx, 16(%edx)
-L(bk_write_16bytes):
- movl 12(%eax), %ecx
- movl %ecx, 12(%edx)
+ movq 12(%eax), %xmm0
+ movq %xmm0, 12(%edx)
L(bk_write_12bytes):
- movl 8(%eax), %ecx
- movl %ecx, 8(%edx)
-L(bk_write_8bytes):
- movl 4(%eax), %ecx
- movl %ecx, 4(%edx)
+ movq 4(%eax), %xmm0
+ movq %xmm0, 4(%edx)
L(bk_write_4bytes):
movl (%eax), %ecx
movl %ecx, (%edx)
@@ -1373,36 +1756,46 @@ L(bk_write_0bytes):
RETURN
ALIGN (4)
+L(bk_write_40bytes):
+ movq 32(%eax), %xmm0
+ movq %xmm0, 32(%edx)
+L(bk_write_32bytes):
+ movq 24(%eax), %xmm0
+ movq %xmm0, 24(%edx)
+L(bk_write_24bytes):
+ movq 16(%eax), %xmm0
+ movq %xmm0, 16(%edx)
+L(bk_write_16bytes):
+ movq 8(%eax), %xmm0
+ movq %xmm0, 8(%edx)
+L(bk_write_8bytes):
+ movq (%eax), %xmm0
+ movq %xmm0, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
L(bk_write_45bytes):
- movl 41(%eax), %ecx
- movl %ecx, 41(%edx)
-L(bk_write_41bytes):
- movl 37(%eax), %ecx
- movl %ecx, 37(%edx)
+ movq 37(%eax), %xmm0
+ movq %xmm0, 37(%edx)
L(bk_write_37bytes):
- movl 33(%eax), %ecx
- movl %ecx, 33(%edx)
-L(bk_write_33bytes):
- movl 29(%eax), %ecx
- movl %ecx, 29(%edx)
+ movq 29(%eax), %xmm0
+ movq %xmm0, 29(%edx)
L(bk_write_29bytes):
- movl 25(%eax), %ecx
- movl %ecx, 25(%edx)
-L(bk_write_25bytes):
- movl 21(%eax), %ecx
- movl %ecx, 21(%edx)
+ movq 21(%eax), %xmm0
+ movq %xmm0, 21(%edx)
L(bk_write_21bytes):
- movl 17(%eax), %ecx
- movl %ecx, 17(%edx)
-L(bk_write_17bytes):
- movl 13(%eax), %ecx
- movl %ecx, 13(%edx)
+ movq 13(%eax), %xmm0
+ movq %xmm0, 13(%edx)
L(bk_write_13bytes):
- movl 9(%eax), %ecx
- movl %ecx, 9(%edx)
-L(bk_write_9bytes):
- movl 5(%eax), %ecx
- movl %ecx, 5(%edx)
+ movq 5(%eax), %xmm0
+ movq %xmm0, 5(%edx)
L(bk_write_5bytes):
movl 1(%eax), %ecx
movl %ecx, 1(%edx)
@@ -1419,39 +1812,78 @@ L(bk_write_1bytes):
RETURN
ALIGN (4)
+L(bk_write_41bytes):
+ movq 33(%eax), %xmm0
+ movq %xmm0, 33(%edx)
+L(bk_write_33bytes):
+ movq 25(%eax), %xmm0
+ movq %xmm0, 25(%edx)
+L(bk_write_25bytes):
+ movq 17(%eax), %xmm0
+ movq %xmm0, 17(%edx)
+L(bk_write_17bytes):
+ movq 9(%eax), %xmm0
+ movq %xmm0, 9(%edx)
+L(bk_write_9bytes):
+ movq 1(%eax), %xmm0
+ movq %xmm0, 1(%edx)
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
L(bk_write_46bytes):
- movl 42(%eax), %ecx
- movl %ecx, 42(%edx)
-L(bk_write_42bytes):
- movl 38(%eax), %ecx
- movl %ecx, 38(%edx)
+ movq 38(%eax), %xmm0
+ movq %xmm0, 38(%edx)
L(bk_write_38bytes):
- movl 34(%eax), %ecx
- movl %ecx, 34(%edx)
-L(bk_write_34bytes):
- movl 30(%eax), %ecx
- movl %ecx, 30(%edx)
+ movq 30(%eax), %xmm0
+ movq %xmm0, 30(%edx)
L(bk_write_30bytes):
- movl 26(%eax), %ecx
- movl %ecx, 26(%edx)
-L(bk_write_26bytes):
- movl 22(%eax), %ecx
- movl %ecx, 22(%edx)
+ movq 22(%eax), %xmm0
+ movq %xmm0, 22(%edx)
L(bk_write_22bytes):
- movl 18(%eax), %ecx
- movl %ecx, 18(%edx)
-L(bk_write_18bytes):
- movl 14(%eax), %ecx
- movl %ecx, 14(%edx)
+ movq 14(%eax), %xmm0
+ movq %xmm0, 14(%edx)
L(bk_write_14bytes):
- movl 10(%eax), %ecx
- movl %ecx, 10(%edx)
-L(bk_write_10bytes):
- movl 6(%eax), %ecx
- movl %ecx, 6(%edx)
+ movq 6(%eax), %xmm0
+ movq %xmm0, 6(%edx)
L(bk_write_6bytes):
movl 2(%eax), %ecx
movl %ecx, 2(%edx)
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_42bytes):
+ movq 34(%eax), %xmm0
+ movq %xmm0, 34(%edx)
+L(bk_write_34bytes):
+ movq 26(%eax), %xmm0
+ movq %xmm0, 26(%edx)
+L(bk_write_26bytes):
+ movq 18(%eax), %xmm0
+ movq %xmm0, 18(%edx)
+L(bk_write_18bytes):
+ movq 10(%eax), %xmm0
+ movq %xmm0, 10(%edx)
+L(bk_write_10bytes):
+ movq 2(%eax), %xmm0
+ movq %xmm0, 2(%edx)
L(bk_write_2bytes):
movzwl (%eax), %ecx
movw %cx, (%edx)
@@ -1466,38 +1898,52 @@ L(bk_write_2bytes):
ALIGN (4)
L(bk_write_47bytes):
- movl 43(%eax), %ecx
- movl %ecx, 43(%edx)
-L(bk_write_43bytes):
- movl 39(%eax), %ecx
- movl %ecx, 39(%edx)
+ movq 39(%eax), %xmm0
+ movq %xmm0, 39(%edx)
L(bk_write_39bytes):
- movl 35(%eax), %ecx
- movl %ecx, 35(%edx)
-L(bk_write_35bytes):
- movl 31(%eax), %ecx
- movl %ecx, 31(%edx)
+ movq 31(%eax), %xmm0
+ movq %xmm0, 31(%edx)
L(bk_write_31bytes):
- movl 27(%eax), %ecx
- movl %ecx, 27(%edx)
-L(bk_write_27bytes):
- movl 23(%eax), %ecx
- movl %ecx, 23(%edx)
+ movq 23(%eax), %xmm0
+ movq %xmm0, 23(%edx)
L(bk_write_23bytes):
- movl 19(%eax), %ecx
- movl %ecx, 19(%edx)
-L(bk_write_19bytes):
- movl 15(%eax), %ecx
- movl %ecx, 15(%edx)
+ movq 15(%eax), %xmm0
+ movq %xmm0, 15(%edx)
L(bk_write_15bytes):
- movl 11(%eax), %ecx
- movl %ecx, 11(%edx)
-L(bk_write_11bytes):
- movl 7(%eax), %ecx
- movl %ecx, 7(%edx)
+ movq 7(%eax), %xmm0
+ movq %xmm0, 7(%edx)
L(bk_write_7bytes):
movl 3(%eax), %ecx
movl %ecx, 3(%edx)
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_43bytes):
+ movq 35(%eax), %xmm0
+ movq %xmm0, 35(%edx)
+L(bk_write_35bytes):
+ movq 27(%eax), %xmm0
+ movq %xmm0, 27(%edx)
+L(bk_write_27bytes):
+ movq 19(%eax), %xmm0
+ movq %xmm0, 19(%edx)
+L(bk_write_19bytes):
+ movq 11(%eax), %xmm0
+ movq %xmm0, 11(%edx)
+L(bk_write_11bytes):
+ movq 3(%eax), %xmm0
+ movq %xmm0, 3(%edx)
L(bk_write_3bytes):
movzwl 1(%eax), %ecx
movw %cx, 1(%edx)
@@ -1566,6 +2012,57 @@ L(table_48bytes_fwd):
.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
ALIGN (2)
+L(table_48bytes_fwd_align):
+ .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+ .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+ ALIGN (2)
L(shl_table):
.int JMPTBL (L(shl_0), L(shl_table))
.int JMPTBL (L(shl_1), L(shl_table))
@@ -1658,22 +2155,14 @@ L(bk_write_64bytesless):
L(bk_write_more32bytes):
/* Copy 32 bytes at a time. */
sub $32, %ecx
- movl -4(%esi), %eax
- movl %eax, -4(%edx)
- movl -8(%esi), %eax
- movl %eax, -8(%edx)
- movl -12(%esi), %eax
- movl %eax, -12(%edx)
- movl -16(%esi), %eax
- movl %eax, -16(%edx)
- movl -20(%esi), %eax
- movl %eax, -20(%edx)
- movl -24(%esi), %eax
- movl %eax, -24(%edx)
- movl -28(%esi), %eax
- movl %eax, -28(%edx)
- movl -32(%esi), %eax
- movl %eax, -32(%edx)
+ movq -8(%esi), %xmm0
+ movq %xmm0, -8(%edx)
+ movq -16(%esi), %xmm0
+ movq %xmm0, -16(%edx)
+ movq -24(%esi), %xmm0
+ movq %xmm0, -24(%edx)
+ movq -32(%esi), %xmm0
+ movq %xmm0, -32(%edx)
sub $32, %edx
sub $32, %esi
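
The tail rewrite above halves the instruction count of the fixed-size
exits: each pair of 4-byte movl round-trips becomes one 8-byte movq
through %xmm0, and the new *_align entries reached via
L(table_48bytes_fwd_align) use 16-byte movdqa once both pointers are
16-byte aligned. A rough intrinsics sketch of the two access patterns;
the helper names are invented, and the end-relative addressing mirrors
the asm:

    #include <emmintrin.h>
    #include <stddef.h>

    static void copy_tail8(unsigned char *dst, const unsigned char *src,
                           size_t len)
    {
        /* movq -8(%eax), %xmm0 ; movq %xmm0, -8(%edx) */
        __m128i v = _mm_loadl_epi64((const __m128i *)(src + len - 8));
        _mm_storel_epi64((__m128i *)(dst + len - 8), v);
    }

    static void copy_tail16_aligned(unsigned char *dst,
                                    const unsigned char *src, size_t len)
    {
        /* movdqa -16(%eax), %xmm0 ; movdqa %xmm0, -16(%edx)
           (valid only when both addresses are 16-byte aligned)  */
        __m128i v = _mm_load_si128((const __m128i *)(src + len - 16));
        _mm_store_si128((__m128i *)(dst + len - 16), v);
    }
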
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-c.c b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 000000000..44ec1a6ed
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#ifndef NOT_IN_libc
+# define MEMRCHR __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 000000000..355d498e2
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,418 @@
+/* Optimized memrchr with sse2
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+ .text
+ENTRY (MEMCHR)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ add $16, %ecx
+ add $16, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+/* The loop below starts on an aligned address.  */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ add $64, %ecx
+ add $64, %edx
+ sub %eax, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ bsr %eax, %eax
+
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches0):
+ bsr %eax, %eax
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsr %eax, %eax
+ lea 16(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsr %eax, %eax
+ lea 32(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ bsr %eax, %eax
+ lea 48(%eax, %ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ bsr %eax, %eax
+ sub $64, %edx
+ add %eax, %edx
+ jl L(return_null)
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ bsr %eax, %eax
+ sub $48, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 16(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ bsr %eax, %eax
+ sub $32, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 32(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ bsr %eax, %eax
+ sub $16, %edx
+ add %eax, %edx
+ jl L(return_null)
+ lea 48(%ecx, %eax), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+ mov %edx, %ecx
+
+ pmovmskb %xmm1, %edx
+
+ and %ecx, %edx
+ test %edx, %edx
+ jz L(return_null)
+
+ bsr %edx, %ecx
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ mov %ecx, %eax
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ jz L(return_null)
+
+ pshufd $0, %xmm1, %xmm1
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (MEMCHR)
+#endif
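
The whole scan above is built from one repeated primitive: broadcast the
target byte into an XMM register (two punpcklbw plus pshufd), pcmpeqb it
against a 16-byte block, pmovmskb the comparison into a bitmask, then
bsr the mask, since memrchr wants the highest (last) matching position.
A hedged intrinsics sketch of that 16-byte probe; the helper name is
invented and a GCC builtin stands in for bsr:

    #include <emmintrin.h>
    #include <stddef.h>

    static const unsigned char *probe16_last(const unsigned char *p, int c)
    {
        __m128i needle = _mm_set1_epi8((char)c);  /* punpcklbw x2 + pshufd */
        __m128i block  = _mm_loadu_si128((const __m128i *)p);
        /* pcmpeqb + pmovmskb: one mask bit per byte of the block.  */
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, needle));
        if (mask == 0)
            return NULL;
        /* bsr: index of the most significant set bit of the mask.  */
        int last = 31 - __builtin_clz((unsigned)mask);
        return p + last;
    }
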
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 000000000..86a0cf961
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,725 @@
+/* Optimized memrchr with sse2 without bsf
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__memrchr_sse2)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ lea 16(%ecx), %ecx
+ lea 16(%edx), %edx
+ sub %eax, %edx
+ and $-16, %ecx
+
+ .p2align 4
+/* The loop below starts on an aligned address.  */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ lea 64(%ecx), %ecx
+ lea 64(%edx), %edx
+ and $-64, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ lea 16(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ lea 32(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch):
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_8):
+ test $0x80, %al
+ jnz L(exit_8)
+ test $0x40, %al
+ jnz L(exit_7)
+ test $0x20, %al
+ jnz L(exit_6)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(exit_dispatch_high_8)
+ test $0x08, %ah
+ jnz L(exit_12)
+ test $0x04, %ah
+ jnz L(exit_11)
+ test $0x02, %ah
+ jnz L(exit_10)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high_8):
+ test $0x80, %ah
+ jnz L(exit_16)
+ test $0x40, %ah
+ jnz L(exit_15)
+ test $0x20, %ah
+ jnz L(exit_14)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_2):
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_4):
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_6):
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_7):
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_8):
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_10):
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_11):
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_12):
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_14):
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_15):
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_16):
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ lea -64(%edx), %edx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ lea -48(%edx), %edx
+ lea 16(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ lea -32(%edx), %edx
+ lea 32(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ lea -16(%edx), %edx
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch_1):
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_8):
+ test $0x80, %al
+ jnz L(exit_1_8)
+ test $0x40, %al
+ jnz L(exit_1_7)
+ test $0x20, %al
+ jnz L(exit_1_6)
+ add $4, %edx
+ jl L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high):
+ mov %ah, %al
+ and $15 << 4, %al
+ jnz L(exit_dispatch_1_high_8)
+ test $0x08, %ah
+ jnz L(exit_1_12)
+ test $0x04, %ah
+ jnz L(exit_1_11)
+ test $0x02, %ah
+ jnz L(exit_1_10)
+ add $8, %edx
+ jl L(return_null)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high_8):
+ test $0x80, %ah
+ jnz L(exit_1_16)
+ test $0x40, %ah
+ jnz L(exit_1_15)
+ test $0x20, %ah
+ jnz L(exit_1_14)
+ add $12, %edx
+ jl L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_2):
+ add $1, %edx
+ jl L(return_null)
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_3):
+ add $2, %edx
+ jl L(return_null)
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_4):
+ add $3, %edx
+ jl L(return_null)
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_6):
+ add $5, %edx
+ jl L(return_null)
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_7):
+ add $6, %edx
+ jl L(return_null)
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_8):
+ add $7, %edx
+ jl L(return_null)
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_10):
+ add $9, %edx
+ jl L(return_null)
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_11):
+ add $10, %edx
+ jl L(return_null)
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_12):
+ add $11, %edx
+ jl L(return_null)
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_14):
+ add $13, %edx
+ jl L(return_null)
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_15):
+ add $14, %edx
+ jl L(return_null)
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_16):
+ add $15, %edx
+ jl L(return_null)
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ mov %eax, %ecx
+ pmovmskb %xmm1, %eax
+
+ and %edx, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ je L(return_null)
+ punpcklbw %xmm1, %xmm1
+
+ mov %ecx, %eax
+ pshufd $0, %xmm1, %xmm1
+
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (__memrchr_sse2)
+#endif
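
This second implementation exists for CPUs where bsf/bsr are microcoded
and slow (the bit_Slow_BSF case in the dispatcher that follows): instead
of one bsr, each L(exit_dispatch*) ladder resolves the pmovmskb mask
with byte and nibble tests, high half first, so the last match still
wins. A simplified 8-bit C model of that ladder; the function name is
illustrative:

    /* Return the index of the highest set bit of a nonzero 8-bit mask,
       using only tests and branches (no bsr), as in L(exit_dispatch).  */
    static int last_set_bit8(unsigned mask)
    {
        if (mask & 0xf0) {                /* cf. "and $15 << 4, %dl" */
            if (mask & 0x80) return 7;
            if (mask & 0x40) return 6;
            if (mask & 0x20) return 5;
            return 4;
        }
        if (mask & 0x08) return 3;
        if (mask & 0x04) return 2;
        if (mask & 0x02) return 1;
        return 0;
    }
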
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr.S b/libc/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 000000000..8e5b2c50a
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,79 @@
+/* Multiple versions of memrchr
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+ .text
+ENTRY(__memrchr)
+ .type __memrchr, @gnu_indirect_function
+ pushl %ebx
+ CFI_PUSH (%ebx)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+ jz 3f
+
+ leal __memrchr_sse2@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+2: leal __memrchr_ia32@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+3: leal __memrchr_sse2_bsf@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
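
memrchr.S is the IFUNC dispatcher itself: on the first call it consults
__cpu_features and returns the address of the best variant (ia32 without
SSE2, the bsr-free sse2 version when bit_Slow_BSF is set, otherwise
sse2_bsf). A rough C equivalent using GCC's ifunc attribute; the variant
bodies are stubs and cpu_has_slow_bsf is a stand-in, since no compiler
builtin exposes that glibc-internal feature bit:

    #define _GNU_SOURCE
    #include <stddef.h>
    #include <string.h>

    typedef void *memrchr_fn(const void *, int, size_t);

    /* Stubs so the sketch links; the real variants are the asm files.  */
    static void *my_memrchr_ia32(const void *s, int c, size_t n)
    { return memrchr(s, c, n); }
    static void *my_memrchr_sse2(const void *s, int c, size_t n)
    { return memrchr(s, c, n); }
    static void *my_memrchr_sse2_bsf(const void *s, int c, size_t n)
    { return memrchr(s, c, n); }

    static int cpu_has_slow_bsf(void) { return 0; }  /* stand-in */

    static memrchr_fn *resolve_my_memrchr(void)
    {
        __builtin_cpu_init();
        if (!__builtin_cpu_supports("sse2"))
            return my_memrchr_ia32;       /* label "2:" above */
        if (cpu_has_slow_bsf())
            return my_memrchr_sse2;       /* fall-through after "1:" */
        return my_memrchr_sse2_bsf;       /* label "3:" above */
    }

    void *my_memrchr(const void *s, int c, size_t n)
        __attribute__((ifunc("resolve_my_memrchr")));
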
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 000000000..88c0e5776
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 000000000..038c74896
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 000000000..111f0dcf6
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of rawmemchr
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+ .text
+ENTRY(__rawmemchr)
+ .type __rawmemchr, @gnu_indirect_function
+ pushl %ebx
+ CFI_PUSH (%ebx)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+ jz 3f
+
+ leal __rawmemchr_sse2@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+2: leal __rawmemchr_ia32@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+
+ CFI_PUSH (%ebx)
+
+3: leal __rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax
+ popl %ebx
+ CFI_POP (%ebx)
+ ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __rawmemchr_ia32, @function; \
+ .globl __rawmemchr_ia32; \
+ .p2align 4; \
+ __rawmemchr_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with hidden functions in a shared library, since
+   they are called without setting up EBX, which the PLT used by IFUNC
+   requires. */
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 000000000..1aa544064
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
index a73b21ecc..9cc5ae8d1 100644
--- a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -40,7 +40,7 @@
# define STR1 PARMS
# define STR2 STR1+4
- .text
+ atom_text_section
ENTRY (__strchr_sse2)
ENTRANCE
diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
index 0dc651f01..ce50e0a33 100644
--- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
+++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -1,5 +1,5 @@
/* strlen with SSE2 and BSF
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -21,7 +21,6 @@
#if defined SHARED && !defined NOT_IN_libc
#include <sysdep.h>
-#include "asm-syntax.h"
#define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
index ca549bafc..91b6d799c 100644
--- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -18,31 +18,46 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#if (defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc
+/* For strlen only the SHARED version is optimized; for strcat, strncat and strnlen both the STATIC and SHARED versions are optimized.  */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc
+
# ifndef USE_AS_STRCAT
# include <sysdep.h>
-# include "asm-syntax.h"
+# define PARMS 4
+# define STR PARMS
+# define RETURN ret
-# define CFI_PUSH(REG) \
+# ifdef USE_AS_STRNLEN
+# define LEN PARMS + 8
+# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
-# define CFI_POP(REG) \
+# define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
-# define PUSH(REG) pushl REG; CFI_PUSH (REG)
-# define POP(REG) popl REG; CFI_POP (REG)
-# define PARMS 4
-# define STR PARMS
-# define ENTRANCE
-# define RETURN ret
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+# undef RETURN
+# define RETURN POP (%edi); CFI_PUSH(%edi); ret
+# endif
- .text
-ENTRY (__strlen_sse2)
- ENTRANCE
+# ifndef STRLEN
+# define STRLEN __strlen_sse2
+# endif
+
+ atom_text_section
+ENTRY (STRLEN)
mov STR(%esp), %edx
+# ifdef USE_AS_STRNLEN
+ PUSH (%edi)
+ movl LEN(%esp), %edi
+ sub $4, %edi
+ jbe L(len_less4_prolog)
+# endif
# endif
xor %eax, %eax
cmpb $0, (%edx)
@@ -53,6 +68,12 @@ ENTRY (__strlen_sse2)
jz L(exit_tail2)
cmpb $0, 3(%edx)
jz L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less8_prolog)
+# endif
+
cmpb $0, 4(%edx)
jz L(exit_tail4)
cmpb $0, 5(%edx)
@@ -61,6 +82,12 @@ ENTRY (__strlen_sse2)
jz L(exit_tail6)
cmpb $0, 7(%edx)
jz L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less12_prolog)
+# endif
+
cmpb $0, 8(%edx)
jz L(exit_tail8)
cmpb $0, 9(%edx)
@@ -69,6 +96,12 @@ ENTRY (__strlen_sse2)
jz L(exit_tail10)
cmpb $0, 11(%edx)
jz L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less16_prolog)
+# endif
+
cmpb $0, 12(%edx)
jz L(exit_tail12)
cmpb $0, 13(%edx)
@@ -77,11 +110,18 @@ ENTRY (__strlen_sse2)
jz L(exit_tail14)
cmpb $0, 15(%edx)
jz L(exit_tail15)
+
pxor %xmm0, %xmm0
- mov %edx, %eax
- lea 16(%edx), %ecx
+ lea 16(%edx), %eax
+ mov %eax, %ecx
and $-16, %eax
- add $16, %eax
+
+# ifdef USE_AS_STRNLEN
+ and $15, %edx
+ add %edx, %edi
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
@@ -97,7 +137,6 @@ ENTRY (__strlen_sse2)
lea 16(%eax), %eax
jnz L(exit)
-
pcmpeqb (%eax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
@@ -111,6 +150,11 @@ ENTRY (__strlen_sse2)
lea 16(%eax), %eax
jnz L(exit)
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
@@ -135,6 +179,11 @@ ENTRY (__strlen_sse2)
lea 16(%eax), %eax
jnz L(exit)
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
@@ -159,6 +208,11 @@ ENTRY (__strlen_sse2)
lea 16(%eax), %eax
jnz L(exit)
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
@@ -183,8 +237,20 @@ ENTRY (__strlen_sse2)
lea 16(%eax), %eax
jnz L(exit)
+# ifdef USE_AS_STRNLEN
+ mov %eax, %edx
+ and $63, %edx
+ add %edx, %edi
+# endif
+
and $-0x40, %eax
-L(aligned_64):
+
+ .p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+# endif
movaps (%eax), %xmm0
movaps 16(%eax), %xmm1
movaps 32(%eax), %xmm2
@@ -196,7 +262,7 @@ L(aligned_64):
pmovmskb %xmm2, %edx
test %edx, %edx
lea 64(%eax), %eax
- jz L(aligned_64)
+ jz L(aligned_64_loop)
pcmpeqb -64(%eax), %xmm3
pmovmskb %xmm3, %edx
@@ -223,56 +289,348 @@ L(exit):
sub %ecx, %eax
test %dl, %dl
jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_8)
test $0x01, %dl
jnz L(exit_tail0)
-
test $0x02, %dl
jnz L(exit_tail1)
-
test $0x04, %dl
jnz L(exit_tail2)
+ add $3, %eax
+ RETURN
- test $0x08, %dl
- jnz L(exit_tail3)
-
+ .p2align 4
+L(exit_8):
test $0x10, %dl
jnz L(exit_tail4)
-
test $0x20, %dl
jnz L(exit_tail5)
-
test $0x40, %dl
jnz L(exit_tail6)
add $7, %eax
-L(exit_tail0):
RETURN
+ .p2align 4
L(exit_high):
- add $8, %eax
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_high_8)
test $0x01, %dh
+ jnz L(exit_tail8)
+ test $0x02, %dh
+ jnz L(exit_tail9)
+ test $0x04, %dh
+ jnz L(exit_tail10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high_8):
+ test $0x10, %dh
+ jnz L(exit_tail12)
+ test $0x20, %dh
+ jnz L(exit_tail13)
+ test $0x40, %dh
+ jnz L(exit_tail14)
+ add $15, %eax
+L(exit_tail0):
+ RETURN
+
+# ifdef USE_AS_STRNLEN
+
+ .p2align 4
+L(len_less64):
+ pxor %xmm0, %xmm0
+ add $64, %edi
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ movl LEN(%esp), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit):
+ sub %ecx, %eax
+
+ test %dl, %dl
+ jz L(strnlen_exit_high)
+ mov %dl, %cl
+ and $15, %cl
+ jz L(strnlen_exit_8)
+ test $0x01, %dl
jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(strnlen_exit_tail1)
+ test $0x04, %dl
+ jnz L(strnlen_exit_tail2)
+ sub $4, %edi
+ jb L(return_start_len)
+ lea 3(%eax), %eax
+ RETURN
- test $0x02, %dh
- jnz L(exit_tail1)
+ .p2align 4
+L(strnlen_exit_8):
+ test $0x10, %dl
+ jnz L(strnlen_exit_tail4)
+ test $0x20, %dl
+ jnz L(strnlen_exit_tail5)
+ test $0x40, %dl
+ jnz L(strnlen_exit_tail6)
+ sub $8, %edi
+ jb L(return_start_len)
+ lea 7(%eax), %eax
+ RETURN
+ .p2align 4
+L(strnlen_exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(strnlen_exit_high_8)
+ test $0x01, %dh
+ jnz L(strnlen_exit_tail8)
+ test $0x02, %dh
+ jnz L(strnlen_exit_tail9)
test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
+ jnz L(strnlen_exit_tail10)
+ sub $12, %edi
+ jb L(return_start_len)
+ lea 11(%eax), %eax
+ RETURN
+ .p2align 4
+L(strnlen_exit_high_8):
test $0x10, %dh
- jnz L(exit_tail4)
-
+ jnz L(strnlen_exit_tail12)
test $0x20, %dh
- jnz L(exit_tail5)
-
+ jnz L(strnlen_exit_tail13)
test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
+ jnz L(strnlen_exit_tail14)
+ sub $16, %edi
+ jb L(return_start_len)
+ lea 15(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail1):
+ sub $2, %edi
+ jb L(return_start_len)
+ lea 1(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail2):
+ sub $3, %edi
+ jb L(return_start_len)
+ lea 2(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail4):
+ sub $5, %edi
+ jb L(return_start_len)
+ lea 4(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail5):
+ sub $6, %edi
+ jb L(return_start_len)
+ lea 5(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail6):
+ sub $7, %edi
+ jb L(return_start_len)
+ lea 6(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail8):
+ sub $9, %edi
+ jb L(return_start_len)
+ lea 8(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail9):
+ sub $10, %edi
+ jb L(return_start_len)
+ lea 9(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail10):
+ sub $11, %edi
+ jb L(return_start_len)
+ lea 10(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail12):
+ sub $13, %edi
+ jb L(return_start_len)
+ lea 12(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail13):
+ sub $14, %edi
+ jb L(return_start_len)
+ lea 13(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail14):
+ sub $15, %edi
+ jb L(return_start_len)
+ lea 14(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(return_start_len):
+ movl LEN(%esp), %eax
+ RETURN
+
+/* For the strnlen prolog only.  */
+
+ .p2align 4
+L(len_less4_prolog):
+ xor %eax, %eax
+
+ add $4, %edi
+ jz L(exit_tail0)
+
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmp $1, %edi
+ je L(exit_tail1)
+
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmp $2, %edi
+ je L(exit_tail2)
+
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmp $3, %edi
+ je L(exit_tail3)
+
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+ mov $4, %eax
RETURN
.p2align 4
+L(len_less8_prolog):
+ add $4, %edi
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmp $1, %edi
+ je L(exit_tail5)
+
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmp $2, %edi
+ je L(exit_tail6)
+
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmp $3, %edi
+ je L(exit_tail7)
+
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+ mov $8, %eax
+ RETURN
+
+
+ .p2align 4
+L(len_less12_prolog):
+ add $4, %edi
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmp $1, %edi
+ je L(exit_tail9)
+
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmp $2, %edi
+ je L(exit_tail10)
+
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmp $3, %edi
+ je L(exit_tail11)
+
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+ mov $12, %eax
+ RETURN
+
+ .p2align 4
+L(len_less16_prolog):
+ add $4, %edi
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmp $1, %edi
+ je L(exit_tail13)
+
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmp $2, %edi
+ je L(exit_tail14)
+
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmp $3, %edi
+ je L(exit_tail15)
+
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+ mov $16, %eax
+ RETURN
+# endif
+
+ .p2align 4
L(exit_tail1):
add $1, %eax
RETURN
@@ -332,7 +690,7 @@ L(exit_tail14):
L(exit_tail15):
add $15, %eax
# ifndef USE_AS_STRCAT
- ret
-END (__strlen_sse2)
+ RETURN
+END (STRLEN)
# endif
#endif
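
Aside on the L(exit_tail*) ladders above: pcmpeqb/pmovmskb leave a 16-bit mask in %edx with bit i set where byte i of the chunk is NUL, and the chains of `test $0x01` ... `test $0x40` over %dl and %dh are an unrolled count-trailing-zeros on that mask. A minimal C sketch of the same computation, using a GCC builtin rather than the committed branch ladder:

#include <stddef.h>

/* Index of the first NUL byte in a 16-byte chunk, given a nonzero
   pcmpeqb/pmovmskb mask; equivalent to the L(exit_tail*) ladder.  */
static size_t
first_nul_index (unsigned int mask)
{
  return (size_t) __builtin_ctz (mask);
}
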
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen-c.c b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 000000000..567af2c81
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,8 @@
+#ifndef NOT_IN_libc
+# define STRNLEN __strnlen_ia32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32);
+#endif
+
+#include "string/strnlen.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 000000000..56b6ae2a5
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen.S b/libc/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 000000000..7e542d9b7
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strnlen
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__strnlen)
+ .type __strnlen, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __strnlen_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __strnlen_sse2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
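
The gnu_indirect_function entry above runs once, at relocation time: the dynamic linker calls it and binds strnlen to whichever address it returns. A rough C equivalent of the resolver, where cpu_has_sse2 is a hypothetical stand-in for the CPUID_OFFSET+index_SSE2 test against __cpu_features:

#include <stddef.h>

extern size_t __strnlen_ia32 (const char *, size_t);
extern size_t __strnlen_sse2 (const char *, size_t);

/* Hypothetical predicate standing in for the __cpu_features bit
   test performed in the assembly resolver.  */
extern int cpu_has_sse2 (void);

static size_t
(*resolve_strnlen (void)) (const char *, size_t)
{
  return cpu_has_sse2 () ? __strnlen_sse2 : __strnlen_ia32;
}
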
diff --git a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
index 71cc69dfe..f46b17fd7 100644
--- a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -40,7 +40,7 @@
# define STR1 PARMS
# define STR2 STR1+4
- .text
+ atom_text_section
ENTRY (__strrchr_sse2)
ENTRANCE
diff --git a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
index 404a9a4d4..61c43c38d 100644
--- a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -21,7 +21,6 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
-# include "asm-syntax.h"
# define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
@@ -34,18 +33,16 @@
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
-# ifndef STRCMP
-# define STRCMP __wcscmp_sse2
-# endif
-
# define ENTRANCE PUSH(%esi); PUSH(%edi)
# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
# define PARMS 4
# define STR1 PARMS
# define STR2 STR1+4
+/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function.  */
+
.text
-ENTRY (STRCMP)
+ENTRY (__wcscmp_sse2)
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
@@ -264,20 +261,20 @@ L(continue_00_48):
test %ecx, %ecx
jnz L(less4_double_words1)
- sub (%esi), %eax
- jnz L(return)
+ cmp (%esi), %eax
+ jne L(nequal)
mov 4(%edi), %eax
- sub 4(%esi), %eax
- jnz L(return)
+ cmp 4(%esi), %eax
+ jne L(nequal)
mov 8(%edi), %eax
- sub 8(%esi), %eax
- jnz L(return)
+ cmp 8(%esi), %eax
+ jne L(nequal)
mov 12(%edi), %eax
- sub 12(%esi), %eax
- jnz L(return)
+ cmp 12(%esi), %eax
+ jne L(nequal)
movdqu 16(%esi), %xmm2
pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
@@ -381,7 +378,7 @@ L(continue_32_48):
movdqu 48(%esi), %xmm2
pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
jnz L(less4_double_words_48)
@@ -585,20 +582,20 @@ L(continue_48_00):
test %ecx, %ecx
jnz L(less4_double_words1)
- sub (%esi), %eax
- jnz L(return)
+ cmp (%esi), %eax
+ jne L(nequal)
mov 4(%edi), %eax
- sub 4(%esi), %eax
- jnz L(return)
+ cmp 4(%esi), %eax
+ jne L(nequal)
mov 8(%edi), %eax
- sub 8(%esi), %eax
- jnz L(return)
+ cmp 8(%esi), %eax
+ jne L(nequal)
mov 12(%edi), %eax
- sub 12(%esi), %eax
- jnz L(return)
+ cmp 12(%esi), %eax
+ jne L(nequal)
movdqu 16(%edi), %xmm1
pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
@@ -839,142 +836,161 @@ L(less4_double_words1):
test %ecx, %ecx
jz L(equal)
- mov 12(%esi), %edx
- mov 12(%edi), %eax
- sub %edx, %eax
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ xor %eax, %eax
RETURN
.p2align 4
L(less4_double_words):
+ xor %eax, %eax
test %dl, %dl
jz L(next_two_double_words)
and $15, %dl
jz L(second_double_word)
- mov (%edi), %eax
- sub (%esi), %eax
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(second_double_word):
- mov 4(%edi), %eax
- sub 4(%esi), %eax
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(next_two_double_words):
and $15, %dh
jz L(fourth_double_word)
- mov 8(%edi), %eax
- sub 8(%esi), %eax
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(fourth_double_word):
- mov 12(%edi), %eax
- sub 12(%esi), %eax
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(less4_double_words_16):
+ xor %eax, %eax
test %dl, %dl
jz L(next_two_double_words_16)
and $15, %dl
jz L(second_double_word_16)
- mov 16(%edi), %eax
- sub 16(%esi), %eax
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(second_double_word_16):
- mov 20(%edi), %eax
- sub 20(%esi), %eax
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(next_two_double_words_16):
and $15, %dh
jz L(fourth_double_word_16)
- mov 24(%edi), %eax
- sub 24(%esi), %eax
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(fourth_double_word_16):
- mov 28(%edi), %eax
- sub 28(%esi), %eax
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(less4_double_words_32):
+ xor %eax, %eax
test %dl, %dl
jz L(next_two_double_words_32)
and $15, %dl
jz L(second_double_word_32)
- mov 32(%edi), %eax
- sub 32(%esi), %eax
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(second_double_word_32):
- mov 36(%edi), %eax
- sub 36(%esi), %eax
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(next_two_double_words_32):
and $15, %dh
jz L(fourth_double_word_32)
- mov 40(%edi), %eax
- sub 40(%esi), %eax
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(fourth_double_word_32):
- mov 44(%edi), %eax
- sub 44(%esi), %eax
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(less4_double_words_48):
+ xor %eax, %eax
test %dl, %dl
jz L(next_two_double_words_48)
and $15, %dl
jz L(second_double_word_48)
- mov 48(%edi), %eax
- sub 48(%esi), %eax
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(second_double_word_48):
- mov 52(%edi), %eax
- sub 52(%esi), %eax
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(next_two_double_words_48):
and $15, %dh
jz L(fourth_double_word_48)
- mov 56(%edi), %eax
- sub 56(%esi), %eax
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(fourth_double_word_48):
- mov 60(%edi), %eax
- sub 60(%esi), %eax
- RETURN
-
- .p2align 4
-L(return):
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
RETURN
.p2align 4
L(nequal):
mov $1, %eax
- ja L(nequal_bigger)
+ jg L(return)
neg %eax
+ RETURN
-L(nequal_bigger):
+ .p2align 4
+L(return):
RETURN
.p2align 4
@@ -988,7 +1004,7 @@ L(equal):
.p2align 4
L(neq):
mov $1, %eax
- ja L(neq_bigger)
+ jg L(neq_bigger)
neg %eax
L(neq_bigger):
@@ -999,5 +1015,5 @@ L(eq):
xorl %eax, %eax
ret
-END (STRCMP)
+END (__wcscmp_sse2)
#endif
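
The sub-to-cmp and ja-to-jg changes in this file share one motive, spelled out by the new comment: wchar_t is a signed 32-bit type on i386, so neither the sign of a 32-bit subtraction (which can wrap) nor the unsigned `ja` condition orders wide characters correctly. A small C illustration of the failure mode, with assumed example values not taken from the commit:

#include <stdio.h>
#include <wchar.h>

int
main (void)
{
  wchar_t a = (wchar_t) 0x7fffffff;	/* large positive */
  wchar_t b = (wchar_t) -1;		/* negative */

  /* The old code's result: the 32-bit difference wraps to
     0x80000000, i.e. negative, wrongly reporting a < b.  */
  printf ("sub: %d\n", (int) ((unsigned int) a - (unsigned int) b));

  /* The new code's signed compare (cmp + jg) orders them
     correctly and prints 1.  */
  printf ("cmp: %d\n", a > b ? 1 : a < b ? -1 : 0);
  return 0;
}
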
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-c.c b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 000000000..49f32a25e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WCSLEN __wcslen_ia32
+#endif
+
+#include "wcsmbs/wcslen.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 000000000..d41d62309
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,194 @@
+/* wcslen with SSE2
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+# define STR 4
+
+ .text
+ENTRY (__wcslen_sse2)
+ mov STR(%esp), %edx
+
+ cmp $0, (%edx)
+ jz L(exit_tail0)
+ cmp $0, 4(%edx)
+ jz L(exit_tail1)
+ cmp $0, 8(%edx)
+ jz L(exit_tail2)
+ cmp $0, 12(%edx)
+ jz L(exit_tail3)
+ cmp $0, 16(%edx)
+ jz L(exit_tail4)
+ cmp $0, 20(%edx)
+ jz L(exit_tail5)
+ cmp $0, 24(%edx)
+ jz L(exit_tail6)
+ cmp $0, 28(%edx)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%edx), %eax
+ lea 16(%edx), %ecx
+ and $-16, %eax
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%eax), %eax
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 64(%eax), %eax
+ jz L(aligned_64_loop)
+
+ pcmpeqd -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 48(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea -16(%ecx), %ecx
+ jnz L(exit)
+
+ jmp L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
+ sub %ecx, %eax
+ shr $2, %eax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_1)
+ ret
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_3)
+ add $2, %eax
+ ret
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ add $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail0):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_tail1):
+ mov $1, %eax
+ ret
+
+ .p2align 4
+L(exit_tail2):
+ mov $2, %eax
+ ret
+
+ .p2align 4
+L(exit_tail3):
+ mov $3, %eax
+ ret
+
+ .p2align 4
+L(exit_tail4):
+ mov $4, %eax
+ ret
+
+ .p2align 4
+L(exit_tail5):
+ mov $5, %eax
+ ret
+
+ .p2align 4
+L(exit_tail6):
+ mov $6, %eax
+ ret
+
+ .p2align 4
+L(exit_tail7):
+ mov $7, %eax
+ ret
+
+END (__wcslen_sse2)
+#endif
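
As in the byte version, the heavy lifting here is pcmpeqd over 16-byte chunks plus a 64-byte pminub loop; the final `shr $2` converts the NUL's byte offset into a wide-character count. The function it implements is simply the following reference sketch:

#include <stddef.h>
#include <wchar.h>

/* What __wcslen_sse2 computes: the number of wide characters before
   the terminating L'\0'.  The vector code finds the NUL's byte
   offset and shifts right by 2, since sizeof (wchar_t) == 4 on
   i386.  */
static size_t
wcslen_ref (const wchar_t *s)
{
  size_t n = 0;
  while (s[n] != L'\0')
    ++n;
  return n;
}
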
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen.S b/libc/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 000000000..58670377e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,56 @@
+/* Multiple versions of wcslen
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__wcslen)
+ .type __wcslen, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wcslen_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wcslen_sse2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(__wcslen)
+
+weak_alias(__wcslen, wcslen)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 000000000..94ff6151f
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_ia32
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 000000000..1a857c7e2
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 000000000..a41ef95fc
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 000000000..5080c14ea
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,59 @@
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(wmemcmp)
+#endif
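
Unlike the two-way strnlen and wcslen resolvers, this one cascades: the SSE4.2 test is only reached when SSSE3 is present, matching the nested testl/jz structure above. Roughly, in C (has_ssse3 and has_sse4_2 are hypothetical stand-ins for the __cpu_features bit tests):

#include <stddef.h>
#include <wchar.h>

extern int __wmemcmp_ia32 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_2 (const wchar_t *, const wchar_t *, size_t);

typedef int (*wmemcmp_fn) (const wchar_t *, const wchar_t *, size_t);

/* Preference order of the resolver: SSE4.2 over SSSE3 over the
   generic ia32 version.  */
static wmemcmp_fn
pick_wmemcmp (int has_ssse3, int has_sse4_2)
{
  wmemcmp_fn fn = __wmemcmp_ia32;
  if (has_ssse3)
    {
      fn = __wmemcmp_ssse3;
      if (has_sse4_2)
	fn = __wmemcmp_sse4_2;
    }
  return fn;
}
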
diff --git a/libc/sysdeps/i386/sysdep.h b/libc/sysdeps/i386/sysdep.h
index efdc82dde..a8a9e571b 100644
--- a/libc/sysdeps/i386/sysdep.h
+++ b/libc/sysdeps/i386/sysdep.h
@@ -1,5 +1,5 @@
/* Assembler macros for i386.
- Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006
+ Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006,2011
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -167,4 +167,6 @@ __i686.get_pc_thunk.reg: \
#endif
#endif
+#define atom_text_section .section ".text.atom", "ax"
+
#endif /* __ASSEMBLER__ */