90 files changed, 5446 insertions, 819 deletions
diff --git a/libc/sysdeps/i386/bits/select.h b/libc/sysdeps/i386/bits/select.h
index ab9aa3d10..9e4c56aa8 100644
--- a/libc/sysdeps/i386/bits/select.h
+++ b/libc/sysdeps/i386/bits/select.h
@@ -48,8 +48,8 @@
 #endif	/* GNU CC */
 
 #define __FD_SET(d, set) \
-  ((void) (__FDS_BITS (set)[__FDELT (d)] |= __FDMASK (d)))
+  ((void) (__FDS_BITS (set)[__FD_ELT (d)] |= __FD_MASK (d)))
 #define __FD_CLR(d, set) \
-  ((void) (__FDS_BITS (set)[__FDELT (d)] &= ~__FDMASK (d)))
+  ((void) (__FDS_BITS (set)[__FD_ELT (d)] &= ~__FD_MASK (d)))
 #define __FD_ISSET(d, set) \
-  ((__FDS_BITS (set)[__FDELT (d)] & __FDMASK (d)) != 0)
+  ((__FDS_BITS (set)[__FD_ELT (d)] & __FD_MASK (d)) != 0)
diff --git a/libc/sysdeps/i386/configure b/libc/sysdeps/i386/configure
index e8a7970e3..cd4e627ea 100755
--- a/libc/sysdeps/i386/configure
+++ b/libc/sysdeps/i386/configure
@@ -682,6 +682,31 @@ _ACEOF
 
 fi
 
+{ $as_echo "$as_me:$LINENO: checking for FMA4 support" >&5
+$as_echo_n "checking for FMA4 support... " >&6; }
+if test "${libc_cv_cc_fma4+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  libc_cv_cc_fma4=yes
+else
+  libc_cv_cc_fma4=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_fma4" >&5
+$as_echo "$libc_cv_cc_fma4" >&6; }
+if test $libc_cv_cc_fma4 = yes; then
+  cat >>confdefs.h <<\_ACEOF
+#define HAVE_FMA4_SUPPORT 1
+_ACEOF
+
+fi
+
 { $as_echo "$as_me:$LINENO: checking for -mno-vzeroupper support" >&5
 $as_echo_n "checking for -mno-vzeroupper support... " >&6; }
 if test "${libc_cv_cc_novzeroupper+set}" = set; then
diff --git a/libc/sysdeps/i386/configure.in b/libc/sysdeps/i386/configure.in
index 67fd1d7df..5a9840e16 100644
--- a/libc/sysdeps/i386/configure.in
+++ b/libc/sysdeps/i386/configure.in
@@ -67,6 +67,17 @@ if test $libc_cv_cc_avx = yes; then
   AC_DEFINE(HAVE_AVX_SUPPORT)
 fi
 
+dnl Check if -mfma4 works.
+AC_CACHE_CHECK(for FMA4 support, libc_cv_cc_fma4, [dnl
+if AC_TRY_COMMAND([${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null]); then
+  libc_cv_cc_fma4=yes
+else
+  libc_cv_cc_fma4=no
+fi])
+if test $libc_cv_cc_fma4 = yes; then
+  AC_DEFINE(HAVE_FMA4_SUPPORT)
+fi
+
 dnl Check if -mno-vzeroupper works.
 AC_CACHE_CHECK(for -mno-vzeroupper support, libc_cv_cc_novzeroupper, [dnl
 if AC_TRY_COMMAND([${CC-cc} -mno-vzeroupper -xc /dev/null -S -o /dev/null]); then
diff --git a/libc/sysdeps/i386/dl-machine.h b/libc/sysdeps/i386/dl-machine.h
index a093d2b15..9469a2b5d 100644
--- a/libc/sysdeps/i386/dl-machine.h
+++ b/libc/sysdeps/i386/dl-machine.h
@@ -1,5 +1,5 @@
 /* Machine-dependent ELF dynamic relocation inline functions.  i386 version.
-   Copyright (C) 1995-2005, 2006, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1995-2005, 2006, 2009, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -243,18 +243,12 @@ _dl_start_user:\n\
    define the value.
    ELF_RTYPE_CLASS_NOCOPY iff TYPE should not be allowed to resolve to one
    of the main executable's symbols, as for a COPY reloc.  */
-#if !defined RTLD_BOOTSTRAP || USE___THREAD
 # define elf_machine_type_class(type) \
   ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32		      \
      || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32	      \
      || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC)		      \
     * ELF_RTYPE_CLASS_PLT)						      \
    | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY))
-#else
-# define elf_machine_type_class(type) \
-  ((((type) == R_386_JMP_SLOT) * ELF_RTYPE_CLASS_PLT)			      \
-   | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY))
-#endif
 
 /* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
 #define ELF_MACHINE_JMP_SLOT	R_386_JMP_SLOT
@@ -311,7 +305,7 @@ auto inline void
 __attribute ((always_inline))
 elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 		 const Elf32_Sym *sym, const struct r_found_version *version,
-		 void *const reloc_addr_arg)
+		 void *const reloc_addr_arg, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = reloc_addr_arg;
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -347,7 +341,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
       if (sym != NULL
 	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
 			       0)
-	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1))
+	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
+	  && __builtin_expect (!skip_ifunc, 1))
 	value = ((Elf32_Addr (*) (void)) value) ();
 
       switch (r_type)
@@ -357,44 +352,43 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 	  *reloc_addr = value;
 	  break;
 
-# if !defined RTLD_BOOTSTRAP || USE___THREAD
 	case R_386_TLS_DTPMOD32:
-#  ifdef RTLD_BOOTSTRAP
+# ifdef RTLD_BOOTSTRAP
 	  /* During startup the dynamic linker is always the module
 	     with index 1.
 	     XXX If this relocation is necessary move before RESOLVE
 	     call.  */
 	  *reloc_addr = 1;
-#  else
+# else
 	  /* Get the information from the link map returned by the
 	     resolv function.  */
 	  if (sym_map != NULL)
 	    *reloc_addr = sym_map->l_tls_modid;
-#  endif
+# endif
 	  break;
 	case R_386_TLS_DTPOFF32:
-#  ifndef RTLD_BOOTSTRAP
+# ifndef RTLD_BOOTSTRAP
 	  /* During relocation all TLS symbols are defined and used.
 	     Therefore the offset is already correct.  */
 	  if (sym != NULL)
 	    *reloc_addr = sym->st_value;
-#  endif
+# endif
 	  break;
 	case R_386_TLS_DESC:
 	  {
 	    struct tlsdesc volatile *td =
 	      (struct tlsdesc volatile *)reloc_addr;
 
-#  ifndef RTLD_BOOTSTRAP
+# ifndef RTLD_BOOTSTRAP
 	    if (! sym)
 	      td->entry = _dl_tlsdesc_undefweak;
 	    else
-#  endif
+# endif
 	      {
-#  ifndef RTLD_BOOTSTRAP
-#   ifndef SHARED
+# ifndef RTLD_BOOTSTRAP
+#  ifndef SHARED
 		CHECK_STATIC_TLS (map, sym_map);
-#   else
+#  else
 		if (!TRY_STATIC_TLS (map, sym_map))
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
@@ -402,8 +396,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 		    td->entry = _dl_tlsdesc_dynamic;
 		  }
 		else
-#   endif
 #  endif
+# endif
 		  {
 		    td->arg = (void*)(sym->st_value - sym_map->l_tls_offset
 				      + (ElfW(Word))td->arg);
@@ -426,13 +420,13 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 	      CHECK_STATIC_TLS (map, sym_map);
 	      *reloc_addr += sym_map->l_tls_offset - sym->st_value;
 	    }
-#  endif
+# endif
 	  break;
 	case R_386_TLS_TPOFF:
 	  /* The offset is negative, forward from the thread pointer.  */
-#  ifdef RTLD_BOOTSTRAP
+# ifdef RTLD_BOOTSTRAP
 	  *reloc_addr += sym->st_value - map->l_tls_offset;
-#  else
+# else
 	  /* We know the offset of object the symbol is contained in.
 	     It is a negative value which will be added to the
 	     thread pointer.  */
@@ -441,9 +435,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
 	      CHECK_STATIC_TLS (map, sym_map);
 	      *reloc_addr += sym->st_value - sym_map->l_tls_offset;
 	    }
-#  endif
+# endif
 	  break;
-# endif	/* use TLS */
 
 # ifndef RTLD_BOOTSTRAP
 	case R_386_32:
@@ -490,7 +483,7 @@ auto inline void
 __attribute__ ((always_inline))
 elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 		  const Elf32_Sym *sym, const struct r_found_version *version,
-		  void *const reloc_addr_arg)
+		  void *const reloc_addr_arg, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = reloc_addr_arg;
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -507,8 +500,8 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc,
 
       if (sym != NULL
 	  && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)
-	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC,
-			       0))
+	  && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0)
+	  && __builtin_expect (!skip_ifunc, 1))
 	value = ((Elf32_Addr (*) (void)) value) ();
 
       switch (ELF32_R_TYPE (reloc->r_info))
@@ -655,7 +648,8 @@ elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
 auto inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map,
-		      Elf32_Addr l_addr, const Elf32_Rel *reloc)
+		      Elf32_Addr l_addr, const Elf32_Rel *reloc,
+		      int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -706,19 +700,20 @@ elf_machine_lazy_rel (struct link_map *map,
 	      ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff;
 	      elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)],
 			       &map->l_versions[ndx],
-			       (void *) (l_addr + r->r_offset));
+			       (void *) (l_addr + r->r_offset), skip_ifunc);
 	    }
 # ifndef RTLD_BOOTSTRAP
 	  else
 	    elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL,
-			     (void *) (l_addr + r->r_offset));
+			     (void *) (l_addr + r->r_offset), skip_ifunc);
 # endif
 	}
     }
   else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
     {
       Elf32_Addr value = map->l_addr + *reloc_addr;
-      value = ((Elf32_Addr (*) (void)) value) ();
+      if (__builtin_expect (!skip_ifunc, 1))
+	value = ((Elf32_Addr (*) (void)) value) ();
       *reloc_addr = value;
     }
   else
@@ -730,7 +725,8 @@ elf_machine_lazy_rel (struct link_map *map,
 auto inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rela (struct link_map *map,
-		       Elf32_Addr l_addr, const Elf32_Rela *reloc)
+		       Elf32_Addr l_addr, const Elf32_Rela *reloc,
+		       int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
@@ -747,7 +743,8 @@ elf_machine_lazy_rela (struct link_map *map,
   else if (__builtin_expect (r_type == R_386_IRELATIVE, 0))
     {
       Elf32_Addr value = map->l_addr + reloc->r_addend;
-      value = ((Elf32_Addr (*) (void)) value) ();
+      if (__builtin_expect (!skip_ifunc, 1))
+	value = ((Elf32_Addr (*) (void)) value) ();
       *reloc_addr = value;
     }
   else
diff --git a/libc/sysdeps/i386/elf/configure b/libc/sysdeps/i386/elf/configure
index 7a909d9a5..88edda0a1 100755
--- a/libc/sysdeps/i386/elf/configure
+++ b/libc/sysdeps/i386/elf/configure
@@ -1,7 +1,6 @@
 # This file is generated from configure.in by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/i386/elf.
 
-if test "$usetls" != no; then
 # Check for support of thread-local storage handling in assembler and
 # linker.
 { $as_echo "$as_me:$LINENO: checking for i386 TLS support" >&5
@@ -39,12 +38,10 @@ rm -f conftest*
 fi
 { $as_echo "$as_me:$LINENO: result: $libc_cv_386_tls" >&5
 $as_echo "$libc_cv_386_tls" >&6; }
-if test $libc_cv_386_tls = yes; then
-  cat >>confdefs.h <<\_ACEOF
-#define HAVE_TLS_SUPPORT 1
-_ACEOF
-
-fi
+if test $libc_cv_386_tls = no; then
+  { { $as_echo "$as_me:$LINENO: error: the assembler must support TLS" >&5
+$as_echo "$as_me: error: the assembler must support TLS" >&2;}
+   { (exit 1); exit 1; }; }
 fi
 
 cat >>confdefs.h <<\_ACEOF
diff --git a/libc/sysdeps/i386/elf/configure.in b/libc/sysdeps/i386/elf/configure.in
index ca607adeb..0c436f3f4 100644
--- a/libc/sysdeps/i386/elf/configure.in
+++ b/libc/sysdeps/i386/elf/configure.in
@@ -1,7 +1,6 @@
 GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
 # Local configure fragment for sysdeps/i386/elf.
 
-if test "$usetls" != no; then
 # Check for support of thread-local storage handling in assembler and
 # linker.
 AC_CACHE_CHECK(for i386 TLS support, libc_cv_386_tls, [dnl
@@ -28,9 +27,8 @@ else
   libc_cv_386_tls=no
 fi
 rm -f conftest*])
-if test $libc_cv_386_tls = yes; then
-  AC_DEFINE(HAVE_TLS_SUPPORT)
-fi
+if test $libc_cv_386_tls = no; then
+  AC_MSG_ERROR([the assembler must support TLS])
 fi
 
 dnl It is always possible to access static and hidden symbols in an
diff --git a/libc/sysdeps/i386/fpu/e_acos.S b/libc/sysdeps/i386/fpu/e_acos.S
index b9d07b109..d3505baf0 100644
--- a/libc/sysdeps/i386/fpu/e_acos.S
+++ b/libc/sysdeps/i386/fpu/e_acos.S
@@ -19,3 +19,4 @@ ENTRY(__ieee754_acos)
 	fpatan				/* atan (sqrt(1 - x^2) / x) */
 	ret
 END (__ieee754_acos)
+strong_alias (__ieee754_acos, __acos_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosf.S b/libc/sysdeps/i386/fpu/e_acosf.S
index 50b13fd1b..6a843a51d 100644
--- a/libc/sysdeps/i386/fpu/e_acosf.S
+++ b/libc/sysdeps/i386/fpu/e_acosf.S
@@ -20,3 +20,4 @@ ENTRY(__ieee754_acosf)
 	fpatan
 	ret
 END (__ieee754_acosf)
+strong_alias (__ieee754_acosf, __acosf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosh.S b/libc/sysdeps/i386/fpu/e_acosh.S
index 62a232471..fc65c295c 100644
--- a/libc/sysdeps/i386/fpu/e_acosh.S
+++ b/libc/sysdeps/i386/fpu/e_acosh.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arcsinh.
-   Copyright (C) 1996, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996, 2005, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -21,12 +21,12 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
 
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -101,3 +101,4 @@ ENTRY(__ieee754_acosh)
 	fdiv	%st, %st(0)
 	ret
 END(__ieee754_acosh)
+strong_alias (__ieee754_acosh, __acosh_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acoshf.S b/libc/sysdeps/i386/fpu/e_acoshf.S
index 1906c6057..b55004b62 100644
--- a/libc/sysdeps/i386/fpu/e_acoshf.S
+++ b/libc/sysdeps/i386/fpu/e_acoshf.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arcsinh.
-   Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -21,12 +21,12 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
 
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -101,3 +101,4 @@ ENTRY(__ieee754_acoshf)
 	fdiv	%st, %st(0)
 	ret
 END(__ieee754_acoshf)
+strong_alias (__ieee754_acoshf, __acoshf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acoshl.S b/libc/sysdeps/i386/fpu/e_acoshl.S
index c7b548d25..76bc0d752 100644
--- a/libc/sysdeps/i386/fpu/e_acoshl.S
+++ b/libc/sysdeps/i386/fpu/e_acoshl.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arcsinh.
-   Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -21,12 +21,12 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
 
-	.align ALIGNARG(4)
+	.p2align 3
 	/* Please note that we use double value for 1.0.  This number
 	   has an exact representation and so we don't get accuracy
 	   problems.  The advantage is that the code is simpler.  */
@@ -108,3 +108,4 @@ ENTRY(__ieee754_acoshl)
 	fdiv	%st, %st(0)
 	ret
 END(__ieee754_acoshl)
+strong_alias (__ieee754_acoshl, __acoshl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_acosl.c b/libc/sysdeps/i386/fpu/e_acosl.c
index 0c3e03945..ec516ffca 100644
--- a/libc/sysdeps/i386/fpu/e_acosl.c
+++ b/libc/sysdeps/i386/fpu/e_acosl.c
@@ -23,3 +23,4 @@ __ieee754_acosl (long double x)
 	: "=t" (res) : "0" (x) : "st(1)");
   return res;
 }
+strong_alias (__ieee754_acosl, __acosl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_asin.S b/libc/sysdeps/i386/fpu/e_asin.S
index 945e30824..a17e922b6 100644
--- a/libc/sysdeps/i386/fpu/e_asin.S
+++ b/libc/sysdeps/i386/fpu/e_asin.S
@@ -18,3 +18,4 @@ ENTRY(__ieee754_asin)
 	fpatan
 	ret
 END (__ieee754_asin)
+strong_alias (__ieee754_asin, __asin_finite)
diff --git a/libc/sysdeps/i386/fpu/e_asinf.S b/libc/sysdeps/i386/fpu/e_asinf.S
index d450e9a74..5c1065dd4 100644
--- a/libc/sysdeps/i386/fpu/e_asinf.S
+++ b/libc/sysdeps/i386/fpu/e_asinf.S
@@ -19,3 +19,4 @@ ENTRY(__ieee754_asinf)
 	fpatan
 	ret
 END (__ieee754_asinf)
+strong_alias (__ieee754_asinf, __asinf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2.S b/libc/sysdeps/i386/fpu/e_atan2.S
index 8df04e485..e76f8e2a7 100644
--- a/libc/sysdeps/i386/fpu/e_atan2.S
+++ b/libc/sysdeps/i386/fpu/e_atan2.S
@@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2)
 	fpatan
 	ret
 END (__ieee754_atan2)
+strong_alias (__ieee754_atan2, __atan2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2f.S b/libc/sysdeps/i386/fpu/e_atan2f.S
index fc6621f18..9ffa6373b 100644
--- a/libc/sysdeps/i386/fpu/e_atan2f.S
+++ b/libc/sysdeps/i386/fpu/e_atan2f.S
@@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2f)
 	fpatan
 	ret
 END (__ieee754_atan2f)
+strong_alias (__ieee754_atan2f, __atan2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atan2l.c b/libc/sysdeps/i386/fpu/e_atan2l.c
index 19a2a6062..9f88bfcc0 100644
--- a/libc/sysdeps/i386/fpu/e_atan2l.c
+++ b/libc/sysdeps/i386/fpu/e_atan2l.c
@@ -16,3 +16,4 @@ __ieee754_atan2l (long double y, long double x)
 
   return res;
 }
+strong_alias (__ieee754_atan2l, __atan2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanh.S b/libc/sysdeps/i386/fpu/e_atanh.S
index 3566ec6ef..d7e53a288 100644
--- a/libc/sysdeps/i386/fpu/e_atanh.S
+++ b/libc/sysdeps/i386/fpu/e_atanh.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arctanh function.
-   Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -114,3 +114,4 @@ ENTRY(__ieee754_atanh)
 6:	fldl	4(%esp)
 	ret
 END(__ieee754_atanh)
+strong_alias (__ieee754_atanh, __atanh_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanhf.S b/libc/sysdeps/i386/fpu/e_atanhf.S
index 10ce6aed9..00ad9142f 100644
--- a/libc/sysdeps/i386/fpu/e_atanhf.S
+++ b/libc/sysdeps/i386/fpu/e_atanhf.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arctanh function.
-   Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -107,3 +107,4 @@ ENTRY(__ieee754_atanhf)
 5:	flds	4(%esp)
 	ret
 END(__ieee754_atanhf)
+strong_alias (__ieee754_atanhf, __atanhf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_atanhl.S b/libc/sysdeps/i386/fpu/e_atanhl.S
index 8618c3fb3..cc70e73f4 100644
--- a/libc/sysdeps/i386/fpu/e_atanhl.S
+++ b/libc/sysdeps/i386/fpu/e_atanhl.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of arctanh function.
-   Copyright (C) 1996, 1999 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1999, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -118,3 +118,4 @@ ENTRY(__ieee754_atanhl)
 6:	fldt	4(%esp)
 	ret
 END(__ieee754_atanhl)
+strong_alias (__ieee754_atanhl, __atanhl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp.S b/libc/sysdeps/i386/fpu/e_exp.S
index 4a75fa1d1..2c331d9ed 100644
--- a/libc/sysdeps/i386/fpu/e_exp.S
+++ b/libc/sysdeps/i386/fpu/e_exp.S
@@ -5,7 +5,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_exp.S,v 1.7 1996/07/03 17:31:28 jtc Exp $")
 
 /* e^x = 2^(x * log2(e)) */
 ENTRY(__ieee754_exp)
@@ -39,3 +38,19 @@ ENTRY(__ieee754_exp)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)
+	fldl2e
+	fmull	4(%esp)			/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	ret
+END(__exp_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp10.S b/libc/sysdeps/i386/fpu/e_exp10.S
index 6bfcdbb72..1e32b0784 100644
--- a/libc/sysdeps/i386/fpu/e_exp10.S
+++ b/libc/sysdeps/i386/fpu/e_exp10.S
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp10f.S b/libc/sysdeps/i386/fpu/e_exp10f.S
index 4791b99af..614496415 100644
--- a/libc/sysdeps/i386/fpu/e_exp10f.S
+++ b/libc/sysdeps/i386/fpu/e_exp10f.S
@@ -4,7 +4,7 @@
 
 #include <machine/asm.h>
 
-/* e^x = 2^(x * log2(10)) */
+/* 10^x = 2^(x * log2(10)) */
 ENTRY(__ieee754_exp10f)
 	flds	4(%esp)
 /* I added the following ugly construct because exp(+-Inf) resulted
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10f)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp10l.S b/libc/sysdeps/i386/fpu/e_exp10l.S
index 71f0da792..04ec8001d 100644
--- a/libc/sysdeps/i386/fpu/e_exp10l.S
+++ b/libc/sysdeps/i386/fpu/e_exp10l.S
@@ -4,7 +4,7 @@
 
 #include <machine/asm.h>
 
-/* e^x = 2^(x * log2l(10)) */
+/* 10^x = 2^(x * log2l(10)) */
 ENTRY(__ieee754_exp10l)
 	fldt	4(%esp)
 /* I added the following ugly construct because expl(+-Inf) resulted
@@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10l)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp10l)
+strong_alias (__ieee754_exp10l, __exp10l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2.S b/libc/sysdeps/i386/fpu/e_exp2.S
index 778c0c0eb..f802cf8b9 100644
--- a/libc/sysdeps/i386/fpu/e_exp2.S
+++ b/libc/sysdeps/i386/fpu/e_exp2.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2f.S b/libc/sysdeps/i386/fpu/e_exp2f.S
index c2d1af1af..f867d0d47 100644
--- a/libc/sysdeps/i386/fpu/e_exp2f.S
+++ b/libc/sysdeps/i386/fpu/e_exp2f.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2f)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_exp2l.S b/libc/sysdeps/i386/fpu/e_exp2l.S
index fa1fdc923..203dd0078 100644
--- a/libc/sysdeps/i386/fpu/e_exp2l.S
+++ b/libc/sysdeps/i386/fpu/e_exp2l.S
@@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2l)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_expf.S b/libc/sysdeps/i386/fpu/e_expf.S
index 5fd49b89f..4e4f6a0df 100644
--- a/libc/sysdeps/i386/fpu/e_expf.S
+++ b/libc/sysdeps/i386/fpu/e_expf.S
@@ -6,7 +6,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
 
 /* e^x = 2^(x * log2(e)) */
 ENTRY(__ieee754_expf)
@@ -40,3 +39,19 @@ ENTRY(__ieee754_expf)
 	fldz				/* Set result to 0.  */
 2:	ret
 END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)
+	fldl2e
+	fmuls	4(%esp)			/* x * log2(e) */
+	fld	%st
+	frndint				/* int(x * log2(e)) */
+	fsubr	%st,%st(1)		/* fract(x * log2(e)) */
+	fxch
+	f2xm1				/* 2^(fract(x * log2(e))) - 1 */
+	fld1
+	faddp				/* 2^(fract(x * log2(e))) */
+	fscale				/* e^x */
+	fstp	%st(1)
+	ret
+END(__expf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_expl.c b/libc/sysdeps/i386/fpu/e_expl.c
index 2240ceac4..8dc9581f7 100644
--- a/libc/sysdeps/i386/fpu/e_expl.c
+++ b/libc/sysdeps/i386/fpu/e_expl.c
@@ -63,7 +63,7 @@ __ieee754_expl (long double x)
        "fld1\n\t"               /* 4 1.0              */
        "faddp\n\t"		/* 3 2^(fract(x * log2(e))) */
        "fstp	%%st(1)\n\t"    /* 2  */
-       "fscale\n\t"	        /* 2 scale factor is st(1); e^x */
+       "fscale\n\t"		/* 2 scale factor is st(1); e^x */
        "fstp	%%st(1)\n\t"    /* 1  */
        "fstp	%%st(1)\n\t"    /* 0  */
        "jmp 2f\n\t"
@@ -75,3 +75,4 @@ __ieee754_expl (long double x)
        : "=t" (res) : "0" (x), "m" (c0), "m" (c1) : "ax", "dx");
   return res;
 }
+strong_alias (__ieee754_expl, __expl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmod.S b/libc/sysdeps/i386/fpu/e_fmod.S
index 4cf6e9205..26b3acc39 100644
--- a/libc/sysdeps/i386/fpu/e_fmod.S
+++ b/libc/sysdeps/i386/fpu/e_fmod.S
@@ -5,8 +5,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_fmod.S,v 1.4 1995/05/08 23:47:56 jtc Exp $")
-
 ENTRY(__ieee754_fmod)
 	fldl	12(%esp)
 	fldl	4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_fmod)
 	fstp	%st(1)
 	ret
 END (__ieee754_fmod)
+strong_alias (__ieee754_fmod, __fmod_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmodf.S b/libc/sysdeps/i386/fpu/e_fmodf.S
index bbce40976..ece4d9842 100644
--- a/libc/sysdeps/i386/fpu/e_fmodf.S
+++ b/libc/sysdeps/i386/fpu/e_fmodf.S
@@ -6,8 +6,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 ENTRY(__ieee754_fmodf)
 	flds	8(%esp)
 	flds	4(%esp)
@@ -18,3 +16,4 @@ ENTRY(__ieee754_fmodf)
 	fstp	%st(1)
 	ret
 END(__ieee754_fmodf)
+strong_alias (__ieee754_fmodf, __fmodf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_fmodl.c b/libc/sysdeps/i386/fpu/e_fmodl.c
index c7c9a6045..49700ae8f 100644
--- a/libc/sysdeps/i386/fpu/e_fmodl.c
+++ b/libc/sysdeps/i386/fpu/e_fmodl.c
@@ -20,3 +20,4 @@ __ieee754_fmodl (long double x, long double y)
        : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");
   return res;
 }
+strong_alias (__ieee754_fmodl, __fmodl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_hypot.S b/libc/sysdeps/i386/fpu/e_hypot.S
index 043585730..0baa011d1 100644
--- a/libc/sysdeps/i386/fpu/e_hypot.S
+++ b/libc/sysdeps/i386/fpu/e_hypot.S
@@ -1,5 +1,5 @@
 /* Compute the hypothenuse of X and Y.
-   Copyright (C) 1998 Free Software Foundation, Inc.
+   Copyright (C) 1998, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
 
@@ -58,5 +58,6 @@ ENTRY(__ieee754_hypot)
 	fxch
 5:	fstp	%st(1)
 	jmp	2b
-	
+
 END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite)
diff --git a/libc/sysdeps/i386/fpu/e_hypotf.S b/libc/sysdeps/i386/fpu/e_hypotf.S
index 5967dae21..eb95d6ee9 100644
--- a/libc/sysdeps/i386/fpu/e_hypotf.S
+++ b/libc/sysdeps/i386/fpu/e_hypotf.S
@@ -1,5 +1,5 @@
 /* Compute the hypothenuse of X and Y.
-   Copyright (C) 1998 Free Software Foundation, Inc.
+   Copyright (C) 1998, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
 
@@ -58,5 +58,6 @@ ENTRY(__ieee754_hypotf)
 	fxch
 5:	fstp	%st(1)
 	jmp	2b
-	
+
 END(__ieee754_hypotf)
+strong_alias (__ieee754_hypotf, __hypotf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log.S b/libc/sysdeps/i386/fpu/e_log.S
index ce55b7229..a2e4d89a4 100644
--- a/libc/sysdeps/i386/fpu/e_log.S
+++ b/libc/sysdeps/i386/fpu/e_log.S
@@ -7,14 +7,12 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $")
-
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -27,9 +25,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -64,3 +62,22 @@ ENTRY(__ieee754_log)
 	fstp	%st(1)
 	ret
 END (__ieee754_log)
+
+ENTRY(__log_finite)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__log_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log10.S b/libc/sysdeps/i386/fpu/e_log10.S
index 525f08c96..9d24d7402 100644
--- a/libc/sysdeps/i386/fpu/e_log10.S
+++ b/libc/sysdeps/i386/fpu/e_log10.S
@@ -7,14 +7,12 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_log10.S,v 1.4 1995/05/08 23:49:24 jtc Exp $")
-
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -27,9 +25,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -64,3 +62,4 @@ ENTRY(__ieee754_log10)
 	fstp	%st(1)
 	ret
 END (__ieee754_log10)
+strong_alias (__ieee754_log10, __log10_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log10f.S b/libc/sysdeps/i386/fpu/e_log10f.S
index da5069d58..38a4833d1 100644
--- a/libc/sysdeps/i386/fpu/e_log10f.S
+++ b/libc/sysdeps/i386/fpu/e_log10f.S
@@ -8,14 +8,12 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -65,3 +63,4 @@ ENTRY(__ieee754_log10f)
 	fstp	%st(1)
 	ret
 END (__ieee754_log10f)
+strong_alias (__ieee754_log10f, __log10f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log10l.S b/libc/sysdeps/i386/fpu/e_log10l.S
index 3811516be..88b309d53 100644
--- a/libc/sysdeps/i386/fpu/e_log10l.S
+++ b/libc/sysdeps/i386/fpu/e_log10l.S
@@ -9,14 +9,12 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -29,9 +27,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -66,3 +64,4 @@ ENTRY(__ieee754_log10l)
 	fstp	%st(1)
 	ret
 END(__ieee754_log10l)
+strong_alias (__ieee754_log10l, __log10l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2.S b/libc/sysdeps/i386/fpu/e_log2.S
index d80bf8023..88aee7f3c 100644
--- a/libc/sysdeps/i386/fpu/e_log2.S
+++ b/libc/sysdeps/i386/fpu/e_log2.S
@@ -9,11 +9,11 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2)
 	fstp	%st(1)
 	ret
 END (__ieee754_log2)
+strong_alias (__ieee754_log2, __log2_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2f.S b/libc/sysdeps/i386/fpu/e_log2f.S
index 9eb7b2a82..20144875f 100644
--- a/libc/sysdeps/i386/fpu/e_log2f.S
+++ b/libc/sysdeps/i386/fpu/e_log2f.S
@@ -9,11 +9,11 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2f)
 	fstp	%st(1)
 	ret
 END (__ieee754_log2f)
+strong_alias (__ieee754_log2f, __log2f_finite)
diff --git a/libc/sysdeps/i386/fpu/e_log2l.S b/libc/sysdeps/i386/fpu/e_log2l.S
index 9de08f5de..bc79dea2d 100644
--- a/libc/sysdeps/i386/fpu/e_log2l.S
+++ b/libc/sysdeps/i386/fpu/e_log2l.S
@@ -9,11 +9,11 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -26,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -63,3 +63,4 @@ ENTRY(__ieee754_log2l)
 	fstp	%st(1)
 	ret
 END (__ieee754_log2l)
+strong_alias (__ieee754_log2l, __log2l_finite)
diff --git a/libc/sysdeps/i386/fpu/e_logf.S b/libc/sysdeps/i386/fpu/e_logf.S
index cd4538b59..1992cc2f8 100644
--- a/libc/sysdeps/i386/fpu/e_logf.S
+++ b/libc/sysdeps/i386/fpu/e_logf.S
@@ -8,14 +8,12 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $")
-
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -65,3 +63,22 @@ ENTRY(__ieee754_logf)
 	fstp	%st(1)
 	ret
 END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_logl.S b/libc/sysdeps/i386/fpu/e_logl.S
index 551dcf1e4..bfb72a30e 100644
--- a/libc/sysdeps/i386/fpu/e_logl.S
+++ b/libc/sysdeps/i386/fpu/e_logl.S
@@ -7,15 +7,13 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
-	.align ALIGNARG(4)
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(one,@object)
 one:	.double 1.0
 	ASM_SIZE_DIRECTIVE(one)
@@ -28,9 +26,9 @@ limit:	.double 0.29
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%edx)
+# define MO(op) op##@GOTOFF(%edx)
 #else
-#define MO(op) op
+# define MO(op) op
 #endif
 
 	.text
@@ -65,3 +63,22 @@ ENTRY(__ieee754_logl)
 	fstp	%st(1)
 	ret
 END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fcompl	MO(limit)	// x-1 : x : log(2)
+	fnstsw			// x-1 : x : log(2)
+	andb	$0x45, %ah
+	jz	2b
+	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_pow.S b/libc/sysdeps/i386/fpu/e_pow.S
index 792f92690..dccc67752 100644
--- a/libc/sysdeps/i386/fpu/e_pow.S
+++ b/libc/sysdeps/i386/fpu/e_pow.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
+	.p2align 3
+	ASM_TYPE_DIRECTIVE(one,@object)
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	ASM_TYPE_DIRECTIVE(limit,@object)
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
-	.align ALIGNARG(4)
+#ifdef __ELF__
+	.section .rodata.cst16,"aM",@progbits,16
+#else
+	.text
+#endif
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(infinity,@object)
 inf_zero:
 infinity:
@@ -43,22 +58,13 @@ minfinity:
 mzero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	ASM_SIZE_DIRECTIVE(minf_mzero)
-	ASM_TYPE_DIRECTIVE(one,@object)
-one:	.double 1.0
-	ASM_SIZE_DIRECTIVE(one)
-	ASM_TYPE_DIRECTIVE(limit,@object)
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
-	ASM_TYPE_DIRECTIVE(p63,@object)
-p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
-	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 	.text
@@ -360,3 +366,4 @@ ENTRY(__ieee754_pow)
 	ret
 
 END(__ieee754_pow)
+strong_alias (__ieee754_pow, __pow_finite)
diff --git a/libc/sysdeps/i386/fpu/e_powf.S b/libc/sysdeps/i386/fpu/e_powf.S
index c91545418..99c95bbdf 100644
--- a/libc/sysdeps/i386/fpu/e_powf.S
+++ b/libc/sysdeps/i386/fpu/e_powf.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007
+   Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007, 2011
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
+	.p2align 3
+	ASM_TYPE_DIRECTIVE(one,@object)
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	ASM_TYPE_DIRECTIVE(limit,@object)
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p31,@object)
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+	ASM_SIZE_DIRECTIVE(p31)
 
-	.align ALIGNARG(4)
+#ifdef __ELF__
+	.section .rodata.cst16,"aM",@progbits,16
+#else
+	.text
+#endif
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(infinity,@object)
 inf_zero:
 infinity:
@@ -43,22 +58,13 @@ minfinity:
 mzero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	ASM_SIZE_DIRECTIVE(minf_mzero)
-	ASM_TYPE_DIRECTIVE(one,@object)
-one:	.double 1.0
-	ASM_SIZE_DIRECTIVE(one)
-	ASM_TYPE_DIRECTIVE(limit,@object)
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
-	ASM_TYPE_DIRECTIVE(p31,@object)
-p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
-	ASM_SIZE_DIRECTIVE(p31)
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 	.text
@@ -348,3 +354,4 @@ ENTRY(__ieee754_powf)
 	ret
 
 END(__ieee754_powf)
+strong_alias (__ieee754_powf, __powf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_powl.S b/libc/sysdeps/i386/fpu/e_powl.S
index 621549620..34ace3576 100644
--- a/libc/sysdeps/i386/fpu/e_powl.S
+++ b/libc/sysdeps/i386/fpu/e_powl.S
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -22,12 +22,27 @@
 #include <machine/asm.h>
 
 #ifdef __ELF__
-	.section .rodata
+	.section .rodata.cst8,"aM",@progbits,8
 #else
 	.text
 #endif
+	.p2align 3
+	ASM_TYPE_DIRECTIVE(one,@object)
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	ASM_TYPE_DIRECTIVE(limit,@object)
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
-	.align ALIGNARG(4)
+#ifdef __ELF__
+	.section .rodata.cst16,"aM",@progbits,16
+#else
+	.text
+#endif
+	.p2align 3
 	ASM_TYPE_DIRECTIVE(infinity,@object)
 inf_zero:
 infinity:
@@ -43,22 +58,13 @@ minfinity:
 mzero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	ASM_SIZE_DIRECTIVE(minf_mzero)
-	ASM_TYPE_DIRECTIVE(one,@object)
-one:	.double 1.0
-	ASM_SIZE_DIRECTIVE(one)
-	ASM_TYPE_DIRECTIVE(limit,@object)
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
-	ASM_TYPE_DIRECTIVE(p63,@object)
-p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
-	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 	.text
@@ -360,3 +366,4 @@ ENTRY(__ieee754_powl)
 	ret
 
 END(__ieee754_powl)
+strong_alias (__ieee754_powl, __powl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainder.S b/libc/sysdeps/i386/fpu/e_remainder.S
index 2f43cb894..f7867aa90 100644
--- a/libc/sysdeps/i386/fpu/e_remainder.S
+++ b/libc/sysdeps/i386/fpu/e_remainder.S
@@ -5,8 +5,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_remainder.S,v 1.4 1995/05/08 23:49:37 jtc Exp $")
-
 ENTRY(__ieee754_remainder)
 	fldl	12(%esp)
 	fldl	4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_remainder)
 	fstp	%st(1)
 	ret
 END (__ieee754_remainder)
+strong_alias (__ieee754_remainder, __remainder_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainderf.S b/libc/sysdeps/i386/fpu/e_remainderf.S
index 79f821993..cfd390bc6 100644
--- a/libc/sysdeps/i386/fpu/e_remainderf.S
+++ b/libc/sysdeps/i386/fpu/e_remainderf.S
@@ -5,8 +5,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_remainderf.S,v 1.2 1995/05/08 23:49:47 jtc Exp $")
-
 ENTRY(__ieee754_remainderf)
 	flds	8(%esp)
 	flds	4(%esp)
@@ -17,3 +15,4 @@ ENTRY(__ieee754_remainderf)
 	fstp	%st(1)
 	ret
 END (__ieee754_remainderf)
+strong_alias (__ieee754_remainderf, __remainderf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_remainderl.S b/libc/sysdeps/i386/fpu/e_remainderl.S
index 5f50b626a..5ec23a37a 100644
--- a/libc/sysdeps/i386/fpu/e_remainderl.S
+++ b/libc/sysdeps/i386/fpu/e_remainderl.S
@@ -7,8 +7,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 ENTRY(__ieee754_remainderl)
 	fldt	16(%esp)
 	fldt	4(%esp)
@@ -19,3 +17,4 @@ ENTRY(__ieee754_remainderl)
 	fstp	%st(1)
 	ret
 END (__ieee754_remainderl)
+strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_scalb.S b/libc/sysdeps/i386/fpu/e_scalb.S
index 7e334a361..0f3ec9619 100644
--- a/libc/sysdeps/i386/fpu/e_scalb.S
+++ b/libc/sysdeps/i386/fpu/e_scalb.S
@@ -7,8 +7,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $")
-
 #ifdef __ELF__
 	.section .rodata
 #else
@@ -20,18 +18,17 @@ RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $")
 zero_nan:
 	.double 0.0
 nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
 	ASM_SIZE_DIRECTIVE(zero_nan)
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 	.text
@@ -100,3 +97,4 @@ ENTRY(__ieee754_scalb)
 	fdiv	%st
 	ret
 END(__ieee754_scalb)
+strong_alias (__ieee754_scalb, __scalb_finite)
diff --git a/libc/sysdeps/i386/fpu/e_scalbf.S b/libc/sysdeps/i386/fpu/e_scalbf.S
index e99ee92cb..d11ca66d1 100644
--- a/libc/sysdeps/i386/fpu/e_scalbf.S
+++ b/libc/sysdeps/i386/fpu/e_scalbf.S
@@ -8,8 +8,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 #ifdef __ELF__
 	.section .rodata
 #else
@@ -21,18 +19,17 @@ RCSID("$NetBSD: $")
 zero_nan:
 	.double 0.0
 nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
 	ASM_SIZE_DIRECTIVE(zero_nan)
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 
@@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbf)
 	fdiv	%st
 	ret
 END(__ieee754_scalbf)
+strong_alias (__ieee754_scalbf, __scalbf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_scalbl.S b/libc/sysdeps/i386/fpu/e_scalbl.S
index 3f67d0bef..d8b216971 100644
--- a/libc/sysdeps/i386/fpu/e_scalbl.S
+++ b/libc/sysdeps/i386/fpu/e_scalbl.S
@@ -9,8 +9,6 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: $")
-
 #ifdef __ELF__
 	.section .rodata
 #else
@@ -22,18 +20,17 @@ RCSID("$NetBSD: $")
 zero_nan:
 	.double 0.0
 nan:	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
-minus_zero:
 	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
 	.byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f
 	ASM_SIZE_DIRECTIVE(zero_nan)
 
 
 #ifdef PIC
-#define MO(op) op##@GOTOFF(%ecx)
-#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
+# define MO(op) op##@GOTOFF(%ecx)
+# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
 #else
-#define MO(op) op
-#define MOX(op,x,f) op(,x,f)
+# define MO(op) op
+# define MOX(op,x,f) op(,x,f)
 #endif
 
 	.text
@@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbl)
 	fdiv	%st
 	ret
 END(__ieee754_scalbl)
+strong_alias (__ieee754_scalbl, __scalbl_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrt.S b/libc/sysdeps/i386/fpu/e_sqrt.S
index 6f253d51a..1054ba453 100644
--- a/libc/sysdeps/i386/fpu/e_sqrt.S
+++ b/libc/sysdeps/i386/fpu/e_sqrt.S
@@ -5,10 +5,9 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_sqrt.S,v 1.4 1995/05/08 23:49:57 jtc Exp $")
-
 ENTRY(__ieee754_sqrt)
 	fldl	4(%esp)
 	fsqrt
 	ret
 END (__ieee754_sqrt)
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrtf.S b/libc/sysdeps/i386/fpu/e_sqrtf.S
index 5ce1ad054..6f7e4b015 100644
--- a/libc/sysdeps/i386/fpu/e_sqrtf.S
+++ b/libc/sysdeps/i386/fpu/e_sqrtf.S
@@ -5,10 +5,9 @@
 
 #include <machine/asm.h>
 
-RCSID("$NetBSD: e_sqrtf.S,v 1.2 1995/05/08 23:50:14 jtc Exp $")
-
 ENTRY(__ieee754_sqrtf)
 	flds	4(%esp)
 	fsqrt
 	ret
 END (__ieee754_sqrtf)
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/libc/sysdeps/i386/fpu/e_sqrtl.c b/libc/sysdeps/i386/fpu/e_sqrtl.c
index 85f61bb38..41bcd7eeb 100644
--- a/libc/sysdeps/i386/fpu/e_sqrtl.c
+++ b/libc/sysdeps/i386/fpu/e_sqrtl.c
@@ -7,6 +7,7 @@
 
 #include <math_private.h>
 
+#undef __ieee754_sqrtl
 long double
 __ieee754_sqrtl (long double x)
 {
@@ -16,3 +17,4 @@ __ieee754_sqrtl (long double x)
 
   return res;
 }
+strong_alias (__ieee754_sqrtl, __sqrtl_finite)
diff --git a/libc/sysdeps/i386/fpu/libm-test-ulps b/libc/sysdeps/i386/fpu/libm-test-ulps
index 4b1a9e734..ebd46b0df 100644
--- a/libc/sysdeps/i386/fpu/libm-test-ulps
+++ b/libc/sysdeps/i386/fpu/libm-test-ulps
@@ -640,6 +640,52 @@ double: 1
 idouble: 1
 ildouble: 1
 ldouble: 1
+Test "jn (2, 2.4048255576957729) == 0.43175480701968038399746111312430703":
+float: 1
+ifloat: 1
+double: 1
+idouble: 1
+ldouble: 82
+ildouble: 82
+Test "jn (3, 2.4048255576957729) == 0.19899990535769083404042146764530813":
+ldouble: 186
+ildouble: 186
+Test "jn (4, 2.4048255576957729) == 0.647466661641779720084932282551219891E-1":
+ldouble: 185
+ildouble: 185
+Test: "jn (5, 2.4048255576957729) == 0.163892432048058525099230549946147698E-1":
+float: 1
+ifloat: 1
+double: 1
+idouble: 1
+ldouble: 249
+ildouble: 249
+Test "jn (6, 2.4048255576957729) == 0.34048184720278336646673682895929161E-2":
+float: 2
+ifloat: 2
+double: 1
+idouble: 1
+ldouble: 511
+ildouble: 511
+Test "jn (7, 2.4048255576957729) == 0.60068836573295394221291569249883076E-3":
+float: 2
+ifloat: 2
+double: 1
+idouble: 1
+ldouble: 428
+ildouble: 428
+Test "jn (8, 2.4048255576957729) == 0.92165786705344923232879022467054148E-4":
+float: 3
+ifloat: 3
+double: 1
+idouble: 1
+ldouble: 609
+ildouble: 609
+Test "jn (9, 2.4048255576957729) == 0.12517270977961513005428966643852564E-4":
+float: 4
+ifloat: 4
+ldouble: 750
+ildouble: 750
 
 # lgamma
 Test "lgamma (-0.5) == log(2*sqrt(pi))":
@@ -1168,11 +1214,11 @@ ldouble: 1
 
 Function: "jn":
 double: 5
-float: 2
+float: 4
 idouble: 5
-ifloat: 2
-ildouble: 2
-ldouble: 2
+ifloat: 4
+ildouble: 750
+ldouble: 750
 
 Function: "lgamma":
 double: 1
diff --git a/libc/sysdeps/i386/i686/fpu/e_log.S b/libc/sysdeps/i386/i686/fpu/e_log.S
new file mode 100644
index 000000000..73060b088
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_log.S
@@ -0,0 +1,29 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_log)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	ret
+END (__ieee754_log)
+
+ENTRY(__log_finite)
+	fldln2			// log(2)
+	fldl	4(%esp)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+END(__log_finite)
diff --git a/libc/sysdeps/i386/i686/fpu/e_logf.S b/libc/sysdeps/i386/i686/fpu/e_logf.S
new file mode 100644
index 000000000..6fd39d50d
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_logf.S
@@ -0,0 +1,30 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>.
+ *
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+
+	.text
+ENTRY(__ieee754_logf)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	ret
+END (__ieee754_logf)
+
+ENTRY(__logf_finite)
+	fldln2			// log(2)
+	flds	4(%esp)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+END(__logf_finite)
diff --git a/libc/sysdeps/i386/i686/fpu/e_logl.S b/libc/sysdeps/i386/i686/fpu/e_logl.S
new file mode 100644
index 000000000..4e79a5a4b
--- /dev/null
+++ b/libc/sysdeps/i386/i686/fpu/e_logl.S
@@ -0,0 +1,81 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>.
+ * Adapted for i686 instructions.
+ */
+
+#include <machine/asm.h>
+
+#ifdef __ELF__
+	.section .rodata.cst8,"aM",@progbits,8
+#else
+	.text
+#endif
+	.p2align 3
+	ASM_TYPE_DIRECTIVE(one,@object)
+one:	.double 1.0
+	ASM_SIZE_DIRECTIVE(one)
+	/* It is not important that this constant is precise.  It is only
+	   a value which is known to be on the safe side for using the
+	   fyl2xp1 instruction.  */
+	ASM_TYPE_DIRECTIVE(limit,@object)
+limit:	.double 0.29
+	ASM_SIZE_DIRECTIVE(limit)
+
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+	.text
+ENTRY(__ieee754_logl)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+	fucomi	%st
+	jp	3f
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2f
+	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+
+2:	fstp	%st(0)		// x : log(2)
+	fyl2x			// log(x)
+	ret
+
+3:	fstp	%st(1)
+	ret
+END (__ieee754_logl)
+
+ENTRY(__logl_finite)
+	fldln2			// log(2)
+	fldt	4(%esp)		// x : log(2)
+#ifdef PIC
+	LOAD_PIC_REG (dx)
+#endif
+	fld	%st		// x : x : log(2)
+	fsubl	MO(one)		// x-1 : x : log(2)
+	fld	%st		// x-1 : x-1 : x : log(2)
+	fabs			// |x-1| : x-1 : x : log(2)
+	fld	MO(limit)	// 0.29 : |x-1| : x-1 : x : log(2)
+	fcomip	%st(1)		// |x-1| : x-1 : x : log(2)
+	fstp	%st(0)		// x-1 : x : log(2)
+	jc	2b
+	fstp	%st(1)		// x-1 : log(2)
+	fyl2xp1			// log(x)
+	ret
+END(__logl_finite)
diff --git a/libc/sysdeps/i386/i686/multiarch/Makefile b/libc/sysdeps/i386/i686/multiarch/Makefile
index c89ae9247..5f1853877 100644
--- a/libc/sysdeps/i386/i686/multiarch/Makefile
+++ b/libc/sysdeps/i386/i686/multiarch/Makefile
@@ -15,7 +15,11 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
 		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
 		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
-		   wcscmp-sse2 wcscmp-c
+		   wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   strnlen-sse2 strnlen-c wcslen-sse2 wcslen-c \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 000000000..115a2192a
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,497 @@
+/* Optimized memchr with sse2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null_1)
+# endif
+	mov	%ecx, %eax
+
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)
+# endif
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi
+	add	%eax, %edx
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(return_null_1):
+	xor	%eax, %eax
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)
+# endif
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	add	%ecx, %edx
+	jle	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 000000000..63d1d5d7b
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,706 @@
+/* Optimized memchr with sse2 without bsf
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+# endif
+
+	punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi
+# else
+	mov	%ecx, %edx
+# endif
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0
+# else
+	movdqu	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+# else
+	jnz	L(match_case1_prolog)
+	lea	16(%edx), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)
+	lea	-16(%edx), %edx
+	add	%ecx, %edx
+	jle	L(return_null)
+	lea	16(%edi), %edi
+# else
+	jnz	L(match_case1_prolog1)
+	lea	16(%edx), %edx
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memchr.S b/libc/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 000000000..163a83e17
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of memchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f
+
+	leal	__memchr_sse2@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+2:	leal	__memchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+3:	leal	__memchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index b1ed778f1..1f5dbc15c 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.2
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,84 +20,97 @@
 
 #ifndef NOT_IN_libc
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP		__memcmp_sse4_2
-#endif
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
 
-#define PARMS		4
-#define BLK1		PARMS
-#define BLK2		BLK1+4
-#define LEN		BLK2+4
-#define RETURN		POP (%ebx); ret; CFI_PUSH (%ebx)
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
 
 
-#ifdef SHARED
-# define JMPTBL(I, B)	I - B
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table and adjuested EDX/ESI. Go.  */	\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define JMPTBL(I, B)	I
+	jump	table with relative offsets.  INDEX is a register contains the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	call	__i686.get_pc_thunk.bx;	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute	address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We loaded the jump table and adjuested EDX/ESI. Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
+	jump	table with relative offsets.  INDEX is a register contains the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elemnts.
+*/
 
 	.section .text.sse4.2,"ax",@progbits
 ENTRY (MEMCMP)
 	movl	BLK1(%esp), %eax
 	movl	BLK2(%esp), %edx
 	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
+# endif
+
 	pxor	%xmm0, %xmm0
 	cmp	$64, %ecx
 	ja	L(64bytesormore)
 	cmp	$8, %ecx
-	PUSH (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
 	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less8bytes):
 	mov	(%eax), %bl
 	cmpb	(%edx), %bl
@@ -141,22 +154,49 @@ L(less8bytes):
 	mov	6(%eax), %bl
 	cmpb	6(%edx), %bl
 	je	L(0bytes)
+
 L(nonzero):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(above)
 	neg	%eax
 L(above):
 	ret
 	CFI_PUSH (%ebx)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(0bytes):
-	POP (%ebx)
+	POP	(%ebx)
 	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
 	jb	L(0bytesend)
 	movzbl	(%eax), %eax
@@ -164,14 +204,14 @@ L(less1bytes):
 	sub	%edx, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(0bytesend):
 	xor	%eax, %eax
 	ret
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(64bytesormore):
-	PUSH (%ebx)
+	PUSH	(%ebx)
 	mov	%ecx, %ebx
 	mov	$64, %ecx
 	sub	$64, %ebx
@@ -208,7 +248,14 @@ L(64bytesormore_loop):
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
 
-	ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* Label needs only for table_64bytes filling */
+L(unreal_case):
+/* no code here */
+
+# endif
+	.p2align 4
 L(find_16diff):
 	sub	$16, %ecx
 L(find_32diff):
@@ -218,9 +265,9 @@ L(find_48diff):
 L(find_64diff):
 	add	%ecx, %edx
 	add	%ecx, %eax
-	jmp	L(16bytes)
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(16bytes):
 	mov	-16(%eax), %ecx
 	mov	-16(%edx), %ebx
@@ -243,8 +290,30 @@ L(4bytes):
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(49bytes):
 	movdqu	-49(%eax), %xmm1
 	movdqu	-49(%edx), %xmm2
@@ -285,7 +354,7 @@ L(5bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(50bytes):
 	mov	$-50, %ebx
 	movdqu	-50(%eax), %xmm1
@@ -330,7 +399,7 @@ L(2bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(51bytes):
 	mov	$-51, %ebx
 	movdqu	-51(%eax), %xmm1
@@ -378,8 +447,8 @@ L(1bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(52bytes):
 	movdqu	-52(%eax), %xmm1
 	movdqu	-52(%edx), %xmm2
@@ -402,13 +471,18 @@ L(20bytes):
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(53bytes):
 	movdqu	-53(%eax), %xmm1
 	movdqu	-53(%edx), %xmm2
@@ -440,7 +514,7 @@ L(21bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(54bytes):
 	movdqu	-54(%eax), %xmm1
 	movdqu	-54(%edx), %xmm2
@@ -476,7 +550,7 @@ L(22bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(55bytes):
 	movdqu	-55(%eax), %xmm1
 	movdqu	-55(%edx), %xmm2
@@ -513,8 +587,8 @@ L(23bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(56bytes):
 	movdqu	-56(%eax), %xmm1
 	movdqu	-56(%edx), %xmm2
@@ -538,18 +612,27 @@ L(24bytes):
 	jnc	L(less16bytes)
 
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(57bytes):
 	movdqu	-57(%eax), %xmm1
 	movdqu	-57(%edx), %xmm2
@@ -585,7 +668,7 @@ L(25bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(58bytes):
 	movdqu	-58(%eax), %xmm1
 	movdqu	-58(%edx), %xmm2
@@ -627,7 +710,7 @@ L(26bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(59bytes):
 	movdqu	-59(%eax), %xmm1
 	movdqu	-59(%edx), %xmm2
@@ -668,8 +751,8 @@ L(27bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(60bytes):
 	movdqu	-60(%eax), %xmm1
 	movdqu	-60(%edx), %xmm2
@@ -691,22 +774,38 @@ L(28bytes):
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
+
 	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-12(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
 	jne	L(find_diff)
+
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
+
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(61bytes):
 	movdqu	-61(%eax), %xmm1
 	movdqu	-61(%edx), %xmm2
@@ -749,7 +848,7 @@ L(29bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(62bytes):
 	movdqu	-62(%eax), %xmm1
 	movdqu	-62(%edx), %xmm2
@@ -792,7 +891,7 @@ L(30bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(63bytes):
 	movdqu	-63(%eax), %xmm1
 	movdqu	-63(%edx), %xmm2
@@ -838,8 +937,9 @@ L(31bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(64bytes):
 	movdqu	-64(%eax), %xmm1
 	movdqu	-64(%edx), %xmm2
@@ -863,28 +963,45 @@ L(32bytes):
 	jnc	L(less16bytes)
 
 	mov	-16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-16(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-12(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less16bytes):
 	add	%ebx, %eax
 	add	%ebx, %edx
@@ -910,9 +1027,35 @@ L(less16bytes):
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
+# else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
+# ifndef USE_AS_WMEMCMP
 	cmpb	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
@@ -923,17 +1066,29 @@ L(find_diff):
 	jne	L(end)
 	cmp	%bx, %cx
 L(end):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+# else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
 END (MEMCMP)
 
 	.section .rodata.sse4.2,"a",@progbits
-	ALIGN (2)
+	.p2align 2
 	.type	L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
 L(table_64bytes):
 	.int	JMPTBL (L(0bytes), L(table_64bytes))
 	.int	JMPTBL (L(1bytes), L(table_64bytes))
@@ -1000,5 +1155,72 @@ L(table_64bytes):
 	.int	JMPTBL (L(62bytes), L(table_64bytes))
 	.int	JMPTBL (L(63bytes), L(table_64bytes))
 	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.size	L(table_64bytes), .-L(table_64bytes)
+# else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif
 #endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15fe5..eab85c1de 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,47 +20,64 @@
 
 #ifndef NOT_IN_libc
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP		__memcmp_ssse3
-#endif
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+# define PARMS		4
+# define BLK1		PARMS
+# define BLK2		BLK1+4
+# define LEN		BLK2+4
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
 
-#define PARMS		4
-#define BLK1		PARMS
-#define BLK2		BLK1+4
-#define LEN		BLK2+4
-#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elemnts.
+*/
 
-	.section .text.ssse3,"ax",@progbits
+	atom_text_section
 ENTRY (MEMCMP)
 	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
 	movl	BLK1(%esp), %eax
 	cmp	$48, %ecx
 	movl	BLK2(%esp), %edx
 	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
-	PUSH (%ebx)
+# endif
+
+	PUSH	(%ebx)
 	add	%ecx, %edx
 	add	%ecx, %eax
 	jmp	L(less48bytes)
 
-	ALIGN (4)
-	CFI_POP (%ebx)
+	CFI_POP	(%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
 	jb	L(zero)
 	movb	(%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
 	neg	%eax
 L(1bytesend):
 	ret
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(zero):
-	mov	$0, %eax
+	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(48bytesormore):
-	PUSH (%ebx)
-	PUSH (%esi)
-	PUSH (%edi)
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
 	cfi_remember_state
-	movdqu    (%eax), %xmm3
-	movdqu    (%edx), %xmm0
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
 	movl	%eax, %edi
 	movl	%edx, %esi
-	pcmpeqb   %xmm0, %xmm3
-	pmovmskb  %xmm3, %edx
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
 	lea	16(%edi), %edi
 
-	sub      $0xffff, %edx
+	sub	$0xffff, %edx
 	lea	16(%esi), %esi
-	jnz	  L(less16bytes)
+	jnz	L(less16bytes)
 	mov	%edi, %edx
 	and	$0xf, %edx
 	xor	%edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
 	jz	L(shr_0)
 	xor	%edx, %esi
 
+# ifndef USE_AS_WMEMCMP
 	cmp	$8, %edx
 	jae	L(next_unaligned_table)
 	cmp	$0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
 	je	L(shr_6)
 	jmp	L(shr_7)
 
-	ALIGN (4)
+	.p2align 2
 L(next_unaligned_table):
 	cmp	$8, %edx
 	je	L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
 	cmp	$14, %edx
 	je	L(shr_14)
 	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(shr_0):
 	cmp	$80, %ecx
 	jae	L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_0_gobble):
 	lea	-48(%ecx), %ecx
 	movdqa	(%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(exit):
 	pmovmskb %xmm1, %ebx
 	sub	$0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
 	lea	-16(%esi), %esi
 	lea	-16(%edi), %edi
 	mov	%ebx, %edx
+
 L(first16bytes):
 	add	%eax, %esi
 L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
 	test	%dl, %dl
 	jz	L(next_24_bytes)
 
@@ -1492,61 +1531,61 @@ L(less16bytes):
 	test	$0x40, %dl
 	jnz	L(Byte22)
 L(Byte23):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte16):
-	movzbl	 -16(%edi), %eax
-	movzbl	 -16(%esi), %edx
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte17):
-	movzbl	 -15(%edi), %eax
-	movzbl	 -15(%esi), %edx
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte18):
-	movzbl	 -14(%edi), %eax
-	movzbl	 -14(%esi), %edx
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte19):
-	movzbl	 -13(%edi), %eax
-	movzbl	 -13(%esi), %edx
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte20):
-	movzbl	 -12(%edi), %eax
-	movzbl	 -12(%esi), %edx
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte21):
-	movzbl	 -11(%edi), %eax
-	movzbl	 -11(%esi), %edx
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte22):
-	movzbl	 -10(%edi), %eax
-	movzbl	 -10(%esi), %edx
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(next_24_bytes):
 	lea	8(%edi), %edi
 	lea	8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
 	test	$0x40, %dh
 	jnz	L(Byte22)
 
-	ALIGN (4)
+	.p2align 4
 L(Byte31):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN_END
+# else
+
+/* special for wmemcmp */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
 
 	CFI_PUSH (%ebx)
-	ALIGN (4)
+
+	.p2align 4
 L(more8bytes):
 	cmp	$16, %ecx
 	jae	L(more16bytes)
 	cmp	$8, %ecx
 	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$9, %ecx
 	je	L(9bytes)
 	cmp	$10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
 	cmp	$14, %ecx
 	je	L(14bytes)
 	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more16bytes):
 	cmp	$24, %ecx
 	jae	L(more24bytes)
 	cmp	$16, %ecx
 	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$17, %ecx
 	je	L(17bytes)
 	cmp	$18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
 	cmp	$22, %ecx
 	je	L(22bytes)
 	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more24bytes):
 	cmp	$32, %ecx
 	jae	L(more32bytes)
 	cmp	$24, %ecx
 	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$25, %ecx
 	je	L(25bytes)
 	cmp	$26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
 	cmp	$30, %ecx
 	je	L(30bytes)
 	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more32bytes):
 	cmp	$40, %ecx
 	jae	L(more40bytes)
 	cmp	$32, %ecx
 	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$33, %ecx
 	je	L(33bytes)
 	cmp	$34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
 	cmp	$38, %ecx
 	je	L(38bytes)
 	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more40bytes):
 	cmp	$40, %ecx
 	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$41, %ecx
 	je	L(41bytes)
 	cmp	$42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
 	je	L(46bytes)
 	jmp	L(47bytes)
 
-	ALIGN (4)
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-
-	ALIGN (4)
+	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
 	mov	-44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
 	cmp	%ebx, %ecx
 	mov	$0, %eax
 	jne	L(find_diff)
-	POP (%ebx)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
+# endif
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
 L(45bytes):
 	mov	-45(%eax), %ecx
 	mov	-45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
 	cmp	-1(%edx), %cl
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(46bytes):
 	mov	-46(%eax), %ecx
 	mov	-46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
 	cmp	%bh, %ch
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(47bytes):
 	movl	-47(%eax), %ecx
 	movl	-47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
 	cmpb	-1(%edx), %al
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
 	cmpb	%bl, %cl
 	jne	L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
 	cmp	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
+
+	.p2align 4
 L(end):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+# else
 
-END (MEMCMP)
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
 
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
 #endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index f64f8d214..26471fc0e 100644
--- a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -1,5 +1,5 @@
 /* memcpy with SSSE3
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -235,7 +235,7 @@ L(shl_0_end):
 	add	%edi, %edx
 	add	%edi, %eax
 	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
 	CFI_PUSH (%edi)
 L(shl_0_gobble):
@@ -385,7 +385,7 @@ L(shl_0_mem_less_32bytes):
 L(shl_0_mem_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
-	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
 	cfi_restore_state
 	cfi_remember_state
@@ -1065,38 +1065,48 @@ L(shl_15_end):
 
 	ALIGN (4)
 L(fwd_write_44bytes):
-	movl	-44(%eax), %ecx
-	movl	%ecx, -44(%edx)
-L(fwd_write_40bytes):
-	movl	-40(%eax), %ecx
-	movl	%ecx, -40(%edx)
+	movq	-44(%eax), %xmm0
+	movq	%xmm0, -44(%edx)
 L(fwd_write_36bytes):
-	movl	-36(%eax), %ecx
-	movl	%ecx, -36(%edx)
-L(fwd_write_32bytes):
-	movl	-32(%eax), %ecx
-	movl	%ecx, -32(%edx)
+	movq	-36(%eax), %xmm0
+	movq	%xmm0, -36(%edx)
 L(fwd_write_28bytes):
-	movl	-28(%eax), %ecx
-	movl	%ecx, -28(%edx)
-L(fwd_write_24bytes):
-	movl	-24(%eax), %ecx
-	movl	%ecx, -24(%edx)
+	movq	-28(%eax), %xmm0
+	movq	%xmm0, -28(%edx)
 L(fwd_write_20bytes):
-	movl	-20(%eax), %ecx
-	movl	%ecx, -20(%edx)
-L(fwd_write_16bytes):
-	movl	-16(%eax), %ecx
-	movl	%ecx, -16(%edx)
+	movq	-20(%eax), %xmm0
+	movq	%xmm0, -20(%edx)
 L(fwd_write_12bytes):
-	movl	-12(%eax), %ecx
-	movl	%ecx, -12(%edx)
-L(fwd_write_8bytes):
-	movl	-8(%eax), %ecx
-	movl	%ecx, -8(%edx)
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
 L(fwd_write_4bytes):
 	movl	-4(%eax), %ecx
 	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_40bytes):
+	movq	-40(%eax), %xmm0
+	movq	%xmm0, -40(%edx)
+L(fwd_write_32bytes):
+	movq	-32(%eax), %xmm0
+	movq	%xmm0, -32(%edx)
+L(fwd_write_24bytes):
+	movq	-24(%eax), %xmm0
+	movq	%xmm0, -24(%edx)
+L(fwd_write_16bytes):
+	movq	-16(%eax), %xmm0
+	movq	%xmm0, -16(%edx)
+L(fwd_write_8bytes):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
 L(fwd_write_0bytes):
 #ifndef USE_AS_BCOPY
 # ifdef USE_AS_MEMPCPY
@@ -1124,37 +1134,49 @@ L(fwd_write_5bytes):
 
 	ALIGN (4)
 L(fwd_write_45bytes):
-	movl	-45(%eax), %ecx
-	movl	%ecx, -45(%edx)
-L(fwd_write_41bytes):
-	movl	-41(%eax), %ecx
-	movl	%ecx, -41(%edx)
+	movq	-45(%eax), %xmm0
+	movq	%xmm0, -45(%edx)
 L(fwd_write_37bytes):
-	movl	-37(%eax), %ecx
-	movl	%ecx, -37(%edx)
-L(fwd_write_33bytes):
-	movl	-33(%eax), %ecx
-	movl	%ecx, -33(%edx)
+	movq	-37(%eax), %xmm0
+	movq	%xmm0, -37(%edx)
 L(fwd_write_29bytes):
-	movl	-29(%eax), %ecx
-	movl	%ecx, -29(%edx)
-L(fwd_write_25bytes):
-	movl	-25(%eax), %ecx
-	movl	%ecx, -25(%edx)
+	movq	-29(%eax), %xmm0
+	movq	%xmm0, -29(%edx)
 L(fwd_write_21bytes):
-	movl	-21(%eax), %ecx
-	movl	%ecx, -21(%edx)
-L(fwd_write_17bytes):
-	movl	-17(%eax), %ecx
-	movl	%ecx, -17(%edx)
+	movq	-21(%eax), %xmm0
+	movq	%xmm0, -21(%edx)
 L(fwd_write_13bytes):
-	movl	-13(%eax), %ecx
-	movl	%ecx, -13(%edx)
-L(fwd_write_9bytes):
-	movl	-9(%eax), %ecx
-	movl	%ecx, -9(%edx)
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
 	movl	-5(%eax), %ecx
 	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_41bytes):
+	movq	-41(%eax), %xmm0
+	movq	%xmm0, -41(%edx)
+L(fwd_write_33bytes):
+	movq	-33(%eax), %xmm0
+	movq	%xmm0, -33(%edx)
+L(fwd_write_25bytes):
+	movq	-25(%eax), %xmm0
+	movq	%xmm0, -25(%edx)
+L(fwd_write_17bytes):
+	movq	-17(%eax), %xmm0
+	movq	%xmm0, -17(%edx)
+L(fwd_write_9bytes):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
 L(fwd_write_1bytes):
 	movzbl	-1(%eax), %ecx
 	movb	%cl, -1(%edx)
@@ -1169,38 +1191,50 @@ L(fwd_write_1bytes):
 
 	ALIGN (4)
 L(fwd_write_46bytes):
-	movl	-46(%eax), %ecx
-	movl	%ecx, -46(%edx)
-L(fwd_write_42bytes):
-	movl	-42(%eax), %ecx
-	movl	%ecx, -42(%edx)
+	movq	-46(%eax), %xmm0
+	movq	%xmm0, -46(%edx)
 L(fwd_write_38bytes):
-	movl	-38(%eax), %ecx
-	movl	%ecx, -38(%edx)
-L(fwd_write_34bytes):
-	movl	-34(%eax), %ecx
-	movl	%ecx, -34(%edx)
+	movq	-38(%eax), %xmm0
+	movq	%xmm0, -38(%edx)
 L(fwd_write_30bytes):
-	movl	-30(%eax), %ecx
-	movl	%ecx, -30(%edx)
-L(fwd_write_26bytes):
-	movl	-26(%eax), %ecx
-	movl	%ecx, -26(%edx)
+	movq	-30(%eax), %xmm0
+	movq	%xmm0, -30(%edx)
 L(fwd_write_22bytes):
-	movl	-22(%eax), %ecx
-	movl	%ecx, -22(%edx)
-L(fwd_write_18bytes):
-	movl	-18(%eax), %ecx
-	movl	%ecx, -18(%edx)
+	movq	-22(%eax), %xmm0
+	movq	%xmm0, -22(%edx)
 L(fwd_write_14bytes):
-	movl	-14(%eax), %ecx
-	movl	%ecx, -14(%edx)
-L(fwd_write_10bytes):
-	movl	-10(%eax), %ecx
-	movl	%ecx, -10(%edx)
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
 L(fwd_write_6bytes):
 	movl	-6(%eax), %ecx
 	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_42bytes):
+	movq	-42(%eax), %xmm0
+	movq	%xmm0, -42(%edx)
+L(fwd_write_34bytes):
+	movq	-34(%eax), %xmm0
+	movq	%xmm0, -34(%edx)
+L(fwd_write_26bytes):
+	movq	-26(%eax), %xmm0
+	movq	%xmm0, -26(%edx)
+L(fwd_write_18bytes):
+	movq	-18(%eax), %xmm0
+	movq	%xmm0, -18(%edx)
+L(fwd_write_10bytes):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
 L(fwd_write_2bytes):
 	movzwl	-2(%eax), %ecx
 	movw	%cx, -2(%edx)
@@ -1215,38 +1249,52 @@ L(fwd_write_2bytes):
 
 	ALIGN (4)
 L(fwd_write_47bytes):
-	movl	-47(%eax), %ecx
-	movl	%ecx, -47(%edx)
-L(fwd_write_43bytes):
-	movl	-43(%eax), %ecx
-	movl	%ecx, -43(%edx)
+	movq	-47(%eax), %xmm0
+	movq	%xmm0, -47(%edx)
 L(fwd_write_39bytes):
-	movl	-39(%eax), %ecx
-	movl	%ecx, -39(%edx)
-L(fwd_write_35bytes):
-	movl	-35(%eax), %ecx
-	movl	%ecx, -35(%edx)
+	movq	-39(%eax), %xmm0
+	movq	%xmm0, -39(%edx)
 L(fwd_write_31bytes):
-	movl	-31(%eax), %ecx
-	movl	%ecx, -31(%edx)
-L(fwd_write_27bytes):
-	movl	-27(%eax), %ecx
-	movl	%ecx, -27(%edx)
+	movq	-31(%eax), %xmm0
+	movq	%xmm0, -31(%edx)
 L(fwd_write_23bytes):
-	movl	-23(%eax), %ecx
-	movl	%ecx, -23(%edx)
-L(fwd_write_19bytes):
-	movl	-19(%eax), %ecx
-	movl	%ecx, -19(%edx)
+	movq	-23(%eax), %xmm0
+	movq	%xmm0, -23(%edx)
 L(fwd_write_15bytes):
-	movl	-15(%eax), %ecx
-	movl	%ecx, -15(%edx)
-L(fwd_write_11bytes):
-	movl	-11(%eax), %ecx
-	movl	%ecx, -11(%edx)
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
 L(fwd_write_7bytes):
 	movl	-7(%eax), %ecx
 	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_43bytes):
+	movq	-43(%eax), %xmm0
+	movq	%xmm0, -43(%edx)
+L(fwd_write_35bytes):
+	movq	-35(%eax), %xmm0
+	movq	%xmm0, -35(%edx)
+L(fwd_write_27bytes):
+	movq	-27(%eax), %xmm0
+	movq	%xmm0, -27(%edx)
+L(fwd_write_19bytes):
+	movq	-19(%eax), %xmm0
+	movq	%xmm0, -19(%edx)
+L(fwd_write_11bytes):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
 L(fwd_write_3bytes):
 	movzwl	-3(%eax), %ecx
 	movzbl	-1(%eax), %eax
@@ -1259,6 +1307,356 @@ L(fwd_write_3bytes):
 	movl	DEST(%esp), %eax
 # endif
 #endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_40bytes_align):
+	movdqa	-40(%eax), %xmm0
+	movdqa	%xmm0, -40(%edx)
+L(fwd_write_24bytes_align):
+	movdqa	-24(%eax), %xmm0
+	movdqa	%xmm0, -24(%edx)
+L(fwd_write_8bytes_align):
+	movq	-8(%eax), %xmm0
+	movq	%xmm0, -8(%edx)
+L(fwd_write_0bytes_align):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_32bytes_align):
+	movdqa	-32(%eax), %xmm0
+	movdqa	%xmm0, -32(%edx)
+L(fwd_write_16bytes_align):
+	movdqa	-16(%eax), %xmm0
+	movdqa	%xmm0, -16(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_5bytes_align):
+	movl	-5(%eax), %ecx
+	movl	-4(%eax), %eax
+	movl	%ecx, -5(%edx)
+	movl	%eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_45bytes_align):
+	movdqa	-45(%eax), %xmm0
+	movdqa	%xmm0, -45(%edx)
+L(fwd_write_29bytes_align):
+	movdqa	-29(%eax), %xmm0
+	movdqa	%xmm0, -29(%edx)
+L(fwd_write_13bytes_align):
+	movq	-13(%eax), %xmm0
+	movq	%xmm0, -13(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_37bytes_align):
+	movdqa	-37(%eax), %xmm0
+	movdqa	%xmm0, -37(%edx)
+L(fwd_write_21bytes_align):
+	movdqa	-21(%eax), %xmm0
+	movdqa	%xmm0, -21(%edx)
+	movl	-5(%eax), %ecx
+	movl	%ecx, -5(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_41bytes_align):
+	movdqa	-41(%eax), %xmm0
+	movdqa	%xmm0, -41(%edx)
+L(fwd_write_25bytes_align):
+	movdqa	-25(%eax), %xmm0
+	movdqa	%xmm0, -25(%edx)
+L(fwd_write_9bytes_align):
+	movq	-9(%eax), %xmm0
+	movq	%xmm0, -9(%edx)
+L(fwd_write_1bytes_align):
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_33bytes_align):
+	movdqa	-33(%eax), %xmm0
+	movdqa	%xmm0, -33(%edx)
+L(fwd_write_17bytes_align):
+	movdqa	-17(%eax), %xmm0
+	movdqa	%xmm0, -17(%edx)
+	movzbl	-1(%eax), %ecx
+	movb	%cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_46bytes_align):
+	movdqa	-46(%eax), %xmm0
+	movdqa	%xmm0, -46(%edx)
+L(fwd_write_30bytes_align):
+	movdqa	-30(%eax), %xmm0
+	movdqa	%xmm0, -30(%edx)
+L(fwd_write_14bytes_align):
+	movq	-14(%eax), %xmm0
+	movq	%xmm0, -14(%edx)
+L(fwd_write_6bytes_align):
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_38bytes_align):
+	movdqa	-38(%eax), %xmm0
+	movdqa	%xmm0, -38(%edx)
+L(fwd_write_22bytes_align):
+	movdqa	-22(%eax), %xmm0
+	movdqa	%xmm0, -22(%edx)
+	movl	-6(%eax), %ecx
+	movl	%ecx, -6(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_42bytes_align):
+	movdqa	-42(%eax), %xmm0
+	movdqa	%xmm0, -42(%edx)
+L(fwd_write_26bytes_align):
+	movdqa	-26(%eax), %xmm0
+	movdqa	%xmm0, -26(%edx)
+L(fwd_write_10bytes_align):
+	movq	-10(%eax), %xmm0
+	movq	%xmm0, -10(%edx)
+L(fwd_write_2bytes_align):
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_34bytes_align):
+	movdqa	-34(%eax), %xmm0
+	movdqa	%xmm0, -34(%edx)
+L(fwd_write_18bytes_align):
+	movdqa	-18(%eax), %xmm0
+	movdqa	%xmm0, -18(%edx)
+	movzwl	-2(%eax), %ecx
+	movw	%cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_47bytes_align):
+	movdqa	-47(%eax), %xmm0
+	movdqa	%xmm0, -47(%edx)
+L(fwd_write_31bytes_align):
+	movdqa	-31(%eax), %xmm0
+	movdqa	%xmm0, -31(%edx)
+L(fwd_write_15bytes_align):
+	movq	-15(%eax), %xmm0
+	movq	%xmm0, -15(%edx)
+L(fwd_write_7bytes_align):
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_39bytes_align):
+	movdqa	-39(%eax), %xmm0
+	movdqa	%xmm0, -39(%edx)
+L(fwd_write_23bytes_align):
+	movdqa	-23(%eax), %xmm0
+	movdqa	%xmm0, -23(%edx)
+	movl	-7(%eax), %ecx
+	movl	%ecx, -7(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_43bytes_align):
+	movdqa	-43(%eax), %xmm0
+	movdqa	%xmm0, -43(%edx)
+L(fwd_write_27bytes_align):
+	movdqa	-27(%eax), %xmm0
+	movdqa	%xmm0, -27(%edx)
+L(fwd_write_11bytes_align):
+	movq	-11(%eax), %xmm0
+	movq	%xmm0, -11(%edx)
+L(fwd_write_3bytes_align):
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_35bytes_align):
+	movdqa	-35(%eax), %xmm0
+	movdqa	%xmm0, -35(%edx)
+L(fwd_write_19bytes_align):
+	movdqa	-19(%eax), %xmm0
+	movdqa	%xmm0, -19(%edx)
+	movzwl	-3(%eax), %ecx
+	movzbl	-1(%eax), %eax
+	movw	%cx, -3(%edx)
+	movb	%al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_44bytes_align):
+	movdqa	-44(%eax), %xmm0
+	movdqa	%xmm0, -44(%edx)
+L(fwd_write_28bytes_align):
+	movdqa	-28(%eax), %xmm0
+	movdqa	%xmm0, -28(%edx)
+L(fwd_write_12bytes_align):
+	movq	-12(%eax), %xmm0
+	movq	%xmm0, -12(%edx)
+L(fwd_write_4bytes_align):
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(fwd_write_36bytes_align):
+	movdqa	-36(%eax), %xmm0
+	movdqa	%xmm0, -36(%edx)
+L(fwd_write_20bytes_align):
+	movdqa	-20(%eax), %xmm0
+	movdqa	%xmm0, -20(%edx)
+	movl	-4(%eax), %ecx
+	movl	%ecx, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+	movl	%edx, %eax
+# else
+	movl	DEST(%esp), %eax
+# endif
+#endif
 	RETURN_END
 
 	cfi_restore_state
@@ -1330,35 +1728,20 @@ L(large_page_less_32bytes):
 
 	ALIGN (4)
 L(bk_write_44bytes):
-	movl	40(%eax), %ecx
-	movl	%ecx, 40(%edx)
-L(bk_write_40bytes):
-	movl	36(%eax), %ecx
-	movl	%ecx, 36(%edx)
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
 L(bk_write_36bytes):
-	movl	32(%eax), %ecx
-	movl	%ecx, 32(%edx)
-L(bk_write_32bytes):
-	movl	28(%eax), %ecx
-	movl	%ecx, 28(%edx)
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
 L(bk_write_28bytes):
-	movl	24(%eax), %ecx
-	movl	%ecx, 24(%edx)
-L(bk_write_24bytes):
-	movl	20(%eax), %ecx
-	movl	%ecx, 20(%edx)
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
 L(bk_write_20bytes):
-	movl	16(%eax), %ecx
-	movl	%ecx, 16(%edx)
-L(bk_write_16bytes):
-	movl	12(%eax), %ecx
-	movl	%ecx, 12(%edx)
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
 L(bk_write_12bytes):
-	movl	8(%eax), %ecx
-	movl	%ecx, 8(%edx)
-L(bk_write_8bytes):
-	movl	4(%eax), %ecx
-	movl	%ecx, 4(%edx)
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
 L(bk_write_4bytes):
 	movl	(%eax), %ecx
 	movl	%ecx, (%edx)
@@ -1373,36 +1756,46 @@ L(bk_write_0bytes):
 	RETURN
 
 	ALIGN (4)
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
 L(bk_write_45bytes):
-	movl	41(%eax), %ecx
-	movl	%ecx, 41(%edx)
-L(bk_write_41bytes):
-	movl	37(%eax), %ecx
-	movl	%ecx, 37(%edx)
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
 L(bk_write_37bytes):
-	movl	33(%eax), %ecx
-	movl	%ecx, 33(%edx)
-L(bk_write_33bytes):
-	movl	29(%eax), %ecx
-	movl	%ecx, 29(%edx)
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
 L(bk_write_29bytes):
-	movl	25(%eax), %ecx
-	movl	%ecx, 25(%edx)
-L(bk_write_25bytes):
-	movl	21(%eax), %ecx
-	movl	%ecx, 21(%edx)
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
 L(bk_write_21bytes):
-	movl	17(%eax), %ecx
-	movl	%ecx, 17(%edx)
-L(bk_write_17bytes):
-	movl	13(%eax), %ecx
-	movl	%ecx, 13(%edx)
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
 L(bk_write_13bytes):
-	movl	9(%eax), %ecx
-	movl	%ecx, 9(%edx)
-L(bk_write_9bytes):
-	movl	5(%eax), %ecx
-	movl	%ecx, 5(%edx)
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
 L(bk_write_5bytes):
 	movl	1(%eax), %ecx
 	movl	%ecx, 1(%edx)
@@ -1419,39 +1812,78 @@ L(bk_write_1bytes):
 	RETURN
 
 	ALIGN (4)
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
 L(bk_write_46bytes):
-	movl	42(%eax), %ecx
-	movl	%ecx, 42(%edx)
-L(bk_write_42bytes):
-	movl	38(%eax), %ecx
-	movl	%ecx, 38(%edx)
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
 L(bk_write_38bytes):
-	movl	34(%eax), %ecx
-	movl	%ecx, 34(%edx)
-L(bk_write_34bytes):
-	movl	30(%eax), %ecx
-	movl	%ecx, 30(%edx)
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
 L(bk_write_30bytes):
-	movl	26(%eax), %ecx
-	movl	%ecx, 26(%edx)
-L(bk_write_26bytes):
-	movl	22(%eax), %ecx
-	movl	%ecx, 22(%edx)
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
 L(bk_write_22bytes):
-	movl	18(%eax), %ecx
-	movl	%ecx, 18(%edx)
-L(bk_write_18bytes):
-	movl	14(%eax), %ecx
-	movl	%ecx, 14(%edx)
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
 L(bk_write_14bytes):
-	movl	10(%eax), %ecx
-	movl	%ecx, 10(%edx)
-L(bk_write_10bytes):
-	movl	6(%eax), %ecx
-	movl	%ecx, 6(%edx)
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
 L(bk_write_6bytes):
 	movl	2(%eax), %ecx
 	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
 L(bk_write_2bytes):
 	movzwl	(%eax), %ecx
 	movw	%cx, (%edx)
@@ -1466,38 +1898,52 @@ L(bk_write_2bytes):
 
 	ALIGN (4)
 L(bk_write_47bytes):
-	movl	43(%eax), %ecx
-	movl	%ecx, 43(%edx)
-L(bk_write_43bytes):
-	movl	39(%eax), %ecx
-	movl	%ecx, 39(%edx)
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
 L(bk_write_39bytes):
-	movl	35(%eax), %ecx
-	movl	%ecx, 35(%edx)
-L(bk_write_35bytes):
-	movl	31(%eax), %ecx
-	movl	%ecx, 31(%edx)
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
 L(bk_write_31bytes):
-	movl	27(%eax), %ecx
-	movl	%ecx, 27(%edx)
-L(bk_write_27bytes):
-	movl	23(%eax), %ecx
-	movl	%ecx, 23(%edx)
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
 L(bk_write_23bytes):
-	movl	19(%eax), %ecx
-	movl	%ecx, 19(%edx)
-L(bk_write_19bytes):
-	movl	15(%eax), %ecx
-	movl	%ecx, 15(%edx)
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
 L(bk_write_15bytes):
-	movl	11(%eax), %ecx
-	movl	%ecx, 11(%edx)
-L(bk_write_11bytes):
-	movl	7(%eax), %ecx
-	movl	%ecx, 7(%edx)
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
 L(bk_write_7bytes):
 	movl	3(%eax), %ecx
 	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+#ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+#endif
+	RETURN
+
+	ALIGN (4)
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
 L(bk_write_3bytes):
 	movzwl	1(%eax), %ecx
 	movw	%cx, 1(%edx)
@@ -1566,6 +2012,57 @@ L(table_48bytes_fwd):
 	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
 
 	ALIGN (2)
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	ALIGN (2)
 L(shl_table):
 	.int	JMPTBL (L(shl_0), L(shl_table))
 	.int	JMPTBL (L(shl_1), L(shl_table))
@@ -1658,22 +2155,14 @@ L(bk_write_64bytesless):
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
 	sub	$32, %ecx
-	movl	-4(%esi), %eax
-	movl	%eax, -4(%edx)
-	movl	-8(%esi), %eax
-	movl	%eax, -8(%edx)
-	movl	-12(%esi), %eax
-	movl	%eax, -12(%edx)
-	movl	-16(%esi), %eax
-	movl	%eax, -16(%edx)
-	movl	-20(%esi), %eax
-	movl	%eax, -20(%edx)
-	movl	-24(%esi), %eax
-	movl	%eax, -24(%edx)
-	movl	-28(%esi), %eax
-	movl	%eax, -28(%edx)
-	movl	-32(%esi), %eax
-	movl	%eax, -32(%edx)
+	movq	-8(%esi), %xmm0
+	movq	%xmm0, -8(%edx)
+	movq	-16(%esi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%esi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%esi), %xmm0
+	movq	%xmm0, -32(%edx)
 	sub	$32, %edx
 	sub	$32, %esi
 
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-c.c b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 000000000..44ec1a6ed
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#ifndef NOT_IN_libc
+# define MEMRCHR  __memrchr_ia32
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);
+#endif
+
+#include "string/memrchr.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 000000000..355d498e2
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,418 @@
+/* Optimized memrchr with sse2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+	.text
+ENTRY (MEMCHR)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	add	$16, %ecx
+	add	$16, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	add	$64, %ecx
+	add	$64, %edx
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	bsr	%eax, %eax
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches0):
+	bsr	%eax, %eax
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	jz	L(return_null)
+
+	pshufd	$0, %xmm1, %xmm1
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 000000000..86a0cf961
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,725 @@
+/* Optimized memrchr with sse2 without bsf
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+	atom_text_section
+ENTRY (__memrchr_sse2)
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+	mov	LEN(%esp), %edx
+
+	sub	$16, %edx
+	jbe	L(length_less16)
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0
+	pshufd	$0, %xmm1, %xmm1
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx
+	lea	16(%edx), %edx
+	sub	%eax, %edx
+	and	$-16, %ecx
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	lea	64(%ecx), %ecx
+	lea	64(%edx), %edx
+	and	$-64, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_2):
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	lea	-64(%edx), %edx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	mov	%eax, %ecx
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx
+	je	L(return_null)
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)
+
+	mov	%cl, %dh
+	add	%dl, %dh
+	and	$-16, %eax
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi
+	add	%ecx, %eax
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch
+
+	mov	%dh, %cl
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr.S b/libc/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 000000000..8e5b2c50a
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,79 @@
+/* Multiple versions of memrchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__memrchr)
+	.type	__memrchr, @gnu_indirect_function
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f
+
+	leal	__memrchr_sse2@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+2:	leal	__memrchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+3:	leal	__memrchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 000000000..88c0e5776
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2_bsf
+#include "memchr-sse2-bsf.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 000000000..038c74896
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR
+#define MEMCHR __rawmemchr_sse2
+#include "memchr-sse2.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 000000000..111f0dcf6
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of rawmemchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__rawmemchr)
+	.type	__rawmemchr, @gnu_indirect_function
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f
+
+	leal	__rawmemchr_sse2@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+2:	leal	__rawmemchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)
+
+3:	leal	__rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c
new file mode 100644
index 000000000..1aa544064
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c
@@ -0,0 +1 @@
+#include <string/strnlen.c>
diff --git a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
index a73b21ecc..9cc5ae8d1 100644
--- a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S
@@ -40,7 +40,7 @@
 # define STR1  PARMS
 # define STR2  STR1+4
 
-	.text
+	atom_text_section
 ENTRY (__strchr_sse2)
 
 	ENTRANCE
diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
index 0dc651f01..ce50e0a33 100644
--- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
+++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S
@@ -1,5 +1,5 @@
 /* strlen with SSE2 and BSF
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -21,7 +21,6 @@
 #if defined SHARED && !defined NOT_IN_libc
 
 #include <sysdep.h>
-#include "asm-syntax.h"
 
 #define CFI_PUSH(REG)						\
   cfi_adjust_cfa_offset (4);					\
diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
index ca549bafc..91b6d799c 100644
--- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S
@@ -18,31 +18,46 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#if (defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc
+/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */
+
+#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc
+
 # ifndef USE_AS_STRCAT
 
 #  include <sysdep.h>
-#  include "asm-syntax.h"
+#  define PARMS	4
+#  define STR	PARMS
+#  define RETURN	ret
 
-#  define CFI_PUSH(REG)	\
+#  ifdef USE_AS_STRNLEN
+#   define LEN	PARMS + 8
+#   define CFI_PUSH(REG)	\
 	cfi_adjust_cfa_offset (4);	\
 	cfi_rel_offset (REG, 0)
 
-#  define CFI_POP(REG)	\
+#   define CFI_POP(REG)	\
 	cfi_adjust_cfa_offset (-4);	\
 	cfi_restore (REG)
 
-#  define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#  define POP(REG)	popl REG; CFI_POP (REG)
-#  define PARMS		4
-#  define STR		PARMS
-#  define ENTRANCE
-#  define RETURN		ret
+#   define PUSH(REG)	pushl	REG;	CFI_PUSH (REG)
+#   define POP(REG)	popl	REG;	CFI_POP (REG)
+#   undef RETURN
+#   define RETURN	POP (%edi); CFI_PUSH(%edi); ret
+#  endif
 
-	.text
-ENTRY (__strlen_sse2)
-	ENTRANCE
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
 	mov	STR(%esp), %edx
+#  ifdef USE_AS_STRNLEN
+	PUSH	(%edi)
+	movl	LEN(%esp), %edi
+	sub	$4, %edi
+	jbe	L(len_less4_prolog)
+#  endif
 # endif
 	xor	%eax, %eax
 	cmpb	$0, (%edx)
@@ -53,6 +68,12 @@ ENTRY (__strlen_sse2)
 	jz	L(exit_tail2)
 	cmpb	$0, 3(%edx)
 	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less8_prolog)
+# endif
+
 	cmpb	$0, 4(%edx)
 	jz	L(exit_tail4)
 	cmpb	$0, 5(%edx)
@@ -61,6 +82,12 @@ ENTRY (__strlen_sse2)
 	jz	L(exit_tail6)
 	cmpb	$0, 7(%edx)
 	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less12_prolog)
+# endif
+
 	cmpb	$0, 8(%edx)
 	jz	L(exit_tail8)
 	cmpb	$0, 9(%edx)
@@ -69,6 +96,12 @@ ENTRY (__strlen_sse2)
 	jz	L(exit_tail10)
 	cmpb	$0, 11(%edx)
 	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %edi
+	jbe	L(len_less16_prolog)
+# endif
+
 	cmpb	$0, 12(%edx)
 	jz	L(exit_tail12)
 	cmpb	$0, 13(%edx)
@@ -77,11 +110,18 @@ ENTRY (__strlen_sse2)
 	jz	L(exit_tail14)
 	cmpb	$0, 15(%edx)
 	jz	L(exit_tail15)
+
 	pxor	%xmm0, %xmm0
-	mov	%edx, %eax
-	lea	16(%edx), %ecx
+	lea	16(%edx), %eax
+	mov	%eax, %ecx
 	and	$-16, %eax
-	add	$16, %eax
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %edx
+	add	%edx, %edi
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
 
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
@@ -97,7 +137,6 @@ ENTRY (__strlen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
-
 	pcmpeqb	(%eax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
@@ -111,6 +150,11 @@ ENTRY (__strlen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
@@ -135,6 +179,11 @@ ENTRY (__strlen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
@@ -159,6 +208,11 @@ ENTRY (__strlen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
+
 	pcmpeqb	(%eax), %xmm0
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
@@ -183,8 +237,20 @@ ENTRY (__strlen_sse2)
 	lea	16(%eax), %eax
 	jnz	L(exit)
 
+# ifdef USE_AS_STRNLEN
+	mov	%eax, %edx
+	and	$63, %edx
+	add	%edx, %edi
+# endif
+
 	and	$-0x40, %eax
-L(aligned_64):
+
+	.p2align 4
+L(aligned_64_loop):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %edi
+	jbe	L(len_less64)
+# endif
 	movaps	(%eax), %xmm0
 	movaps	16(%eax), %xmm1
 	movaps	32(%eax), %xmm2
@@ -196,7 +262,7 @@ L(aligned_64):
 	pmovmskb %xmm2, %edx
 	test	%edx, %edx
 	lea	64(%eax), %eax
-	jz	L(aligned_64)
+	jz	L(aligned_64_loop)
 
 	pcmpeqb	-64(%eax), %xmm3
 	pmovmskb %xmm3, %edx
@@ -223,56 +289,348 @@ L(exit):
 	sub	%ecx, %eax
 	test	%dl, %dl
 	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_8)
 	test	$0x01, %dl
 	jnz	L(exit_tail0)
-
 	test	$0x02, %dl
 	jnz	L(exit_tail1)
-
 	test	$0x04, %dl
 	jnz	L(exit_tail2)
+	add	$3, %eax
+	RETURN
 
-	test	$0x08, %dl
-	jnz	L(exit_tail3)
-
+	.p2align 4
+L(exit_8):
 	test	$0x10, %dl
 	jnz	L(exit_tail4)
-
 	test	$0x20, %dl
 	jnz	L(exit_tail5)
-
 	test	$0x40, %dl
 	jnz	L(exit_tail6)
 	add	$7, %eax
-L(exit_tail0):
 	RETURN
 
+	.p2align 4
 L(exit_high):
-	add	$8, %eax
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_high_8)
 	test	$0x01, %dh
+	jnz	L(exit_tail8)
+	test	$0x02, %dh
+	jnz	L(exit_tail9)
+	test	$0x04, %dh
+	jnz	L(exit_tail10)
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_high_8):
+	test	$0x10, %dh
+	jnz	L(exit_tail12)
+	test	$0x20, %dh
+	jnz	L(exit_tail13)
+	test	$0x40, %dh
+	jnz	L(exit_tail14)
+	add	$15, %eax
+L(exit_tail0):
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %edi
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %edi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%eax), %eax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	movl	LEN(%esp), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%ecx, %eax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
 	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %edi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	RETURN
 
-	test	$0x02, %dh
-	jnz	L(exit_tail1)
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %edi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	RETURN
 
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
 	test	$0x04, %dh
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dh
-	jnz	L(exit_tail3)
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %edi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	RETURN
 
+	.p2align 4
+L(strnlen_exit_high_8):
 	test	$0x10, %dh
-	jnz	L(exit_tail4)
-
+	jnz	L(strnlen_exit_tail12)
 	test	$0x20, %dh
-	jnz	L(exit_tail5)
-
+	jnz	L(strnlen_exit_tail13)
 	test	$0x40, %dh
-	jnz	L(exit_tail6)
-	add	$7, %eax
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %edi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %edi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %edi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %edi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %edi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %edi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %edi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %edi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %edi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %edi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %edi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %edi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	RETURN
+
+	.p2align 4
+L(return_start_len):
+	movl	LEN(%esp), %eax
+	RETURN
+
+/* for prolog only */
+
+	.p2align 4
+L(len_less4_prolog):
+	xor	%eax, %eax
+
+	add	$4, %edi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$1, %edi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%edx)
+	jz	L(exit_tail1)
+	cmp	$2, %edi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%edx)
+	jz	L(exit_tail2)
+	cmp	$3, %edi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%edx)
+	jz	L(exit_tail3)
+	mov	$4, %eax
 	RETURN
 
 	.p2align 4
+L(len_less8_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 4(%edx)
+	jz	L(exit_tail4)
+	cmp	$1, %edi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%edx)
+	jz	L(exit_tail5)
+	cmp	$2, %edi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%edx)
+	jz	L(exit_tail6)
+	cmp	$3, %edi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%edx)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	RETURN
+
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 8(%edx)
+	jz	L(exit_tail8)
+	cmp	$1, %edi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%edx)
+	jz	L(exit_tail9)
+	cmp	$2, %edi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%edx)
+	jz	L(exit_tail10)
+	cmp	$3, %edi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%edx)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	RETURN
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %edi
+
+	cmpb	$0, 12(%edx)
+	jz	L(exit_tail12)
+	cmp	$1, %edi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%edx)
+	jz	L(exit_tail13)
+	cmp	$2, %edi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%edx)
+	jz	L(exit_tail14)
+	cmp	$3, %edi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%edx)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	RETURN
+# endif
+
+	.p2align 4
 L(exit_tail1):
 	add	$1, %eax
 	RETURN
@@ -332,7 +690,7 @@ L(exit_tail14):
 L(exit_tail15):
 	add	$15, %eax
 # ifndef USE_AS_STRCAT
-	ret
-END (__strlen_sse2)
+	RETURN
+END (STRLEN)
 # endif
 #endif
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen-c.c b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c
new file mode 100644
index 000000000..567af2c81
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c
@@ -0,0 +1,8 @@
+#ifndef NOT_IN_libc
+# define STRNLEN  __strnlen_ia32
+# undef libc_hidden_builtin_def
+# define libc_hidden_def(name)  \
+    __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32);
+#endif
+
+#include "string/strnlen.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S
new file mode 100644
index 000000000..56b6ae2a5
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2
+#include "strlen-sse2.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen.S b/libc/sysdeps/i386/i686/multiarch/strnlen.S
new file mode 100644
index 000000000..7e542d9b7
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/strnlen.S
@@ -0,0 +1,56 @@
+/* Multiple versions of strnlen
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__strnlen_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__strnlen_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(__strnlen)
+
+weak_alias(__strnlen, strnlen)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
index 71cc69dfe..f46b17fd7 100644
--- a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S
@@ -40,7 +40,7 @@
 # define STR1  PARMS
 # define STR2  STR1+4
 
-	.text
+	atom_text_section
 ENTRY (__strrchr_sse2)
 
 	ENTRANCE
diff --git a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
index 404a9a4d4..61c43c38d 100644
--- a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
+++ b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S
@@ -21,7 +21,6 @@
 #ifndef NOT_IN_libc
 
 # include <sysdep.h>
-# include "asm-syntax.h"
 
 # define CFI_PUSH(REG)	\
 	cfi_adjust_cfa_offset (4);	\
@@ -34,18 +33,16 @@
 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
 # define POP(REG) popl REG; CFI_POP (REG)
 
-# ifndef STRCMP
-# define STRCMP __wcscmp_sse2
-# endif
-
 # define ENTRANCE PUSH(%esi); PUSH(%edi)
 # define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
 # define PARMS  4
 # define STR1  PARMS
 # define STR2  STR1+4
 
+/* Note: wcscmp uses signed comparison, not unsugned as in strcmp function. */
+
 	.text
-ENTRY (STRCMP)
+ENTRY (__wcscmp_sse2)
 /*
 	* This implementation uses SSE to compare up to 16 bytes at a time.
 */
@@ -264,20 +261,20 @@ L(continue_00_48):
 	test	%ecx, %ecx
 	jnz	L(less4_double_words1)
 
-	sub	(%esi), %eax
-	jnz	L(return)
+	cmp	(%esi), %eax
+	jne	L(nequal)
 
 	mov	4(%edi), %eax
-	sub	4(%esi), %eax
-	jnz	L(return)
+	cmp	4(%esi), %eax
+	jne	L(nequal)
 
 	mov	8(%edi), %eax
-	sub	8(%esi), %eax
-	jnz	L(return)
+	cmp	8(%esi), %eax
+	jne	L(nequal)
 
 	mov	12(%edi), %eax
-	sub	12(%esi), %eax
-	jnz	L(return)
+	cmp	12(%esi), %eax
+	jne	L(nequal)
 
 	movdqu	16(%esi), %xmm2
 	pcmpeqd	%xmm2, %xmm0		/* Any null double_word? */
@@ -381,7 +378,7 @@ L(continue_32_48):
 	movdqu	48(%esi), %xmm2
 	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
 	pcmpeqd	%xmm2, %xmm1		/* compare first 4 double_words for equality */
-	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 4 double_words are same, edx == 0xffff */
 	jnz	L(less4_double_words_48)
@@ -585,20 +582,20 @@ L(continue_48_00):
 	test	%ecx, %ecx
 	jnz	L(less4_double_words1)
 
-	sub	(%esi), %eax
-	jnz	L(return)
+	cmp	(%esi), %eax
+	jne	L(nequal)
 
 	mov	4(%edi), %eax
-	sub	4(%esi), %eax
-	jnz	L(return)
+	cmp	4(%esi), %eax
+	jne	L(nequal)
 
 	mov	8(%edi), %eax
-	sub	8(%esi), %eax
-	jnz	L(return)
+	cmp	8(%esi), %eax
+	jne	L(nequal)
 
 	mov	12(%edi), %eax
-	sub	12(%esi), %eax
-	jnz	L(return)
+	cmp	12(%esi), %eax
+	jne	L(nequal)
 
 	movdqu	16(%edi), %xmm1
 	pcmpeqd	%xmm1, %xmm0		/* Any null double_word? */
@@ -839,142 +836,161 @@ L(less4_double_words1):
 	test	%ecx, %ecx
 	jz	L(equal)
 
-	mov	12(%esi), %edx
-	mov	12(%edi), %eax
-	sub	%edx, %eax
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
+	xor	%eax, %eax
 	RETURN
 
 	.p2align 4
 L(less4_double_words):
+	xor	%eax, %eax
 	test	%dl, %dl
 	jz	L(next_two_double_words)
 	and	$15, %dl
 	jz	L(second_double_word)
-	mov	(%edi), %eax
-	sub	(%esi), %eax
+	mov	(%esi), %ecx
+	cmp	%ecx, (%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(second_double_word):
-	mov	4(%edi), %eax
-	sub	4(%esi), %eax
+	mov	4(%esi), %ecx
+	cmp	%ecx, 4(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(next_two_double_words):
 	and	$15, %dh
 	jz	L(fourth_double_word)
-	mov	8(%edi), %eax
-	sub	8(%esi), %eax
+	mov	8(%esi), %ecx
+	cmp	%ecx, 8(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(fourth_double_word):
-	mov	12(%edi), %eax
-	sub	12(%esi), %eax
+	mov	12(%esi), %ecx
+	cmp	%ecx, 12(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(less4_double_words_16):
+	xor	%eax, %eax
 	test	%dl, %dl
 	jz	L(next_two_double_words_16)
 	and	$15, %dl
 	jz	L(second_double_word_16)
-	mov	16(%edi), %eax
-	sub	16(%esi), %eax
+	mov	16(%esi), %ecx
+	cmp	%ecx, 16(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(second_double_word_16):
-	mov	20(%edi), %eax
-	sub	20(%esi), %eax
+	mov	20(%esi), %ecx
+	cmp	%ecx, 20(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(next_two_double_words_16):
 	and	$15, %dh
 	jz	L(fourth_double_word_16)
-	mov	24(%edi), %eax
-	sub	24(%esi), %eax
+	mov	24(%esi), %ecx
+	cmp	%ecx, 24(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(fourth_double_word_16):
-	mov	28(%edi), %eax
-	sub	28(%esi), %eax
+	mov	28(%esi), %ecx
+	cmp	%ecx, 28(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(less4_double_words_32):
+	xor	%eax, %eax
 	test	%dl, %dl
 	jz	L(next_two_double_words_32)
 	and	$15, %dl
 	jz	L(second_double_word_32)
-	mov	32(%edi), %eax
-	sub	32(%esi), %eax
+	mov	32(%esi), %ecx
+	cmp	%ecx, 32(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(second_double_word_32):
-	mov	36(%edi), %eax
-	sub	36(%esi), %eax
+	mov	36(%esi), %ecx
+	cmp	%ecx, 36(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(next_two_double_words_32):
 	and	$15, %dh
 	jz	L(fourth_double_word_32)
-	mov	40(%edi), %eax
-	sub	40(%esi), %eax
+	mov	40(%esi), %ecx
+	cmp	%ecx, 40(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(fourth_double_word_32):
-	mov	44(%edi), %eax
-	sub	44(%esi), %eax
+	mov	44(%esi), %ecx
+	cmp	%ecx, 44(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(less4_double_words_48):
+	xor	%eax, %eax
 	test	%dl, %dl
 	jz	L(next_two_double_words_48)
 	and	$15, %dl
 	jz	L(second_double_word_48)
-	mov	48(%edi), %eax
-	sub	48(%esi), %eax
+	mov	48(%esi), %ecx
+	cmp	%ecx, 48(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(second_double_word_48):
-	mov	52(%edi), %eax
-	sub	52(%esi), %eax
+	mov	52(%esi), %ecx
+	cmp	%ecx, 52(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(next_two_double_words_48):
 	and	$15, %dh
 	jz	L(fourth_double_word_48)
-	mov	56(%edi), %eax
-	sub	56(%esi), %eax
+	mov	56(%esi), %ecx
+	cmp	%ecx, 56(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(fourth_double_word_48):
-	mov	60(%edi), %eax
-	sub	60(%esi), %eax
-	RETURN
-
-	.p2align 4
-L(return):
+	mov	60(%esi), %ecx
+	cmp	%ecx, 60(%edi)
+	jne	L(nequal)
 	RETURN
 
 	.p2align 4
 L(nequal):
 	mov	$1, %eax
-	ja	L(nequal_bigger)
+	jg	L(return)
 	neg	%eax
+	RETURN
 
-L(nequal_bigger):
+	.p2align 4
+L(return):
 	RETURN
 
 	.p2align 4
@@ -988,7 +1004,7 @@ L(equal):
 	.p2align 4
 L(neq):
 	mov	$1, %eax
-	ja	L(neq_bigger)
+	jg	L(neq_bigger)
 	neg	%eax
 
 L(neq_bigger):
@@ -999,5 +1015,5 @@ L(eq):
 	xorl	%eax, %eax
 	ret
 
-END (STRCMP)
+END (__wcscmp_sse2)
 #endif
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-c.c b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c
new file mode 100644
index 000000000..49f32a25e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WCSLEN  __wcslen_ia32
+#endif
+
+#include "wcsmbs/wcslen.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S
new file mode 100644
index 000000000..d41d62309
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S
@@ -0,0 +1,194 @@
+/* wcslen with SSE2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+# define STR	4
+
+	.text
+ENTRY (__wcslen_sse2)
+	mov	STR(%esp), %edx
+
+	cmp	$0, (%edx)
+	jz	L(exit_tail0)
+	cmp	$0, 4(%edx)
+	jz	L(exit_tail1)
+	cmp	$0, 8(%edx)
+	jz	L(exit_tail2)
+	cmp	$0, 12(%edx)
+	jz	L(exit_tail3)
+	cmp	$0, 16(%edx)
+	jz	L(exit_tail4)
+	cmp	$0, 20(%edx)
+	jz	L(exit_tail5)
+	cmp	$0, 24(%edx)
+	jz	L(exit_tail6)
+	cmp	$0, 28(%edx)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%edx), %eax
+	lea	16(%edx), %ecx
+	and	$-16, %eax
+
+	pcmpeqd	(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	pcmpeqd	(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%eax), %eax
+	jnz	L(exit)
+
+	and	$-0x40, %eax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%eax), %xmm0
+	movaps	16(%eax), %xmm1
+	movaps	32(%eax), %xmm2
+	movaps	48(%eax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	64(%eax), %eax
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	48(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	-16(%ecx), %ecx
+	jnz	L(exit)
+
+	jmp	L(aligned_64_loop)
+
+	.p2align 4
+L(exit):
+	sub	%ecx, %eax
+	shr	$2, %eax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(exit_1)
+	ret
+
+	.p2align 4
+L(exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(exit_3)
+	add	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_1):
+	add	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	add	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail0):
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_tail1):
+	mov	$1, %eax
+	ret
+
+	.p2align 4
+L(exit_tail2):
+	mov	$2, %eax
+	ret
+
+	.p2align 4
+L(exit_tail3):
+	mov	$3, %eax
+	ret
+
+	.p2align 4
+L(exit_tail4):
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(exit_tail5):
+	mov	$5, %eax
+	ret
+
+	.p2align 4
+L(exit_tail6):
+	mov	$6, %eax
+	ret
+
+	.p2align 4
+L(exit_tail7):
+	mov	$7, %eax
+	ret
+
+END (__wcslen_sse2)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen.S b/libc/sysdeps/i386/i686/multiarch/wcslen.S
new file mode 100644
index 000000000..58670377e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wcslen.S
@@ -0,0 +1,56 @@
+/* Multiple versions of wcslen
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcslen_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wcslen_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(__wcslen)
+
+weak_alias(__wcslen, wcslen)
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 000000000..94ff6151f
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP  __wmemcmp_ia32
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 000000000..1a857c7e2
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 000000000..a41ef95fc
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S
new file mode 100644
index 000000000..5080c14ea
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -0,0 +1,59 @@
+/* Multiple versions of wmemcmp
+   Copyright (C)  2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+
+#ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+	__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wmemcmp_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wmemcmp_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wmemcmp_sse4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wmemcmp)
+#endif
diff --git a/libc/sysdeps/i386/sysdep.h b/libc/sysdeps/i386/sysdep.h
index efdc82dde..a8a9e571b 100644
--- a/libc/sysdeps/i386/sysdep.h
+++ b/libc/sysdeps/i386/sysdep.h
@@ -1,5 +1,5 @@
 /* Assembler macros for i386.
-   Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006
+   Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006,2011
    Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -167,4 +167,6 @@ __i686.get_pc_thunk.reg:						      \
 #endif
 #endif
 
+#define atom_text_section .section ".text.atom", "ax"
+
 #endif	/* __ASSEMBLER__ */