diff options
Diffstat (limited to 'libc/sysdeps/i386')
90 files changed, 5446 insertions, 819 deletions
diff --git a/libc/sysdeps/i386/bits/select.h b/libc/sysdeps/i386/bits/select.h index ab9aa3d10..9e4c56aa8 100644 --- a/libc/sysdeps/i386/bits/select.h +++ b/libc/sysdeps/i386/bits/select.h @@ -48,8 +48,8 @@ #endif /* GNU CC */ #define __FD_SET(d, set) \ - ((void) (__FDS_BITS (set)[__FDELT (d)] |= __FDMASK (d))) + ((void) (__FDS_BITS (set)[__FD_ELT (d)] |= __FD_MASK (d))) #define __FD_CLR(d, set) \ - ((void) (__FDS_BITS (set)[__FDELT (d)] &= ~__FDMASK (d))) + ((void) (__FDS_BITS (set)[__FD_ELT (d)] &= ~__FD_MASK (d))) #define __FD_ISSET(d, set) \ - ((__FDS_BITS (set)[__FDELT (d)] & __FDMASK (d)) != 0) + ((__FDS_BITS (set)[__FD_ELT (d)] & __FD_MASK (d)) != 0) diff --git a/libc/sysdeps/i386/configure b/libc/sysdeps/i386/configure index e8a7970e3..cd4e627ea 100755 --- a/libc/sysdeps/i386/configure +++ b/libc/sysdeps/i386/configure @@ -682,6 +682,31 @@ _ACEOF fi +{ $as_echo "$as_me:$LINENO: checking for FMA4 support" >&5 +$as_echo_n "checking for FMA4 support... " >&6; } +if test "${libc_cv_cc_fma4+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_fma4=yes +else + libc_cv_cc_fma4=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_fma4" >&5 +$as_echo "$libc_cv_cc_fma4" >&6; } +if test $libc_cv_cc_fma4 = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_FMA4_SUPPORT 1 +_ACEOF + +fi + { $as_echo "$as_me:$LINENO: checking for -mno-vzeroupper support" >&5 $as_echo_n "checking for -mno-vzeroupper support... 
" >&6; } if test "${libc_cv_cc_novzeroupper+set}" = set; then diff --git a/libc/sysdeps/i386/configure.in b/libc/sysdeps/i386/configure.in index 67fd1d7df..5a9840e16 100644 --- a/libc/sysdeps/i386/configure.in +++ b/libc/sysdeps/i386/configure.in @@ -67,6 +67,17 @@ if test $libc_cv_cc_avx = yes; then AC_DEFINE(HAVE_AVX_SUPPORT) fi +dnl Check if -mfma4 works. +AC_CACHE_CHECK(for FMA4 support, libc_cv_cc_fma4, [dnl +if AC_TRY_COMMAND([${CC-cc} -mfma4 -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_fma4=yes +else + libc_cv_cc_fma4=no +fi]) +if test $libc_cv_cc_fma4 = yes; then + AC_DEFINE(HAVE_FMA4_SUPPORT) +fi + dnl Check if -mno-vzeroupper works. AC_CACHE_CHECK(for -mno-vzeroupper support, libc_cv_cc_novzeroupper, [dnl if AC_TRY_COMMAND([${CC-cc} -mno-vzeroupper -xc /dev/null -S -o /dev/null]); then diff --git a/libc/sysdeps/i386/dl-machine.h b/libc/sysdeps/i386/dl-machine.h index a093d2b15..9469a2b5d 100644 --- a/libc/sysdeps/i386/dl-machine.h +++ b/libc/sysdeps/i386/dl-machine.h @@ -1,5 +1,5 @@ /* Machine-dependent ELF dynamic relocation inline functions. i386 version. - Copyright (C) 1995-2005, 2006, 2009 Free Software Foundation, Inc. + Copyright (C) 1995-2005, 2006, 2009, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -243,18 +243,12 @@ _dl_start_user:\n\ define the value. ELF_RTYPE_CLASS_NOCOPY iff TYPE should not be allowed to resolve to one of the main executable's symbols, as for a COPY reloc. 
*/ -#if !defined RTLD_BOOTSTRAP || USE___THREAD # define elf_machine_type_class(type) \ ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32 \ || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32 \ || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC) \ * ELF_RTYPE_CLASS_PLT) \ | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY)) -#else -# define elf_machine_type_class(type) \ - ((((type) == R_386_JMP_SLOT) * ELF_RTYPE_CLASS_PLT) \ - | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY)) -#endif /* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */ #define ELF_MACHINE_JMP_SLOT R_386_JMP_SLOT @@ -311,7 +305,7 @@ auto inline void __attribute ((always_inline)) elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, const Elf32_Sym *sym, const struct r_found_version *version, - void *const reloc_addr_arg) + void *const reloc_addr_arg, int skip_ifunc) { Elf32_Addr *const reloc_addr = reloc_addr_arg; const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); @@ -347,7 +341,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, if (sym != NULL && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) - && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1)) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) + && __builtin_expect (!skip_ifunc, 1)) value = ((Elf32_Addr (*) (void)) value) (); switch (r_type) @@ -357,44 +352,43 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, *reloc_addr = value; break; -# if !defined RTLD_BOOTSTRAP || USE___THREAD case R_386_TLS_DTPMOD32: -# ifdef RTLD_BOOTSTRAP +# ifdef RTLD_BOOTSTRAP /* During startup the dynamic linker is always the module with index 1. XXX If this relocation is necessary move before RESOLVE call. */ *reloc_addr = 1; -# else +# else /* Get the information from the link map returned by the resolv function. 
*/ if (sym_map != NULL) *reloc_addr = sym_map->l_tls_modid; -# endif +# endif break; case R_386_TLS_DTPOFF32: -# ifndef RTLD_BOOTSTRAP +# ifndef RTLD_BOOTSTRAP /* During relocation all TLS symbols are defined and used. Therefore the offset is already correct. */ if (sym != NULL) *reloc_addr = sym->st_value; -# endif +# endif break; case R_386_TLS_DESC: { struct tlsdesc volatile *td = (struct tlsdesc volatile *)reloc_addr; -# ifndef RTLD_BOOTSTRAP +# ifndef RTLD_BOOTSTRAP if (! sym) td->entry = _dl_tlsdesc_undefweak; else -# endif +# endif { -# ifndef RTLD_BOOTSTRAP -# ifndef SHARED +# ifndef RTLD_BOOTSTRAP +# ifndef SHARED CHECK_STATIC_TLS (map, sym_map); -# else +# else if (!TRY_STATIC_TLS (map, sym_map)) { td->arg = _dl_make_tlsdesc_dynamic @@ -402,8 +396,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, td->entry = _dl_tlsdesc_dynamic; } else -# endif # endif +# endif { td->arg = (void*)(sym->st_value - sym_map->l_tls_offset + (ElfW(Word))td->arg); @@ -426,13 +420,13 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, CHECK_STATIC_TLS (map, sym_map); *reloc_addr += sym_map->l_tls_offset - sym->st_value; } -# endif +# endif break; case R_386_TLS_TPOFF: /* The offset is negative, forward from the thread pointer. */ -# ifdef RTLD_BOOTSTRAP +# ifdef RTLD_BOOTSTRAP *reloc_addr += sym->st_value - map->l_tls_offset; -# else +# else /* We know the offset of object the symbol is contained in. It is a negative value which will be added to the thread pointer. 
*/ @@ -441,9 +435,8 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, CHECK_STATIC_TLS (map, sym_map); *reloc_addr += sym->st_value - sym_map->l_tls_offset; } -# endif +# endif break; -# endif /* use TLS */ # ifndef RTLD_BOOTSTRAP case R_386_32: @@ -490,7 +483,7 @@ auto inline void __attribute__ ((always_inline)) elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, const Elf32_Sym *sym, const struct r_found_version *version, - void *const reloc_addr_arg) + void *const reloc_addr_arg, int skip_ifunc) { Elf32_Addr *const reloc_addr = reloc_addr_arg; const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); @@ -507,8 +500,8 @@ elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, if (sym != NULL && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) - && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, - 0)) + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (!skip_ifunc, 1)) value = ((Elf32_Addr (*) (void)) value) (); switch (ELF32_R_TYPE (reloc->r_info)) @@ -655,7 +648,8 @@ elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc, auto inline void __attribute__ ((always_inline)) elf_machine_lazy_rel (struct link_map *map, - Elf32_Addr l_addr, const Elf32_Rel *reloc) + Elf32_Addr l_addr, const Elf32_Rel *reloc, + int skip_ifunc) { Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); @@ -706,19 +700,20 @@ elf_machine_lazy_rel (struct link_map *map, ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff; elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], &map->l_versions[ndx], - (void *) (l_addr + r->r_offset)); + (void *) (l_addr + r->r_offset), skip_ifunc); } # ifndef RTLD_BOOTSTRAP else elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL, - (void *) (l_addr + r->r_offset)); + (void *) (l_addr + r->r_offset), skip_ifunc); # endif } } else if (__builtin_expect 
(r_type == R_386_IRELATIVE, 0)) { Elf32_Addr value = map->l_addr + *reloc_addr; - value = ((Elf32_Addr (*) (void)) value) (); + if (__builtin_expect (!skip_ifunc, 1)) + value = ((Elf32_Addr (*) (void)) value) (); *reloc_addr = value; } else @@ -730,7 +725,8 @@ elf_machine_lazy_rel (struct link_map *map, auto inline void __attribute__ ((always_inline)) elf_machine_lazy_rela (struct link_map *map, - Elf32_Addr l_addr, const Elf32_Rela *reloc) + Elf32_Addr l_addr, const Elf32_Rela *reloc, + int skip_ifunc) { Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); @@ -747,7 +743,8 @@ elf_machine_lazy_rela (struct link_map *map, else if (__builtin_expect (r_type == R_386_IRELATIVE, 0)) { Elf32_Addr value = map->l_addr + reloc->r_addend; - value = ((Elf32_Addr (*) (void)) value) (); + if (__builtin_expect (!skip_ifunc, 1)) + value = ((Elf32_Addr (*) (void)) value) (); *reloc_addr = value; } else diff --git a/libc/sysdeps/i386/elf/configure b/libc/sysdeps/i386/elf/configure index 7a909d9a5..88edda0a1 100755 --- a/libc/sysdeps/i386/elf/configure +++ b/libc/sysdeps/i386/elf/configure @@ -1,7 +1,6 @@ # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/i386/elf. -if test "$usetls" != no; then # Check for support of thread-local storage handling in assembler and # linker. 
{ $as_echo "$as_me:$LINENO: checking for i386 TLS support" >&5 @@ -39,12 +38,10 @@ rm -f conftest* fi { $as_echo "$as_me:$LINENO: result: $libc_cv_386_tls" >&5 $as_echo "$libc_cv_386_tls" >&6; } -if test $libc_cv_386_tls = yes; then - cat >>confdefs.h <<\_ACEOF -#define HAVE_TLS_SUPPORT 1 -_ACEOF - -fi +if test $libc_cv_386_tls = no; then + { { $as_echo "$as_me:$LINENO: error: the assembler must support TLS" >&5 +$as_echo "$as_me: error: the assembler must support TLS" >&2;} + { (exit 1); exit 1; }; } fi cat >>confdefs.h <<\_ACEOF diff --git a/libc/sysdeps/i386/elf/configure.in b/libc/sysdeps/i386/elf/configure.in index ca607adeb..0c436f3f4 100644 --- a/libc/sysdeps/i386/elf/configure.in +++ b/libc/sysdeps/i386/elf/configure.in @@ -1,7 +1,6 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. # Local configure fragment for sysdeps/i386/elf. -if test "$usetls" != no; then # Check for support of thread-local storage handling in assembler and # linker. AC_CACHE_CHECK(for i386 TLS support, libc_cv_386_tls, [dnl @@ -28,9 +27,8 @@ else libc_cv_386_tls=no fi rm -f conftest*]) -if test $libc_cv_386_tls = yes; then - AC_DEFINE(HAVE_TLS_SUPPORT) -fi +if test $libc_cv_386_tls = no; then + AC_MSG_ERROR([the assembler must support TLS]) fi dnl It is always possible to access static and hidden symbols in an diff --git a/libc/sysdeps/i386/fpu/e_acos.S b/libc/sysdeps/i386/fpu/e_acos.S index b9d07b109..d3505baf0 100644 --- a/libc/sysdeps/i386/fpu/e_acos.S +++ b/libc/sysdeps/i386/fpu/e_acos.S @@ -19,3 +19,4 @@ ENTRY(__ieee754_acos) fpatan /* atan (sqrt(1 - x^2) / x) */ ret END (__ieee754_acos) +strong_alias (__ieee754_acos, __acos_finite) diff --git a/libc/sysdeps/i386/fpu/e_acosf.S b/libc/sysdeps/i386/fpu/e_acosf.S index 50b13fd1b..6a843a51d 100644 --- a/libc/sysdeps/i386/fpu/e_acosf.S +++ b/libc/sysdeps/i386/fpu/e_acosf.S @@ -20,3 +20,4 @@ ENTRY(__ieee754_acosf) fpatan ret END (__ieee754_acosf) +strong_alias (__ieee754_acosf, __acosf_finite) diff --git 
a/libc/sysdeps/i386/fpu/e_acosh.S b/libc/sysdeps/i386/fpu/e_acosh.S index 62a232471..fc65c295c 100644 --- a/libc/sysdeps/i386/fpu/e_acosh.S +++ b/libc/sysdeps/i386/fpu/e_acosh.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arcsinh. - Copyright (C) 1996, 2005 Free Software Foundation, Inc. + Copyright (C) 1996, 2005, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -21,12 +21,12 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -101,3 +101,4 @@ ENTRY(__ieee754_acosh) fdiv %st, %st(0) ret END(__ieee754_acosh) +strong_alias (__ieee754_acosh, __acosh_finite) diff --git a/libc/sysdeps/i386/fpu/e_acoshf.S b/libc/sysdeps/i386/fpu/e_acoshf.S index 1906c6057..b55004b62 100644 --- a/libc/sysdeps/i386/fpu/e_acoshf.S +++ b/libc/sysdeps/i386/fpu/e_acoshf.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arcsinh. - Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -21,12 +21,12 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -101,3 +101,4 @@ ENTRY(__ieee754_acoshf) fdiv %st, %st(0) ret END(__ieee754_acoshf) +strong_alias (__ieee754_acoshf, __acoshf_finite) diff --git a/libc/sysdeps/i386/fpu/e_acoshl.S b/libc/sysdeps/i386/fpu/e_acoshl.S index c7b548d25..76bc0d752 100644 --- a/libc/sysdeps/i386/fpu/e_acoshl.S +++ b/libc/sysdeps/i386/fpu/e_acoshl.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arcsinh. 
- Copyright (C) 1996, 1997, 2005 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 2005, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -21,12 +21,12 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 /* Please note that we use double value for 1.0. This number has an exact representation and so we don't get accuracy problems. The advantage is that the code is simpler. */ @@ -108,3 +108,4 @@ ENTRY(__ieee754_acoshl) fdiv %st, %st(0) ret END(__ieee754_acoshl) +strong_alias (__ieee754_acoshl, __acoshl_finite) diff --git a/libc/sysdeps/i386/fpu/e_acosl.c b/libc/sysdeps/i386/fpu/e_acosl.c index 0c3e03945..ec516ffca 100644 --- a/libc/sysdeps/i386/fpu/e_acosl.c +++ b/libc/sysdeps/i386/fpu/e_acosl.c @@ -23,3 +23,4 @@ __ieee754_acosl (long double x) : "=t" (res) : "0" (x) : "st(1)"); return res; } +strong_alias (__ieee754_acosl, __acosl_finite) diff --git a/libc/sysdeps/i386/fpu/e_asin.S b/libc/sysdeps/i386/fpu/e_asin.S index 945e30824..a17e922b6 100644 --- a/libc/sysdeps/i386/fpu/e_asin.S +++ b/libc/sysdeps/i386/fpu/e_asin.S @@ -18,3 +18,4 @@ ENTRY(__ieee754_asin) fpatan ret END (__ieee754_asin) +strong_alias (__ieee754_asin, __asin_finite) diff --git a/libc/sysdeps/i386/fpu/e_asinf.S b/libc/sysdeps/i386/fpu/e_asinf.S index d450e9a74..5c1065dd4 100644 --- a/libc/sysdeps/i386/fpu/e_asinf.S +++ b/libc/sysdeps/i386/fpu/e_asinf.S @@ -19,3 +19,4 @@ ENTRY(__ieee754_asinf) fpatan ret END (__ieee754_asinf) +strong_alias (__ieee754_asinf, __asinf_finite) diff --git a/libc/sysdeps/i386/fpu/e_atan2.S b/libc/sysdeps/i386/fpu/e_atan2.S index 8df04e485..e76f8e2a7 100644 --- a/libc/sysdeps/i386/fpu/e_atan2.S +++ b/libc/sysdeps/i386/fpu/e_atan2.S @@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2) fpatan ret END (__ieee754_atan2) +strong_alias (__ieee754_atan2, __atan2_finite) diff --git 
a/libc/sysdeps/i386/fpu/e_atan2f.S b/libc/sysdeps/i386/fpu/e_atan2f.S index fc6621f18..9ffa6373b 100644 --- a/libc/sysdeps/i386/fpu/e_atan2f.S +++ b/libc/sysdeps/i386/fpu/e_atan2f.S @@ -13,3 +13,4 @@ ENTRY(__ieee754_atan2f) fpatan ret END (__ieee754_atan2f) +strong_alias (__ieee754_atan2f, __atan2f_finite) diff --git a/libc/sysdeps/i386/fpu/e_atan2l.c b/libc/sysdeps/i386/fpu/e_atan2l.c index 19a2a6062..9f88bfcc0 100644 --- a/libc/sysdeps/i386/fpu/e_atan2l.c +++ b/libc/sysdeps/i386/fpu/e_atan2l.c @@ -16,3 +16,4 @@ __ieee754_atan2l (long double y, long double x) return res; } +strong_alias (__ieee754_atan2l, __atan2l_finite) diff --git a/libc/sysdeps/i386/fpu/e_atanh.S b/libc/sysdeps/i386/fpu/e_atanh.S index 3566ec6ef..d7e53a288 100644 --- a/libc/sysdeps/i386/fpu/e_atanh.S +++ b/libc/sysdeps/i386/fpu/e_atanh.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arctanh function. - Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc. + Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -114,3 +114,4 @@ ENTRY(__ieee754_atanh) 6: fldl 4(%esp) ret END(__ieee754_atanh) +strong_alias (__ieee754_atanh, __atanh_finite) diff --git a/libc/sysdeps/i386/fpu/e_atanhf.S b/libc/sysdeps/i386/fpu/e_atanhf.S index 10ce6aed9..00ad9142f 100644 --- a/libc/sysdeps/i386/fpu/e_atanhf.S +++ b/libc/sysdeps/i386/fpu/e_atanhf.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arctanh function. - Copyright (C) 1996, 1999, 2005 Free Software Foundation, Inc. + Copyright (C) 1996, 1999, 2005, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. 
@@ -107,3 +107,4 @@ ENTRY(__ieee754_atanhf) 5: flds 4(%esp) ret END(__ieee754_atanhf) +strong_alias (__ieee754_atanhf, __atanhf_finite) diff --git a/libc/sysdeps/i386/fpu/e_atanhl.S b/libc/sysdeps/i386/fpu/e_atanhl.S index 8618c3fb3..cc70e73f4 100644 --- a/libc/sysdeps/i386/fpu/e_atanhl.S +++ b/libc/sysdeps/i386/fpu/e_atanhl.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of arctanh function. - Copyright (C) 1996, 1999 Free Software Foundation, Inc. + Copyright (C) 1996, 1999, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -118,3 +118,4 @@ ENTRY(__ieee754_atanhl) 6: fldt 4(%esp) ret END(__ieee754_atanhl) +strong_alias (__ieee754_atanhl, __atanhl_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp.S b/libc/sysdeps/i386/fpu/e_exp.S index 4a75fa1d1..2c331d9ed 100644 --- a/libc/sysdeps/i386/fpu/e_exp.S +++ b/libc/sysdeps/i386/fpu/e_exp.S @@ -5,7 +5,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_exp.S,v 1.7 1996/07/03 17:31:28 jtc Exp $") /* e^x = 2^(x * log2(e)) */ ENTRY(__ieee754_exp) @@ -39,3 +38,19 @@ ENTRY(__ieee754_exp) fldz /* Set result to 0. */ 2: ret END (__ieee754_exp) + + +ENTRY(__exp_finite) + fldl2e + fmull 4(%esp) /* x * log2(e) */ + fld %st + frndint /* int(x * log2(e)) */ + fsubr %st,%st(1) /* fract(x * log2(e)) */ + fxch + f2xm1 /* 2^(fract(x * log2(e))) - 1 */ + fld1 + faddp /* 2^(fract(x * log2(e))) */ + fscale /* e^x */ + fstp %st(1) + ret +END(__exp_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp10.S b/libc/sysdeps/i386/fpu/e_exp10.S index 6bfcdbb72..1e32b0784 100644 --- a/libc/sysdeps/i386/fpu/e_exp10.S +++ b/libc/sysdeps/i386/fpu/e_exp10.S @@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10) fldz /* Set result to 0. 
*/ 2: ret END (__ieee754_exp10) +strong_alias (__ieee754_exp10, __exp10_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp10f.S b/libc/sysdeps/i386/fpu/e_exp10f.S index 4791b99af..614496415 100644 --- a/libc/sysdeps/i386/fpu/e_exp10f.S +++ b/libc/sysdeps/i386/fpu/e_exp10f.S @@ -4,7 +4,7 @@ #include <machine/asm.h> -/* e^x = 2^(x * log2(10)) */ +/* 10^x = 2^(x * log2(10)) */ ENTRY(__ieee754_exp10f) flds 4(%esp) /* I added the following ugly construct because exp(+-Inf) resulted @@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10f) fldz /* Set result to 0. */ 2: ret END (__ieee754_exp10f) +strong_alias (__ieee754_exp10f, __exp10f_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp10l.S b/libc/sysdeps/i386/fpu/e_exp10l.S index 71f0da792..04ec8001d 100644 --- a/libc/sysdeps/i386/fpu/e_exp10l.S +++ b/libc/sysdeps/i386/fpu/e_exp10l.S @@ -4,7 +4,7 @@ #include <machine/asm.h> -/* e^x = 2^(x * log2l(10)) */ +/* 10^x = 2^(x * log2l(10)) */ ENTRY(__ieee754_exp10l) fldt 4(%esp) /* I added the following ugly construct because expl(+-Inf) resulted @@ -36,3 +36,4 @@ ENTRY(__ieee754_exp10l) fldz /* Set result to 0. */ 2: ret END (__ieee754_exp10l) +strong_alias (__ieee754_exp10l, __exp10l_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp2.S b/libc/sysdeps/i386/fpu/e_exp2.S index 778c0c0eb..f802cf8b9 100644 --- a/libc/sysdeps/i386/fpu/e_exp2.S +++ b/libc/sysdeps/i386/fpu/e_exp2.S @@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2) fldz /* Set result to 0. */ 2: ret END (__ieee754_exp2) +strong_alias (__ieee754_exp2, __exp2_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp2f.S b/libc/sysdeps/i386/fpu/e_exp2f.S index c2d1af1af..f867d0d47 100644 --- a/libc/sysdeps/i386/fpu/e_exp2f.S +++ b/libc/sysdeps/i386/fpu/e_exp2f.S @@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2f) fldz /* Set result to 0. 
*/ 2: ret END (__ieee754_exp2f) +strong_alias (__ieee754_exp2f, __exp2f_finite) diff --git a/libc/sysdeps/i386/fpu/e_exp2l.S b/libc/sysdeps/i386/fpu/e_exp2l.S index fa1fdc923..203dd0078 100644 --- a/libc/sysdeps/i386/fpu/e_exp2l.S +++ b/libc/sysdeps/i386/fpu/e_exp2l.S @@ -35,3 +35,4 @@ ENTRY(__ieee754_exp2l) fldz /* Set result to 0. */ 2: ret END (__ieee754_exp2l) +strong_alias (__ieee754_exp2l, __exp2l_finite) diff --git a/libc/sysdeps/i386/fpu/e_expf.S b/libc/sysdeps/i386/fpu/e_expf.S index 5fd49b89f..4e4f6a0df 100644 --- a/libc/sysdeps/i386/fpu/e_expf.S +++ b/libc/sysdeps/i386/fpu/e_expf.S @@ -6,7 +6,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") /* e^x = 2^(x * log2(e)) */ ENTRY(__ieee754_expf) @@ -40,3 +39,19 @@ ENTRY(__ieee754_expf) fldz /* Set result to 0. */ 2: ret END (__ieee754_expf) + + +ENTRY(__expf_finite) + fldl2e + fmuls 4(%esp) /* x * log2(e) */ + fld %st + frndint /* int(x * log2(e)) */ + fsubr %st,%st(1) /* fract(x * log2(e)) */ + fxch + f2xm1 /* 2^(fract(x * log2(e))) - 1 */ + fld1 + faddp /* 2^(fract(x * log2(e))) */ + fscale /* e^x */ + fstp %st(1) + ret +END(__expf_finite) diff --git a/libc/sysdeps/i386/fpu/e_expl.c b/libc/sysdeps/i386/fpu/e_expl.c index 2240ceac4..8dc9581f7 100644 --- a/libc/sysdeps/i386/fpu/e_expl.c +++ b/libc/sysdeps/i386/fpu/e_expl.c @@ -63,7 +63,7 @@ __ieee754_expl (long double x) "fld1\n\t" /* 4 1.0 */ "faddp\n\t" /* 3 2^(fract(x * log2(e))) */ "fstp %%st(1)\n\t" /* 2 */ - "fscale\n\t" /* 2 scale factor is st(1); e^x */ + "fscale\n\t" /* 2 scale factor is st(1); e^x */ "fstp %%st(1)\n\t" /* 1 */ "fstp %%st(1)\n\t" /* 0 */ "jmp 2f\n\t" @@ -75,3 +75,4 @@ __ieee754_expl (long double x) : "=t" (res) : "0" (x), "m" (c0), "m" (c1) : "ax", "dx"); return res; } +strong_alias (__ieee754_expl, __expl_finite) diff --git a/libc/sysdeps/i386/fpu/e_fmod.S b/libc/sysdeps/i386/fpu/e_fmod.S index 4cf6e9205..26b3acc39 100644 --- a/libc/sysdeps/i386/fpu/e_fmod.S +++ b/libc/sysdeps/i386/fpu/e_fmod.S @@ -5,8 +5,6 @@ #include 
<machine/asm.h> -RCSID("$NetBSD: e_fmod.S,v 1.4 1995/05/08 23:47:56 jtc Exp $") - ENTRY(__ieee754_fmod) fldl 12(%esp) fldl 4(%esp) @@ -17,3 +15,4 @@ ENTRY(__ieee754_fmod) fstp %st(1) ret END (__ieee754_fmod) +strong_alias (__ieee754_fmod, __fmod_finite) diff --git a/libc/sysdeps/i386/fpu/e_fmodf.S b/libc/sysdeps/i386/fpu/e_fmodf.S index bbce40976..ece4d9842 100644 --- a/libc/sysdeps/i386/fpu/e_fmodf.S +++ b/libc/sysdeps/i386/fpu/e_fmodf.S @@ -6,8 +6,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - ENTRY(__ieee754_fmodf) flds 8(%esp) flds 4(%esp) @@ -18,3 +16,4 @@ ENTRY(__ieee754_fmodf) fstp %st(1) ret END(__ieee754_fmodf) +strong_alias (__ieee754_fmodf, __fmodf_finite) diff --git a/libc/sysdeps/i386/fpu/e_fmodl.c b/libc/sysdeps/i386/fpu/e_fmodl.c index c7c9a6045..49700ae8f 100644 --- a/libc/sysdeps/i386/fpu/e_fmodl.c +++ b/libc/sysdeps/i386/fpu/e_fmodl.c @@ -20,3 +20,4 @@ __ieee754_fmodl (long double x, long double y) : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)"); return res; } +strong_alias (__ieee754_fmodl, __fmodl_finite) diff --git a/libc/sysdeps/i386/fpu/e_hypot.S b/libc/sysdeps/i386/fpu/e_hypot.S index 043585730..0baa011d1 100644 --- a/libc/sysdeps/i386/fpu/e_hypot.S +++ b/libc/sysdeps/i386/fpu/e_hypot.S @@ -1,5 +1,5 @@ /* Compute the hypothenuse of X and Y. - Copyright (C) 1998 Free Software Foundation, Inc. + Copyright (C) 1998, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. @@ -58,5 +58,6 @@ ENTRY(__ieee754_hypot) fxch 5: fstp %st(1) jmp 2b - + END(__ieee754_hypot) +strong_alias (__ieee754_hypot, __hypot_finite) diff --git a/libc/sysdeps/i386/fpu/e_hypotf.S b/libc/sysdeps/i386/fpu/e_hypotf.S index 5967dae21..eb95d6ee9 100644 --- a/libc/sysdeps/i386/fpu/e_hypotf.S +++ b/libc/sysdeps/i386/fpu/e_hypotf.S @@ -1,5 +1,5 @@ /* Compute the hypothenuse of X and Y. - Copyright (C) 1998 Free Software Foundation, Inc. + Copyright (C) 1998, 2011 Free Software Foundation, Inc. 
This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. @@ -58,5 +58,6 @@ ENTRY(__ieee754_hypotf) fxch 5: fstp %st(1) jmp 2b - + END(__ieee754_hypotf) +strong_alias (__ieee754_hypotf, __hypotf_finite) diff --git a/libc/sysdeps/i386/fpu/e_log.S b/libc/sysdeps/i386/fpu/e_log.S index ce55b7229..a2e4d89a4 100644 --- a/libc/sysdeps/i386/fpu/e_log.S +++ b/libc/sysdeps/i386/fpu/e_log.S @@ -7,14 +7,12 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -27,9 +25,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -64,3 +62,22 @@ ENTRY(__ieee754_log) fstp %st(1) ret END (__ieee754_log) + +ENTRY(__log_finite) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__log_finite) diff --git a/libc/sysdeps/i386/fpu/e_log10.S b/libc/sysdeps/i386/fpu/e_log10.S index 525f08c96..9d24d7402 100644 --- a/libc/sysdeps/i386/fpu/e_log10.S +++ b/libc/sysdeps/i386/fpu/e_log10.S @@ -7,14 +7,12 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_log10.S,v 1.4 1995/05/08 23:49:24 jtc Exp $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -27,9 +25,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) 
op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -64,3 +62,4 @@ ENTRY(__ieee754_log10) fstp %st(1) ret END (__ieee754_log10) +strong_alias (__ieee754_log10, __log10_finite) diff --git a/libc/sysdeps/i386/fpu/e_log10f.S b/libc/sysdeps/i386/fpu/e_log10f.S index da5069d58..38a4833d1 100644 --- a/libc/sysdeps/i386/fpu/e_log10f.S +++ b/libc/sysdeps/i386/fpu/e_log10f.S @@ -8,14 +8,12 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -28,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -65,3 +63,4 @@ ENTRY(__ieee754_log10f) fstp %st(1) ret END (__ieee754_log10f) +strong_alias (__ieee754_log10f, __log10f_finite) diff --git a/libc/sysdeps/i386/fpu/e_log10l.S b/libc/sysdeps/i386/fpu/e_log10l.S index 3811516be..88b309d53 100644 --- a/libc/sysdeps/i386/fpu/e_log10l.S +++ b/libc/sysdeps/i386/fpu/e_log10l.S @@ -9,14 +9,12 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -29,9 +27,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -66,3 +64,4 @@ ENTRY(__ieee754_log10l) fstp %st(1) ret END(__ieee754_log10l) +strong_alias (__ieee754_log10l, __log10l_finite) diff --git a/libc/sysdeps/i386/fpu/e_log2.S b/libc/sysdeps/i386/fpu/e_log2.S index d80bf8023..88aee7f3c 100644 --- a/libc/sysdeps/i386/fpu/e_log2.S +++ b/libc/sysdeps/i386/fpu/e_log2.S @@ -9,11 +9,11 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + 
.section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -26,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -63,3 +63,4 @@ ENTRY(__ieee754_log2) fstp %st(1) ret END (__ieee754_log2) +strong_alias (__ieee754_log2, __log2_finite) diff --git a/libc/sysdeps/i386/fpu/e_log2f.S b/libc/sysdeps/i386/fpu/e_log2f.S index 9eb7b2a82..20144875f 100644 --- a/libc/sysdeps/i386/fpu/e_log2f.S +++ b/libc/sysdeps/i386/fpu/e_log2f.S @@ -9,11 +9,11 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -26,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -63,3 +63,4 @@ ENTRY(__ieee754_log2f) fstp %st(1) ret END (__ieee754_log2f) +strong_alias (__ieee754_log2f, __log2f_finite) diff --git a/libc/sysdeps/i386/fpu/e_log2l.S b/libc/sysdeps/i386/fpu/e_log2l.S index 9de08f5de..bc79dea2d 100644 --- a/libc/sysdeps/i386/fpu/e_log2l.S +++ b/libc/sysdeps/i386/fpu/e_log2l.S @@ -9,11 +9,11 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -26,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -63,3 +63,4 @@ ENTRY(__ieee754_log2l) fstp %st(1) ret END (__ieee754_log2l) +strong_alias (__ieee754_log2l, __log2l_finite) diff --git a/libc/sysdeps/i386/fpu/e_logf.S 
b/libc/sysdeps/i386/fpu/e_logf.S index cd4538b59..1992cc2f8 100644 --- a/libc/sysdeps/i386/fpu/e_logf.S +++ b/libc/sysdeps/i386/fpu/e_logf.S @@ -8,14 +8,12 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_log.S,v 1.4 1995/05/08 23:48:39 jtc Exp $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -28,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -65,3 +63,22 @@ ENTRY(__ieee754_logf) fstp %st(1) ret END (__ieee754_logf) + +ENTRY(__logf_finite) + fldln2 // log(2) + flds 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logf_finite) diff --git a/libc/sysdeps/i386/fpu/e_logl.S b/libc/sysdeps/i386/fpu/e_logl.S index 551dcf1e4..bfb72a30e 100644 --- a/libc/sysdeps/i386/fpu/e_logl.S +++ b/libc/sysdeps/i386/fpu/e_logl.S @@ -7,15 +7,13 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif - .align ALIGNARG(4) + .p2align 3 ASM_TYPE_DIRECTIVE(one,@object) one: .double 1.0 ASM_SIZE_DIRECTIVE(one) @@ -28,9 +26,9 @@ limit: .double 0.29 #ifdef PIC -#define MO(op) op##@GOTOFF(%edx) +# define MO(op) op##@GOTOFF(%edx) #else -#define MO(op) op +# define MO(op) op #endif .text @@ -65,3 +63,22 @@ ENTRY(__ieee754_logl) fstp %st(1) ret END (__ieee754_logl) + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : 
log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logl_finite) diff --git a/libc/sysdeps/i386/fpu/e_pow.S b/libc/sysdeps/i386/fpu/e_pow.S index 792f92690..dccc67752 100644 --- a/libc/sysdeps/i386/fpu/e_pow.S +++ b/libc/sysdeps/i386/fpu/e_pow.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007 + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -22,12 +22,27 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif + .p2align 3 + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) - .align ALIGNARG(4) +#ifdef __ELF__ + .section .rodata.cst16,"aM",@progbits,16 +#else + .text +#endif + .p2align 3 ASM_TYPE_DIRECTIVE(infinity,@object) inf_zero: infinity: @@ -43,22 +58,13 @@ minfinity: mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 ASM_SIZE_DIRECTIVE(minf_mzero) - ASM_TYPE_DIRECTIVE(one,@object) -one: .double 1.0 - ASM_SIZE_DIRECTIVE(one) - ASM_TYPE_DIRECTIVE(limit,@object) -limit: .double 0.29 - ASM_SIZE_DIRECTIVE(limit) - ASM_TYPE_DIRECTIVE(p63,@object) -p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 - ASM_SIZE_DIRECTIVE(p63) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) +# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif .text @@ 
-360,3 +366,4 @@ ENTRY(__ieee754_pow) ret END(__ieee754_pow) +strong_alias (__ieee754_pow, __pow_finite) diff --git a/libc/sysdeps/i386/fpu/e_powf.S b/libc/sysdeps/i386/fpu/e_powf.S index c91545418..99c95bbdf 100644 --- a/libc/sysdeps/i386/fpu/e_powf.S +++ b/libc/sysdeps/i386/fpu/e_powf.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007 + Copyright (C) 1996, 1997, 1999, 2001, 2004, 2005, 2007, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -22,12 +22,27 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif + .p2align 3 + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p31,@object) +p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41 + ASM_SIZE_DIRECTIVE(p31) - .align ALIGNARG(4) +#ifdef __ELF__ + .section .rodata.cst16,"aM",@progbits,16 +#else + .text +#endif + .p2align 3 ASM_TYPE_DIRECTIVE(infinity,@object) inf_zero: infinity: @@ -43,22 +58,13 @@ minfinity: mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 ASM_SIZE_DIRECTIVE(minf_mzero) - ASM_TYPE_DIRECTIVE(one,@object) -one: .double 1.0 - ASM_SIZE_DIRECTIVE(one) - ASM_TYPE_DIRECTIVE(limit,@object) -limit: .double 0.29 - ASM_SIZE_DIRECTIVE(limit) - ASM_TYPE_DIRECTIVE(p31,@object) -p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41 - ASM_SIZE_DIRECTIVE(p31) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) +# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif .text @@ -348,3 +354,4 @@ ENTRY(__ieee754_powf) ret END(__ieee754_powf) +strong_alias (__ieee754_powf, __powf_finite) diff --git a/libc/sysdeps/i386/fpu/e_powl.S 
b/libc/sysdeps/i386/fpu/e_powl.S index 621549620..34ace3576 100644 --- a/libc/sysdeps/i386/fpu/e_powl.S +++ b/libc/sysdeps/i386/fpu/e_powl.S @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007 + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004, 2005, 2007, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. @@ -22,12 +22,27 @@ #include <machine/asm.h> #ifdef __ELF__ - .section .rodata + .section .rodata.cst8,"aM",@progbits,8 #else .text #endif + .p2align 3 + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) - .align ALIGNARG(4) +#ifdef __ELF__ + .section .rodata.cst16,"aM",@progbits,16 +#else + .text +#endif + .p2align 3 ASM_TYPE_DIRECTIVE(infinity,@object) inf_zero: infinity: @@ -43,22 +58,13 @@ minfinity: mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 ASM_SIZE_DIRECTIVE(minf_mzero) - ASM_TYPE_DIRECTIVE(one,@object) -one: .double 1.0 - ASM_SIZE_DIRECTIVE(one) - ASM_TYPE_DIRECTIVE(limit,@object) -limit: .double 0.29 - ASM_SIZE_DIRECTIVE(limit) - ASM_TYPE_DIRECTIVE(p63,@object) -p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 - ASM_SIZE_DIRECTIVE(p63) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) +# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif .text @@ -360,3 +366,4 @@ ENTRY(__ieee754_powl) ret END(__ieee754_powl) +strong_alias (__ieee754_powl, __powl_finite) diff --git a/libc/sysdeps/i386/fpu/e_remainder.S b/libc/sysdeps/i386/fpu/e_remainder.S index 2f43cb894..f7867aa90 100644 --- a/libc/sysdeps/i386/fpu/e_remainder.S +++ 
b/libc/sysdeps/i386/fpu/e_remainder.S @@ -5,8 +5,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_remainder.S,v 1.4 1995/05/08 23:49:37 jtc Exp $") - ENTRY(__ieee754_remainder) fldl 12(%esp) fldl 4(%esp) @@ -17,3 +15,4 @@ ENTRY(__ieee754_remainder) fstp %st(1) ret END (__ieee754_remainder) +strong_alias (__ieee754_remainder, __remainder_finite) diff --git a/libc/sysdeps/i386/fpu/e_remainderf.S b/libc/sysdeps/i386/fpu/e_remainderf.S index 79f821993..cfd390bc6 100644 --- a/libc/sysdeps/i386/fpu/e_remainderf.S +++ b/libc/sysdeps/i386/fpu/e_remainderf.S @@ -5,8 +5,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_remainderf.S,v 1.2 1995/05/08 23:49:47 jtc Exp $") - ENTRY(__ieee754_remainderf) flds 8(%esp) flds 4(%esp) @@ -17,3 +15,4 @@ ENTRY(__ieee754_remainderf) fstp %st(1) ret END (__ieee754_remainderf) +strong_alias (__ieee754_remainderf, __remainderf_finite) diff --git a/libc/sysdeps/i386/fpu/e_remainderl.S b/libc/sysdeps/i386/fpu/e_remainderl.S index 5f50b626a..5ec23a37a 100644 --- a/libc/sysdeps/i386/fpu/e_remainderl.S +++ b/libc/sysdeps/i386/fpu/e_remainderl.S @@ -7,8 +7,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - ENTRY(__ieee754_remainderl) fldt 16(%esp) fldt 4(%esp) @@ -19,3 +17,4 @@ ENTRY(__ieee754_remainderl) fstp %st(1) ret END (__ieee754_remainderl) +strong_alias (__ieee754_remainderl, __remainderl_finite) diff --git a/libc/sysdeps/i386/fpu/e_scalb.S b/libc/sysdeps/i386/fpu/e_scalb.S index 7e334a361..0f3ec9619 100644 --- a/libc/sysdeps/i386/fpu/e_scalb.S +++ b/libc/sysdeps/i386/fpu/e_scalb.S @@ -7,8 +7,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $") - #ifdef __ELF__ .section .rodata #else @@ -20,18 +18,17 @@ RCSID("$NetBSD: e_scalb.S,v 1.4 1995/05/08 23:49:52 jtc Exp $") zero_nan: .double 0.0 nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f -minus_zero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f ASM_SIZE_DIRECTIVE(zero_nan) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define 
MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) +# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif .text @@ -100,3 +97,4 @@ ENTRY(__ieee754_scalb) fdiv %st ret END(__ieee754_scalb) +strong_alias (__ieee754_scalb, __scalb_finite) diff --git a/libc/sysdeps/i386/fpu/e_scalbf.S b/libc/sysdeps/i386/fpu/e_scalbf.S index e99ee92cb..d11ca66d1 100644 --- a/libc/sysdeps/i386/fpu/e_scalbf.S +++ b/libc/sysdeps/i386/fpu/e_scalbf.S @@ -8,8 +8,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - #ifdef __ELF__ .section .rodata #else @@ -21,18 +19,17 @@ RCSID("$NetBSD: $") zero_nan: .double 0.0 nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f -minus_zero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f ASM_SIZE_DIRECTIVE(zero_nan) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) +# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif @@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbf) fdiv %st ret END(__ieee754_scalbf) +strong_alias (__ieee754_scalbf, __scalbf_finite) diff --git a/libc/sysdeps/i386/fpu/e_scalbl.S b/libc/sysdeps/i386/fpu/e_scalbl.S index 3f67d0bef..d8b216971 100644 --- a/libc/sysdeps/i386/fpu/e_scalbl.S +++ b/libc/sysdeps/i386/fpu/e_scalbl.S @@ -9,8 +9,6 @@ #include <machine/asm.h> -RCSID("$NetBSD: $") - #ifdef __ELF__ .section .rodata #else @@ -22,18 +20,17 @@ RCSID("$NetBSD: $") zero_nan: .double 0.0 nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f -minus_zero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80 .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f ASM_SIZE_DIRECTIVE(zero_nan) #ifdef PIC -#define MO(op) op##@GOTOFF(%ecx) -#define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) #else -#define MO(op) op -#define MOX(op,x,f) op(,x,f) 
+# define MO(op) op +# define MOX(op,x,f) op(,x,f) #endif .text @@ -102,3 +99,4 @@ ENTRY(__ieee754_scalbl) fdiv %st ret END(__ieee754_scalbl) +strong_alias (__ieee754_scalbl, __scalbl_finite) diff --git a/libc/sysdeps/i386/fpu/e_sqrt.S b/libc/sysdeps/i386/fpu/e_sqrt.S index 6f253d51a..1054ba453 100644 --- a/libc/sysdeps/i386/fpu/e_sqrt.S +++ b/libc/sysdeps/i386/fpu/e_sqrt.S @@ -5,10 +5,9 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_sqrt.S,v 1.4 1995/05/08 23:49:57 jtc Exp $") - ENTRY(__ieee754_sqrt) fldl 4(%esp) fsqrt ret END (__ieee754_sqrt) +strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/libc/sysdeps/i386/fpu/e_sqrtf.S b/libc/sysdeps/i386/fpu/e_sqrtf.S index 5ce1ad054..6f7e4b015 100644 --- a/libc/sysdeps/i386/fpu/e_sqrtf.S +++ b/libc/sysdeps/i386/fpu/e_sqrtf.S @@ -5,10 +5,9 @@ #include <machine/asm.h> -RCSID("$NetBSD: e_sqrtf.S,v 1.2 1995/05/08 23:50:14 jtc Exp $") - ENTRY(__ieee754_sqrtf) flds 4(%esp) fsqrt ret END (__ieee754_sqrtf) +strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/libc/sysdeps/i386/fpu/e_sqrtl.c b/libc/sysdeps/i386/fpu/e_sqrtl.c index 85f61bb38..41bcd7eeb 100644 --- a/libc/sysdeps/i386/fpu/e_sqrtl.c +++ b/libc/sysdeps/i386/fpu/e_sqrtl.c @@ -7,6 +7,7 @@ #include <math_private.h> +#undef __ieee754_sqrtl long double __ieee754_sqrtl (long double x) { @@ -16,3 +17,4 @@ __ieee754_sqrtl (long double x) return res; } +strong_alias (__ieee754_sqrtl, __sqrtl_finite) diff --git a/libc/sysdeps/i386/fpu/libm-test-ulps b/libc/sysdeps/i386/fpu/libm-test-ulps index 4b1a9e734..ebd46b0df 100644 --- a/libc/sysdeps/i386/fpu/libm-test-ulps +++ b/libc/sysdeps/i386/fpu/libm-test-ulps @@ -640,6 +640,52 @@ double: 1 idouble: 1 ildouble: 1 ldouble: 1 +Test "jn (2, 2.4048255576957729) == 0.43175480701968038399746111312430703": +float: 1 +ifloat: 1 +double: 1 +idouble: 1 +ldouble: 82 +ildouble: 82 +Test "jn (3, 2.4048255576957729) == 0.19899990535769083404042146764530813": +ldouble: 186 +ildouble: 186 +Test "jn (4, 2.4048255576957729) == 
0.647466661641779720084932282551219891E-1": +ldouble: 185 +ildouble: 185 +Test "jn (5, 2.4048255576957729) == 0.163892432048058525099230549946147698E-1": +float: 1 +ifloat: 1 +double: 1 +idouble: 1 +ldouble: 249 +ildouble: 249 +Test "jn (6, 2.4048255576957729) == 0.34048184720278336646673682895929161E-2": +float: 2 +ifloat: 2 +double: 1 +idouble: 1 +ldouble: 511 +ildouble: 511 +Test "jn (7, 2.4048255576957729) == 0.60068836573295394221291569249883076E-3": +float: 2 +ifloat: 2 +double: 1 +idouble: 1 +ldouble: 428 +ildouble: 428 +Test "jn (8, 2.4048255576957729) == 0.92165786705344923232879022467054148E-4": +float: 3 +ifloat: 3 +double: 1 +idouble: 1 +ldouble: 609 +ildouble: 609 +Test "jn (9, 2.4048255576957729) == 0.12517270977961513005428966643852564E-4": +float: 4 +ifloat: 4 +ldouble: 750 +ildouble: 750 # lgamma Test "lgamma (-0.5) == log(2*sqrt(pi))": @@ -1168,11 +1214,11 @@ ldouble: 1 Function: "jn": double: 5 -float: 2 +float: 4 idouble: 5 -ifloat: 2 -ildouble: 2 -ldouble: 2 +ifloat: 4 +ildouble: 750 +ldouble: 750 Function: "lgamma": double: 1 diff --git a/libc/sysdeps/i386/i686/fpu/e_log.S b/libc/sysdeps/i386/i686/fpu/e_log.S new file mode 100644 index 000000000..73060b088 --- /dev/null +++ b/libc/sysdeps/i386/i686/fpu/e_log.S @@ -0,0 +1,29 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for i686 instructions. + */ + +#include <machine/asm.h> + + + .text +ENTRY(__ieee754_log) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + fucomi %st + jp 3f + fyl2x // log(x) + ret + +3: fstp %st(1) + ret +END (__ieee754_log) + +ENTRY(__log_finite) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + fyl2x // log(x) + ret +END(__log_finite) diff --git a/libc/sysdeps/i386/i686/fpu/e_logf.S b/libc/sysdeps/i386/i686/fpu/e_logf.S new file mode 100644 index 000000000..6fd39d50d --- /dev/null +++ b/libc/sysdeps/i386/i686/fpu/e_logf.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. 
+ * Adapted for float by Ulrich Drepper <drepper@cygnus.com>. + * + * Adapted for i686 instructions. + */ + +#include <machine/asm.h> + + + .text +ENTRY(__ieee754_logf) + fldln2 // log(2) + flds 4(%esp) // x : log(2) + fucomi %st + jp 3f + fyl2x // log(x) + ret + +3: fstp %st(1) + ret +END (__ieee754_logf) + +ENTRY(__logf_finite) + fldln2 // log(2) + flds 4(%esp) // x : log(2) + fyl2x // log(x) + ret +END(__logf_finite) diff --git a/libc/sysdeps/i386/i686/fpu/e_logl.S b/libc/sysdeps/i386/i686/fpu/e_logl.S new file mode 100644 index 000000000..4e79a5a4b --- /dev/null +++ b/libc/sysdeps/i386/i686/fpu/e_logl.S @@ -0,0 +1,81 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for i686 instructions. + */ + +#include <machine/asm.h> + +#ifdef __ELF__ + .section .rodata.cst8,"aM",@progbits,8 +#else + .text +#endif + .p2align 3 + ASM_TYPE_DIRECTIVE(one,@object) +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + ASM_TYPE_DIRECTIVE(limit,@object) +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logl) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) + fucomi %st + jp 3f // x unordered with itself (NaN): return x unchanged +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fldl MO(limit) // 0.29 : |x-1| : x-1 : x : log(2) + fcomip %st(1) // |x-1| : x-1 : x : log(2) + fstp %st(0) // x-1 : x : log(2) + jc 2f // 0.29 < |x-1|: outside the safe range for fyl2xp1 + fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: fstp %st(1) + ret +END (__ieee754_logl) + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fldl MO(limit) // 0.29 : |x-1| : x-1 : x : log(2) + fcomip %st(1) // |x-1| : x-1 : x : log(2) + fstp %st(0) // x-1 : x : log(2) + jc 2b // 0.29 < |x-1|: reuse the fyl2x tail (label 2) of __ieee754_logl + fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logl_finite) diff --git a/libc/sysdeps/i386/i686/multiarch/Makefile b/libc/sysdeps/i386/i686/multiarch/Makefile index c89ae9247..5f1853877 100644 --- a/libc/sysdeps/i386/i686/multiarch/Makefile +++ b/libc/sysdeps/i386/i686/multiarch/Makefile @@ -15,7 +15,11 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ - wcscmp-sse2 wcscmp-c + wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \ + memrchr-sse2 memrchr-sse2-bsf memrchr-c \ + rawmemchr-sse2 rawmemchr-sse2-bsf \ + strnlen-sse2 strnlen-c wcslen-sse2 wcslen-c \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c 
strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S new file mode 100644 index 000000000..115a2192a --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S @@ -0,0 +1,497 @@ +/* Optimized memchr with sse2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2_bsf +# endif + + .text +ENTRY (MEMCHR) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null_1) +# endif + mov %ecx, %eax + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%eax), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %ecx + test %ecx, %ecx + je L(unaligned_no_match_1) +/* Check which byte is a match. */ + bsf %ecx, %ecx + +# ifndef USE_AS_RAWMEMCHR + sub %ecx, %edx + jbe L(return_null_1) +# endif + add %ecx, %eax + ret + + .p2align 4 +L(unaligned_no_match_1): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + jbe L(return_null_1) + PUSH (%edi) + lea 16(%eax), %edi + and $15, %eax + and $-16, %edi + add %eax, %edx +# else + lea 16(%eax), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(return_null_1): + xor %eax, %eax + ret + +# ifndef USE_AS_RAWMEMCHR + CFI_POP (%edi) +# endif + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + +# ifndef USE_AS_RAWMEMCHR + PUSH (%edi) + mov %eax, %edi + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 +# else + mov %eax, %edx + and $15, %ecx + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. 
*/ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + sub %eax, %edx + jbe L(return_null) + add %edi, %eax + add %ecx, %eax + RETURN +# else + add %edx, %eax + add %ecx, %eax + ret +# endif + + .p2align 4 +L(unaligned_no_match): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + add %ecx, %edx + jle L(return_null) + add $16, %edi +# else + add $16, %edx +# endif + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + test $0x3f, %edi +# else + test $0x3f, %edx +# endif + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz 
L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm3 +# else + movdqa 48(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + + pcmpeqb %xmm1, %xmm3 + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + lea 48(%edi, %eax), %eax + RETURN +# else + lea 48(%edx, %eax), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%edi), %xmm3 
+ pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_loop_32): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 16(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + RETURN +# endif + .p2align 4 +L(matches0): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea -16(%eax, %edi), %eax + RETURN +# else + lea -16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + add %edi, %eax + RETURN +# else + add %edx, %eax + ret +# endif + + .p2align 4 +L(matches16): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 16(%eax, %edi), %eax + RETURN +# else + lea 16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches32): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 32(%eax, %edi), %eax + RETURN +# else + lea 32(%eax, %edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + add %edi, %eax + RETURN + + .p2align 4 +L(matches16_1): + sub $16, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 16(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches32_1): + sub $32, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 32(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches48_1): + sub $48, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 48(%edi, %eax), %eax + RETURN +# endif + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S new file mode 100644 index 
000000000..63d1d5d7b --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memchr-sse2.S @@ -0,0 +1,706 @@ +/* Optimized memchr with sse2 without bsf + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef USE_AS_RAWMEMCHR +# define ENTRANCE PUSH(%edi); +# define PARMS 8 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# else +# define ENTRANCE +# define PARMS 4 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif + + atom_text_section +ENTRY (MEMCHR) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) +# endif + + punpcklbw %xmm1, %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov %ecx, %edi +# else + mov %ecx, %edx +# endif + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + +# ifndef USE_AS_RAWMEMCHR + movdqu (%edi), %xmm0 +# else + movdqu (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx +# else + jnz L(match_case1_prolog) + lea 16(%edx), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx +# ifndef USE_AS_RAWMEMCHR + and $-16, %edi + movdqa (%edi), %xmm0 +# else + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog1) + lea -16(%edx), %edx + add %ecx, %edx + jle L(return_null) + lea 16(%edi), %edi +# else + jnz L(match_case1_prolog1) + lea 16(%edx), %edx +# endif + + .p2align 4 +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub 
$64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 +# else + lea 64(%edx), %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + lea 64(%edx), %edx + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + 
movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): +# ifndef USE_AS_RAWMEMCHR + add %ecx, %edi +# else +L(match_case1_prolog1): + add %ecx, %edx +L(match_case1_prolog): +# endif + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz L(ExitCase1_1) + test $0x02, %al + jnz L(ExitCase1_2) + test $0x04, %al + jnz L(ExitCase1_3) +# ifndef USE_AS_RAWMEMCHR + lea 3(%edi), %eax + RETURN +# else + lea 3(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(ExitCase1_5) + test $0x20, %al + jnz L(ExitCase1_6) + test $0x40, %al + jnz L(ExitCase1_7) +# ifndef USE_AS_RAWMEMCHR + lea 7(%edi), %eax + RETURN +# else + lea 7(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(ExitCase1_9) + test $0x02, %ah + jnz L(ExitCase1_10) + test $0x04, %ah + jnz L(ExitCase1_11) +# ifndef 
USE_AS_RAWMEMCHR + lea 11(%edi), %eax + RETURN +# else + lea 11(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(ExitCase1_13) + test $0x20, %ah + jnz L(ExitCase1_14) + test $0x40, %ah + jnz L(ExitCase1_15) +# ifndef USE_AS_RAWMEMCHR + lea 15(%edi), %eax + RETURN +# else + lea 15(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN +# endif + + .p2align 4 +L(ExitCase1_1): +# ifndef USE_AS_RAWMEMCHR + mov %edi, %eax + RETURN +# else + mov %edx, %eax + ret +# endif + + .p2align 4 +L(ExitCase1_2): +# ifndef USE_AS_RAWMEMCHR + lea 1(%edi), %eax + RETURN +# else + lea 1(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_3): +# ifndef USE_AS_RAWMEMCHR + lea 2(%edi), %eax + RETURN +# else + lea 2(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_5): +# ifndef USE_AS_RAWMEMCHR + lea 4(%edi), %eax + RETURN +# else + lea 4(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_6): +# ifndef USE_AS_RAWMEMCHR + lea 5(%edi), %eax + RETURN +# else + lea 5(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_7): +# ifndef USE_AS_RAWMEMCHR + lea 6(%edi), %eax + RETURN +# else + lea 6(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_9): +# ifndef USE_AS_RAWMEMCHR + lea 8(%edi), %eax + RETURN +# else + lea 8(%edx), %eax + ret +# endif + + .p2align 
4 +L(ExitCase1_10): +# ifndef USE_AS_RAWMEMCHR + lea 9(%edi), %eax + RETURN +# else + lea 9(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_11): +# ifndef USE_AS_RAWMEMCHR + lea 10(%edi), %eax + RETURN +# else + lea 10(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_13): +# ifndef USE_AS_RAWMEMCHR + lea 12(%edi), %eax + RETURN +# else + lea 12(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_14): +# ifndef USE_AS_RAWMEMCHR + lea 13(%edi), %eax + RETURN +# else + lea 13(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_15): +# ifndef USE_AS_RAWMEMCHR + lea 14(%edi), %eax + RETURN +# else + lea 14(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) + test $0x01, %al + jnz L(ExitCase2_1) + test $0x02, %al + jnz L(ExitCase2_2) + test $0x04, %al + jnz L(ExitCase2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(ExitCase2_5) + test $0x20, %al + jnz L(ExitCase2_6) + test $0x40, %al + jnz L(ExitCase2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(ExitCase2_9) + test $0x02, %ah + jnz L(ExitCase2_10) + test $0x04, %ah + jnz L(ExitCase2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(ExitCase2_13) + test $0x20, %ah + jnz L(ExitCase2_14) + test $0x40, %ah + jnz L(ExitCase2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(ExitCase2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_3): + sub $3, %edx + jb 
L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN +# endif + + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memchr.S b/libc/sysdeps/i386/i686/multiarch/memchr.S new file mode 100644 index 000000000..163a83e17 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memchr.S @@ -0,0 +1,99 @@ +/* Multiple versions of memchr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: /* PC thunk: copies the return address found at the top of the stack into %ebx and returns. */ + movl (%esp), %ebx + ret + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + + .text +ENTRY(__memchr) /* Resolver for memchr: the symbol is typed @gnu_indirect_function and leaves the address of the selected implementation in %eax. */ + .type __memchr, @gnu_indirect_function + pushl %ebx + CFI_PUSH (%ebx) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx /* %ebx now serves as the base for the @GOTOFF references below. */ + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f /* Field at KIND_OFFSET is already non-zero: skip the __init_cpu_features call. */ + call __init_cpu_features + +1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f /* SSE2 bit clear: return __memchr_ia32. */ + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx) + jz 3f /* Slow_BSF bit clear: return __memchr_sse2_bsf. */ + + leal __memchr_sse2@GOTOFF(%ebx), %eax /* SSE2 and Slow_BSF both set: return __memchr_sse2. */ + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) /* Labels 2 and 3 are reached with %ebx still pushed, so the CFI state undone before the ret above is declared again. */ + +2: leal __memchr_ia32@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) + +3: leal __memchr_sse2_bsf@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret +END(__memchr) + +weak_alias(__memchr, memchr) + +# undef ENTRY /* From here on ENTRY and END ignore their argument and always emit the symbol __memchr_ia32; presumably this renames the function defined by the file included at the bottom. */ +# define ENTRY(name) \ + .type __memchr_ia32, @function; \ + .globl __memchr_ia32; \ + .p2align 4; \ + __memchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memchr_ia32, .-__memchr_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memchr; __GI_memchr = __memchr_ia32 + +#endif +#include "../../memchr.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S index b1ed778f1..1f5dbc15c 100644 --- a/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/libc/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.2 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,84 +20,97 @@ #ifndef NOT_IN_libc -#include <sysdep.h> -#include "asm-syntax.h" +# include <sysdep.h> -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_2 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) -#ifdef SHARED -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. 
*/ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table and adjuested EDX/ESI. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define JMPTBL(I, B) I + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. 
+*/ .section .text.sse4.2,"ax",@progbits ENTRY (MEMCMP) movl BLK1(%esp), %eax movl BLK2(%esp), %edx movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else cmp $1, %ecx jbe L(less1bytes) +# endif + pxor %xmm0, %xmm0 cmp $64, %ecx ja L(64bytesormore) cmp $8, %ecx - PUSH (%ebx) + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else jb L(less8bytes) + PUSH (%ebx) +# endif + add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less8bytes): mov (%eax), %bl cmpb (%edx), %bl @@ -141,22 +154,49 @@ L(less8bytes): mov 6(%eax), %bl cmpb 6(%edx), %bl je L(0bytes) + L(nonzero): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(above) neg %eax L(above): ret CFI_PUSH (%ebx) +# endif - ALIGN (4) + .p2align 4 L(0bytes): - POP (%ebx) + POP (%ebx) xor %eax, %eax ret - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(0bytesend) movzbl (%eax), %eax @@ -164,14 +204,14 @@ L(less1bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(0bytesend): xor %eax, %eax ret - - ALIGN (4) +# endif + .p2align 4 L(64bytesormore): - PUSH (%ebx) + PUSH (%ebx) mov %ecx, %ebx mov $64, %ecx sub $64, %ebx @@ -208,7 +248,14 @@ L(64bytesormore_loop): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 L(find_16diff): sub $16, %ecx L(find_32diff): @@ -218,9 +265,9 @@ L(find_48diff): L(find_64diff): add %ecx, %edx add %ecx, %eax - jmp L(16bytes) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 
L(16bytes): mov -16(%eax), %ecx mov -16(%edx), %ebx @@ -243,8 +290,30 @@ L(4bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(49bytes): movdqu -49(%eax), %xmm1 movdqu -49(%edx), %xmm2 @@ -285,7 +354,7 @@ L(5bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(50bytes): mov $-50, %ebx movdqu -50(%eax), %xmm1 @@ -330,7 +399,7 @@ L(2bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(51bytes): mov $-51, %ebx movdqu -51(%eax), %xmm1 @@ -378,8 +447,8 @@ L(1bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(52bytes): movdqu -52(%eax), %xmm1 movdqu -52(%edx), %xmm2 @@ -402,13 +471,18 @@ L(20bytes): ptest %xmm2, %xmm0 jnc L(less16bytes) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(53bytes): movdqu -53(%eax), %xmm1 movdqu -53(%edx), %xmm2 @@ -440,7 +514,7 @@ L(21bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(54bytes): movdqu -54(%eax), %xmm1 movdqu -54(%edx), %xmm2 @@ -476,7 +550,7 @@ L(22bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(55bytes): movdqu -55(%eax), %xmm1 movdqu -55(%edx), %xmm2 @@ -513,8 +587,8 @@ L(23bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(56bytes): movdqu -56(%eax), %xmm1 movdqu -56(%edx), %xmm2 @@ -538,18 +612,27 @@ L(24bytes): jnc L(less16bytes) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), 
%ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(57bytes): movdqu -57(%eax), %xmm1 movdqu -57(%edx), %xmm2 @@ -585,7 +668,7 @@ L(25bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(58bytes): movdqu -58(%eax), %xmm1 movdqu -58(%edx), %xmm2 @@ -627,7 +710,7 @@ L(26bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(59bytes): movdqu -59(%eax), %xmm1 movdqu -59(%edx), %xmm2 @@ -668,8 +751,8 @@ L(27bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(60bytes): movdqu -60(%eax), %xmm1 movdqu -60(%edx), %xmm2 @@ -691,22 +774,38 @@ L(28bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif jne L(find_diff) + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(61bytes): movdqu -61(%eax), %xmm1 movdqu -61(%edx), %xmm2 @@ -749,7 +848,7 @@ L(29bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(62bytes): movdqu -62(%eax), %xmm1 movdqu -62(%edx), %xmm2 @@ -792,7 +891,7 @@ L(30bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(63bytes): movdqu -63(%eax), %xmm1 movdqu -63(%edx), %xmm2 @@ -838,8 +937,9 @@ L(31bytes): mov $0, %eax jne L(end) RETURN +# endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%eax), %xmm1 movdqu -64(%edx), %xmm2 @@ -863,28 +963,45 @@ L(32bytes): jnc L(less16bytes) mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -16(%edx), %ebx cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif jne L(find_diff) mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif 
jne L(find_diff) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less16bytes): add %ebx, %eax add %ebx, %edx @@ -910,9 +1027,35 @@ L(less16bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) + .p2align 4 L(find_diff): +# ifndef USE_AS_WMEMCMP cmpb %bl, %cl jne L(end) cmp %bx, %cx @@ -923,17 +1066,29 @@ L(find_diff): jne L(end) cmp %bx, %cx L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif END (MEMCMP) .section .rodata.sse4.2,"a",@progbits - ALIGN (2) + .p2align 2 .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1000,5 +1155,72 @@ L(table_64bytes): .int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes)) - .size L(table_64bytes), .-L(table_64bytes) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int 
JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), 
L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif #endif diff --git a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index 2e0d15fe5..eab85c1de 100644 --- a/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/libc/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ -/* memcmp with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -20,47 +20,64 @@ #ifndef NOT_IN_libc -#include <sysdep.h> -#include "asm-syntax.h" +# include <sysdep.h> -#ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. 
+*/ - .section .text.ssse3,"ax",@progbits + atom_text_section ENTRY (MEMCMP) movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + movl BLK1(%esp), %eax cmp $48, %ecx movl BLK2(%esp), %edx jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP cmp $1, %ecx jbe L(less1bytes) - PUSH (%ebx) +# endif + + PUSH (%ebx) add %ecx, %edx add %ecx, %eax jmp L(less48bytes) - ALIGN (4) - CFI_POP (%ebx) + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(zero) movb (%eax), %cl @@ -71,29 +88,30 @@ L(less1bytes): neg %eax L(1bytesend): ret +# endif - ALIGN (4) + .p2align 4 L(zero): - mov $0, %eax + xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) cfi_remember_state - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 movl %eax, %edi movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx lea 16(%edi), %edi - sub $0xffff, %edx + sub $0xffff, %edx lea 16(%esi), %esi - jnz L(less16bytes) + jnz L(less16bytes) mov %edi, %edx and $0xf, %edx xor %edx, %edi @@ -104,6 +122,7 @@ L(48bytesormore): jz L(shr_0) xor %edx, %esi +# ifndef USE_AS_WMEMCMP cmp $8, %edx jae L(next_unaligned_table) cmp $0, %edx @@ -122,7 +141,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (4) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -139,8 +158,17 @@ L(next_unaligned_table): cmp $14, %edx je L(shr_14) jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %ecx jae L(shr_0_gobble) @@ -159,13 +187,13 @@ L(shr_0): lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_0_gobble): lea 
-48(%ecx), %ecx movdqa (%esi), %xmm0 @@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next): jnz L(exit) lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %ecx lea -48(%ecx), %ecx @@ -235,13 +264,13 @@ L(shr_1): jnz L(exit) lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -288,14 +317,14 @@ L(shr_1_gobble_next): lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %ecx lea -48(%ecx), %ecx @@ -319,13 +348,13 @@ L(shr_2): jnz L(exit) lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -372,13 +401,13 @@ L(shr_2_gobble_next): lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %ecx lea -48(%ecx), %ecx @@ -402,13 +431,13 @@ L(shr_3): jnz L(exit) lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -455,13 +484,14 @@ L(shr_3_gobble_next): lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, 
%ecx lea -48(%ecx), %ecx @@ -485,13 +515,13 @@ L(shr_4): jnz L(exit) lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -538,13 +568,14 @@ L(shr_4_gobble_next): lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %ecx lea -48(%ecx), %ecx @@ -568,13 +599,13 @@ L(shr_5): jnz L(exit) lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -621,13 +652,13 @@ L(shr_5_gobble_next): lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %ecx lea -48(%ecx), %ecx @@ -651,13 +682,13 @@ L(shr_6): jnz L(exit) lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -704,13 +735,13 @@ L(shr_6_gobble_next): lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %ecx lea -48(%ecx), %ecx @@ -734,13 +765,13 @@ L(shr_7): jnz L(exit) lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-787,13 +818,14 @@ L(shr_7_gobble_next): lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %ecx lea -48(%ecx), %ecx @@ -817,13 +849,13 @@ L(shr_8): jnz L(exit) lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -870,13 +902,14 @@ L(shr_8_gobble_next): lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %ecx lea -48(%ecx), %ecx @@ -900,13 +933,13 @@ L(shr_9): jnz L(exit) lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -953,13 +986,13 @@ L(shr_9_gobble_next): lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %ecx lea -48(%ecx), %ecx @@ -983,13 +1016,13 @@ L(shr_10): jnz L(exit) lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1036,13 +1069,13 @@ L(shr_10_gobble_next): lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1066,13 +1099,13 @@ 
L(shr_11): jnz L(exit) lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1119,13 +1152,14 @@ L(shr_11_gobble_next): lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1149,13 +1183,13 @@ L(shr_12): jnz L(exit) lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1202,13 +1236,14 @@ L(shr_12_gobble_next): lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1232,13 +1267,13 @@ L(shr_13): jnz L(exit) lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1285,13 +1320,13 @@ L(shr_13_gobble_next): lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1315,13 +1350,13 @@ L(shr_14): jnz L(exit) lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ 
-1368,13 +1403,13 @@ L(shr_14_gobble_next): lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1398,13 +1433,13 @@ L(shr_15): jnz L(exit) lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1451,13 +1486,14 @@ L(shr_15_gobble_next): lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %ebx sub $0xffff, %ebx @@ -1465,9 +1501,12 @@ L(exit): lea -16(%esi), %esi lea -16(%edi), %edi mov %ebx, %edx + L(first16bytes): add %eax, %esi L(less16bytes): + +# ifndef USE_AS_WMEMCMP test %dl, %dl jz L(next_24_bytes) @@ -1492,61 +1531,61 @@ L(less16bytes): test $0x40, %dl jnz L(Byte22) L(Byte23): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte16): - movzbl -16(%edi), %eax - movzbl -16(%esi), %edx + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte17): - movzbl -15(%edi), %eax - movzbl -15(%esi), %edx + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte18): - movzbl -14(%edi), %eax - movzbl -14(%esi), %edx + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte19): - movzbl -13(%edi), %eax - movzbl -13(%esi), %edx + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte20): - movzbl -12(%edi), %eax - movzbl -12(%esi), %edx + movzbl -12(%edi), %eax + movzbl 
-12(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte21): - movzbl -11(%edi), %eax - movzbl -11(%esi), %edx + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte22): - movzbl -10(%edi), %eax - movzbl -10(%esi), %edx + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%edi), %edi lea 8(%esi), %esi @@ -1571,20 +1610,69 @@ L(next_24_bytes): test $0x40, %dh jnz L(Byte22) - ALIGN (4) + .p2align 4 L(Byte31): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif CFI_PUSH (%ebx) - ALIGN (4) + + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) cmp $8, %ecx je L(8bytes) +# ifndef USE_AS_WMEMCMP cmp $9, %ecx je L(9bytes) cmp $10, %ecx @@ -1598,13 +1686,17 @@ L(more8bytes): cmp $14, %ecx je L(14bytes) jmp L(15bytes) +# else + jmp L(12bytes) +# endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) cmp $16, %ecx je L(16bytes) +# ifndef USE_AS_WMEMCMP cmp $17, %ecx je L(17bytes) cmp $18, %ecx @@ -1618,13 +1710,17 @@ L(more16bytes): cmp $22, %ecx je L(22bytes) jmp L(23bytes) +# else + jmp L(20bytes) +# endif - ALIGN (4) + .p2align 4 
L(more24bytes): cmp $32, %ecx jae L(more32bytes) cmp $24, %ecx je L(24bytes) +# ifndef USE_AS_WMEMCMP cmp $25, %ecx je L(25bytes) cmp $26, %ecx @@ -1638,13 +1734,17 @@ L(more24bytes): cmp $30, %ecx je L(30bytes) jmp L(31bytes) +# else + jmp L(28bytes) +# endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) cmp $32, %ecx je L(32bytes) +# ifndef USE_AS_WMEMCMP cmp $33, %ecx je L(33bytes) cmp $34, %ecx @@ -1658,11 +1758,35 @@ L(more32bytes): cmp $38, %ecx je L(38bytes) jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je L(40bytes) +# ifndef USE_AS_WMEMCMP cmp $41, %ecx je L(41bytes) cmp $42, %ecx @@ -1677,23 +1801,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) - - ALIGN (4) + .p2align 4 L(44bytes): mov -44(%eax), %ecx mov -44(%edx), %ebx @@ -1750,11 +1858,64 @@ L(4bytes): cmp %ebx, %ecx mov $0, %eax jne L(find_diff) - POP (%ebx) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp 
-20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) ret CFI_PUSH (%ebx) +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + + .p2align 4 L(45bytes): mov -45(%eax), %ecx mov -45(%edx), %ebx @@ -1814,11 +1975,11 @@ L(5bytes): cmp -1(%edx), %cl mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(46bytes): mov -46(%eax), %ecx mov -46(%edx), %ebx @@ -1882,11 +2043,11 @@ L(2bytes): cmp %bh, %ch mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%eax), %ecx movl -47(%edx), %ebx @@ -1953,11 +2114,11 @@ L(3bytes): cmpb -1(%edx), %al mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(find_diff): cmpb %bl, %cl jne L(end) @@ -1968,14 +2129,30 @@ L(find_diff): cmp %bl, %cl jne L(end) cmp %bx, %cx + + .p2align 4 L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else -END (MEMCMP) +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) #endif diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index f64f8d214..26471fc0e 100644 --- a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -1,5 +1,5 @@ /* memcpy with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -235,7 +235,7 @@ L(shl_0_end): add %edi, %edx add %edi, %eax POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) CFI_PUSH (%edi) L(shl_0_gobble): @@ -385,7 +385,7 @@ L(shl_0_mem_less_32bytes): L(shl_0_mem_less_16bytes): add %ecx, %edx add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) cfi_restore_state cfi_remember_state @@ -1065,38 +1065,48 @@ L(shl_15_end): ALIGN (4) L(fwd_write_44bytes): - movl -44(%eax), %ecx - movl %ecx, -44(%edx) -L(fwd_write_40bytes): - movl -40(%eax), %ecx - movl %ecx, -40(%edx) + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) L(fwd_write_36bytes): - movl -36(%eax), %ecx - movl %ecx, -36(%edx) -L(fwd_write_32bytes): - movl -32(%eax), %ecx - movl %ecx, -32(%edx) + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) L(fwd_write_28bytes): - movl -28(%eax), %ecx - movl %ecx, -28(%edx) -L(fwd_write_24bytes): - movl -24(%eax), %ecx - movl %ecx, -24(%edx) + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) L(fwd_write_20bytes): - movl -20(%eax), %ecx - movl %ecx, -20(%edx) -L(fwd_write_16bytes): - movl -16(%eax), %ecx - movl %ecx, -16(%edx) + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) L(fwd_write_12bytes): - movl -12(%eax), %ecx - movl %ecx, -12(%edx) -L(fwd_write_8bytes): - movl -8(%eax), %ecx - movl %ecx, -8(%edx) + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) L(fwd_write_4bytes): movl -4(%eax), %ecx movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, 
-8(%edx) L(fwd_write_0bytes): #ifndef USE_AS_BCOPY # ifdef USE_AS_MEMPCPY @@ -1124,37 +1134,49 @@ L(fwd_write_5bytes): ALIGN (4) L(fwd_write_45bytes): - movl -45(%eax), %ecx - movl %ecx, -45(%edx) -L(fwd_write_41bytes): - movl -41(%eax), %ecx - movl %ecx, -41(%edx) + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) L(fwd_write_37bytes): - movl -37(%eax), %ecx - movl %ecx, -37(%edx) -L(fwd_write_33bytes): - movl -33(%eax), %ecx - movl %ecx, -33(%edx) + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) L(fwd_write_29bytes): - movl -29(%eax), %ecx - movl %ecx, -29(%edx) -L(fwd_write_25bytes): - movl -25(%eax), %ecx - movl %ecx, -25(%edx) + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) L(fwd_write_21bytes): - movl -21(%eax), %ecx - movl %ecx, -21(%edx) -L(fwd_write_17bytes): - movl -17(%eax), %ecx - movl %ecx, -17(%edx) + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) L(fwd_write_13bytes): - movl -13(%eax), %ecx - movl %ecx, -13(%edx) -L(fwd_write_9bytes): - movl -9(%eax), %ecx - movl %ecx, -9(%edx) + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) movl -5(%eax), %ecx movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) L(fwd_write_1bytes): movzbl -1(%eax), %ecx movb %cl, -1(%edx) @@ -1169,38 +1191,50 @@ L(fwd_write_1bytes): ALIGN (4) L(fwd_write_46bytes): - movl -46(%eax), %ecx - movl %ecx, -46(%edx) -L(fwd_write_42bytes): - movl -42(%eax), %ecx - movl %ecx, -42(%edx) + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) L(fwd_write_38bytes): - movl -38(%eax), %ecx - movl %ecx, -38(%edx) 
-L(fwd_write_34bytes): - movl -34(%eax), %ecx - movl %ecx, -34(%edx) + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) L(fwd_write_30bytes): - movl -30(%eax), %ecx - movl %ecx, -30(%edx) -L(fwd_write_26bytes): - movl -26(%eax), %ecx - movl %ecx, -26(%edx) + movq -30(%eax), %xmm0 + movq %xmm0, -30(%edx) L(fwd_write_22bytes): - movl -22(%eax), %ecx - movl %ecx, -22(%edx) -L(fwd_write_18bytes): - movl -18(%eax), %ecx - movl %ecx, -18(%edx) + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) L(fwd_write_14bytes): - movl -14(%eax), %ecx - movl %ecx, -14(%edx) -L(fwd_write_10bytes): - movl -10(%eax), %ecx - movl %ecx, -10(%edx) + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) L(fwd_write_6bytes): movl -6(%eax), %ecx movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) L(fwd_write_2bytes): movzwl -2(%eax), %ecx movw %cx, -2(%edx) @@ -1215,38 +1249,52 @@ L(fwd_write_2bytes): ALIGN (4) L(fwd_write_47bytes): - movl -47(%eax), %ecx - movl %ecx, -47(%edx) -L(fwd_write_43bytes): - movl -43(%eax), %ecx - movl %ecx, -43(%edx) + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) L(fwd_write_39bytes): - movl -39(%eax), %ecx - movl %ecx, -39(%edx) -L(fwd_write_35bytes): - movl -35(%eax), %ecx - movl %ecx, -35(%edx) + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) L(fwd_write_31bytes): - movl -31(%eax), %ecx - movl %ecx, -31(%edx) -L(fwd_write_27bytes): - movl -27(%eax), %ecx - movl %ecx, -27(%edx) + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) L(fwd_write_23bytes): - movl -23(%eax), %ecx - movl %ecx, 
-23(%edx) -L(fwd_write_19bytes): - movl -19(%eax), %ecx - movl %ecx, -19(%edx) + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) L(fwd_write_15bytes): - movl -15(%eax), %ecx - movl %ecx, -15(%edx) -L(fwd_write_11bytes): - movl -11(%eax), %ecx - movl %ecx, -11(%edx) + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) L(fwd_write_7bytes): movl -7(%eax), %ecx movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) L(fwd_write_3bytes): movzwl -3(%eax), %ecx movzbl -1(%eax), %eax @@ -1259,6 +1307,356 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif + RETURN + + ALIGN (4) +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, 
%eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), 
%ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + movdqa %xmm0, 
-39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif 
+#endif RETURN_END cfi_restore_state @@ -1330,35 +1728,20 @@ L(large_page_less_32bytes): ALIGN (4) L(bk_write_44bytes): - movl 40(%eax), %ecx - movl %ecx, 40(%edx) -L(bk_write_40bytes): - movl 36(%eax), %ecx - movl %ecx, 36(%edx) + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) L(bk_write_36bytes): - movl 32(%eax), %ecx - movl %ecx, 32(%edx) -L(bk_write_32bytes): - movl 28(%eax), %ecx - movl %ecx, 28(%edx) + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) L(bk_write_28bytes): - movl 24(%eax), %ecx - movl %ecx, 24(%edx) -L(bk_write_24bytes): - movl 20(%eax), %ecx - movl %ecx, 20(%edx) + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) L(bk_write_20bytes): - movl 16(%eax), %ecx - movl %ecx, 16(%edx) -L(bk_write_16bytes): - movl 12(%eax), %ecx - movl %ecx, 12(%edx) + movq 12(%eax), %xmm0 + movq %xmm0, 12(%edx) L(bk_write_12bytes): - movl 8(%eax), %ecx - movl %ecx, 8(%edx) -L(bk_write_8bytes): - movl 4(%eax), %ecx - movl %ecx, 4(%edx) + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) L(bk_write_4bytes): movl (%eax), %ecx movl %ecx, (%edx) @@ -1373,36 +1756,46 @@ L(bk_write_0bytes): RETURN ALIGN (4) +L(bk_write_40bytes): + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) L(bk_write_45bytes): - movl 41(%eax), %ecx - movl %ecx, 41(%edx) -L(bk_write_41bytes): - movl 37(%eax), %ecx - movl %ecx, 37(%edx) + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) L(bk_write_37bytes): - movl 33(%eax), %ecx - movl %ecx, 33(%edx) -L(bk_write_33bytes): - movl 29(%eax), %ecx - movl %ecx, 29(%edx) + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) L(bk_write_29bytes): - movl 25(%eax), %ecx - movl %ecx, 25(%edx) 
-L(bk_write_25bytes): - movl 21(%eax), %ecx - movl %ecx, 21(%edx) + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) L(bk_write_21bytes): - movl 17(%eax), %ecx - movl %ecx, 17(%edx) -L(bk_write_17bytes): - movl 13(%eax), %ecx - movl %ecx, 13(%edx) + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) L(bk_write_13bytes): - movl 9(%eax), %ecx - movl %ecx, 9(%edx) -L(bk_write_9bytes): - movl 5(%eax), %ecx - movl %ecx, 5(%edx) + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) L(bk_write_5bytes): movl 1(%eax), %ecx movl %ecx, 1(%edx) @@ -1419,39 +1812,78 @@ L(bk_write_1bytes): RETURN ALIGN (4) +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0, 33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) +L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) L(bk_write_46bytes): - movl 42(%eax), %ecx - movl %ecx, 42(%edx) -L(bk_write_42bytes): - movl 38(%eax), %ecx - movl %ecx, 38(%edx) + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) L(bk_write_38bytes): - movl 34(%eax), %ecx - movl %ecx, 34(%edx) -L(bk_write_34bytes): - movl 30(%eax), %ecx - movl %ecx, 30(%edx) + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) L(bk_write_30bytes): - movl 26(%eax), %ecx - movl %ecx, 26(%edx) -L(bk_write_26bytes): - movl 22(%eax), %ecx - movl %ecx, 22(%edx) + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) L(bk_write_22bytes): - movl 18(%eax), %ecx - movl %ecx, 18(%edx) -L(bk_write_18bytes): - movl 14(%eax), %ecx - movl %ecx, 14(%edx) + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) L(bk_write_14bytes): - movl 10(%eax), %ecx - movl %ecx, 10(%edx) -L(bk_write_10bytes): - movl 6(%eax), %ecx - movl %ecx, 6(%edx) + movq 6(%eax), %xmm0 + movq %xmm0, 6(%edx) 
L(bk_write_6bytes): movl 2(%eax), %ecx movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) L(bk_write_2bytes): movzwl (%eax), %ecx movw %cx, (%edx) @@ -1466,38 +1898,52 @@ L(bk_write_2bytes): ALIGN (4) L(bk_write_47bytes): - movl 43(%eax), %ecx - movl %ecx, 43(%edx) -L(bk_write_43bytes): - movl 39(%eax), %ecx - movl %ecx, 39(%edx) + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) L(bk_write_39bytes): - movl 35(%eax), %ecx - movl %ecx, 35(%edx) -L(bk_write_35bytes): - movl 31(%eax), %ecx - movl %ecx, 31(%edx) + movq 31(%eax), %xmm0 + movq %xmm0, 31(%edx) L(bk_write_31bytes): - movl 27(%eax), %ecx - movl %ecx, 27(%edx) -L(bk_write_27bytes): - movl 23(%eax), %ecx - movl %ecx, 23(%edx) + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) L(bk_write_23bytes): - movl 19(%eax), %ecx - movl %ecx, 19(%edx) -L(bk_write_19bytes): - movl 15(%eax), %ecx - movl %ecx, 15(%edx) + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) L(bk_write_15bytes): - movl 11(%eax), %ecx - movl %ecx, 11(%edx) -L(bk_write_11bytes): - movl 7(%eax), %ecx - movl %ecx, 7(%edx) + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) L(bk_write_7bytes): movl 3(%eax), %ecx movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 
27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) L(bk_write_3bytes): movzwl 1(%eax), %ecx movw %cx, 1(%edx) @@ -1566,6 +2012,57 @@ L(table_48bytes_fwd): .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) ALIGN (2) +L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL 
(L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) + + ALIGN (2) L(shl_table): .int JMPTBL (L(shl_0), L(shl_table)) .int JMPTBL (L(shl_1), L(shl_table)) @@ -1658,22 +2155,14 @@ L(bk_write_64bytesless): 
L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ sub $32, %ecx - movl -4(%esi), %eax - movl %eax, -4(%edx) - movl -8(%esi), %eax - movl %eax, -8(%edx) - movl -12(%esi), %eax - movl %eax, -12(%edx) - movl -16(%esi), %eax - movl %eax, -16(%edx) - movl -20(%esi), %eax - movl %eax, -20(%edx) - movl -24(%esi), %eax - movl %eax, -24(%edx) - movl -28(%esi), %eax - movl %eax, -28(%edx) - movl -32(%esi), %eax - movl %eax, -32(%edx) + movq -8(%esi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%esi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%esi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%esi), %xmm0 + movq %xmm0, -32(%edx) sub $32, %edx sub $32, %esi diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-c.c b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c new file mode 100644 index 000000000..44ec1a6ed --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memrchr-c.c @@ -0,0 +1,7 @@ +#ifndef NOT_IN_libc +# define MEMRCHR __memrchr_ia32 +# include <string.h> +extern void *__memrchr_ia32 (const void *, int, size_t); +#endif + +#include "string/memrchr.c" diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 000000000..355d498e2 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,418 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + 
pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx 
+ jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S new file mode 100644 index 000000000..86a0cf961 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memrchr-sse2.S @@ -0,0 +1,725 @@ +/* Optimized memrchr with sse2 without bsf + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__memrchr_sse2) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + lea 16(%ecx), %ecx + lea 16(%edx), %edx + sub %eax, %edx + and $-16, %ecx + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(exit_dispatch) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + lea 64(%ecx), %ecx + lea 64(%edx), %edx + and $-64, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + 
jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches16): + lea 16(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32): + lea 32(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48): + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch): + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_8): + test $0x80, %al + jnz L(exit_8) + test $0x40, %al + jnz L(exit_7) + test $0x20, %al + jnz L(exit_6) + lea 4(%ecx), %eax + ret + + .p2align 4 
+L(exit_dispatch_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(exit_dispatch_high_8) + test $0x08, %ah + jnz L(exit_12) + test $0x04, %ah + jnz L(exit_11) + test $0x02, %ah + jnz L(exit_10) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high_8): + test $0x80, %ah + jnz L(exit_16) + test $0x40, %ah + jnz L(exit_15) + test $0x20, %ah + jnz L(exit_14) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_2): + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_3): + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_4): + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_6): + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_7): + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_8): + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_10): + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_11): + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_12): + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_14): + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_15): + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_16): + lea 15(%ecx), %eax + ret + + .p2align 4 +L(matches0_1): + lea -64(%edx), %edx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + lea -48(%edx), %edx + lea 16(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32_1): + lea -32(%edx), %edx + lea 32(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + 
jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48_1): + lea -16(%edx), %edx + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch_1): + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_1_8): + test $0x80, %al + jnz L(exit_1_8) + test $0x40, %al + jnz L(exit_1_7) + test $0x20, %al + jnz L(exit_1_6) + add $4, %edx + jl L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high): + mov %ah, %al + and $15 << 4, %al + jnz L(exit_dispatch_1_high_8) + test $0x08, %ah + jnz L(exit_1_12) + test $0x04, %ah + jnz L(exit_1_11) + test $0x02, %ah + jnz L(exit_1_10) + add $8, %edx + jl L(return_null) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high_8): + test $0x80, %ah + jnz L(exit_1_16) + test $0x40, %ah + jnz L(exit_1_15) + test $0x20, %ah + jnz L(exit_1_14) + add $12, %edx + jl L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_1_2): + add $1, %edx + jl L(return_null) + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_1_3): + add $2, %edx + jl L(return_null) + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_1_4): + add $3, %edx + jl L(return_null) + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_1_6): + add $5, %edx + jl L(return_null) + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_1_7): + add $6, %edx + jl L(return_null) + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_1_8): + add $7, %edx + jl L(return_null) + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_1_10): + add $9, %edx + jl L(return_null) + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_1_11): + add $10, %edx + jl L(return_null) + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_1_12): + add $11, %edx + jl L(return_null) + lea 11(%ecx), %eax + ret + + .p2align 4 
+L(exit_1_14): + add $13, %edx + jl L(return_null) + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_1_15): + add $14, %edx + jl L(return_null) + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_1_16): + add $15, %edx + jl L(return_null) + lea 15(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + mov %eax, %ecx + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jnz L(exit_dispatch) + + xor %eax, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + add $16, %edx + je L(return_null) + punpcklbw %xmm1, %xmm1 + + mov %ecx, %eax + pshufd $0, %xmm1, %xmm1 + + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (__memrchr_sse2) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memrchr.S b/libc/sysdeps/i386/i686/multiarch/memrchr.S new 
file mode 100644 index 000000000..8e5b2c50a --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memrchr.S @@ -0,0 +1,79 @@ +/* Multiple versions of memrchr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + + .text +ENTRY(__memrchr) + .type __memrchr, @gnu_indirect_function + pushl %ebx + CFI_PUSH (%ebx) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features + +1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx) + jz 3f + + leal __memrchr_sse2@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) + +2: leal __memrchr_ia32@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) + +3: leal __memrchr_sse2_bsf@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret +END(__memrchr) + +weak_alias(__memrchr, memrchr) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S new file mode 100644 index 000000000..88c0e5776 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2_bsf +#include "memchr-sse2-bsf.S" diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S new file mode 100644 index 000000000..038c74896 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2 +#include "memchr-sse2.S" diff --git a/libc/sysdeps/i386/i686/multiarch/rawmemchr.S 
b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S new file mode 100644 index 000000000..111f0dcf6 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/rawmemchr.S @@ -0,0 +1,99 @@ +/* Multiple versions of rawmemchr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + + .text +ENTRY(__rawmemchr) + .type __rawmemchr, @gnu_indirect_function + pushl %ebx + CFI_PUSH (%ebx) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features + +1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + testl $bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx) + jz 3f + + leal __rawmemchr_sse2@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) + +2: leal __rawmemchr_ia32@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret + + CFI_PUSH (%ebx) + +3: leal __rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax + popl %ebx + CFI_POP (%ebx) + ret +END(__rawmemchr) + +weak_alias(__rawmemchr, rawmemchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __rawmemchr_ia32, @function; \ + .globl __rawmemchr_ia32; \ + .p2align 4; \ + __rawmemchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32 + +# undef libc_hidden_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
*/ +# define libc_hidden_def(name) \ + .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32 + +#endif +#include "../../rawmemchr.S" diff --git a/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c new file mode 100644 index 000000000..1aa544064 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/rtld-strnlen.c @@ -0,0 +1 @@ +#include <string/strnlen.c> diff --git a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S index a73b21ecc..9cc5ae8d1 100644 --- a/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S +++ b/libc/sysdeps/i386/i686/multiarch/strchr-sse2.S @@ -40,7 +40,7 @@ # define STR1 PARMS # define STR2 STR1+4 - .text + atom_text_section ENTRY (__strchr_sse2) ENTRANCE diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S index 0dc651f01..ce50e0a33 100644 --- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S +++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S @@ -1,5 +1,5 @@ /* strlen with SSE2 and BSF - Copyright (C) 2010 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -21,7 +21,6 @@ #if defined SHARED && !defined NOT_IN_libc #include <sysdep.h> -#include "asm-syntax.h" #define CFI_PUSH(REG) \ cfi_adjust_cfa_offset (4); \ diff --git a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S index ca549bafc..91b6d799c 100644 --- a/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S +++ b/libc/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -18,31 +18,46 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
*/ -#if (defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc +/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ + +#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc + # ifndef USE_AS_STRCAT # include <sysdep.h> -# include "asm-syntax.h" +# define PARMS 4 +# define STR PARMS +# define RETURN ret -# define CFI_PUSH(REG) \ +# ifdef USE_AS_STRNLEN +# define LEN PARMS + 8 +# define CFI_PUSH(REG) \ cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -# define CFI_POP(REG) \ +# define CFI_POP(REG) \ cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) -# define PARMS 4 -# define STR PARMS -# define ENTRANCE -# define RETURN ret +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# undef RETURN +# define RETURN POP (%edi); CFI_PUSH(%edi); ret +# endif - .text -ENTRY (__strlen_sse2) - ENTRANCE +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + atom_text_section +ENTRY (STRLEN) mov STR(%esp), %edx +# ifdef USE_AS_STRNLEN + PUSH (%edi) + movl LEN(%esp), %edi + sub $4, %edi + jbe L(len_less4_prolog) +# endif # endif xor %eax, %eax cmpb $0, (%edx) @@ -53,6 +68,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail2) cmpb $0, 3(%edx) jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less8_prolog) +# endif + cmpb $0, 4(%edx) jz L(exit_tail4) cmpb $0, 5(%edx) @@ -61,6 +82,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail6) cmpb $0, 7(%edx) jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less12_prolog) +# endif + cmpb $0, 8(%edx) jz L(exit_tail8) cmpb $0, 9(%edx) @@ -69,6 +96,12 @@ ENTRY (__strlen_sse2) jz L(exit_tail10) cmpb $0, 11(%edx) jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less16_prolog) +# endif + cmpb $0, 12(%edx) jz L(exit_tail12) cmpb $0, 13(%edx) @@ -77,11 +110,18 
@@ ENTRY (__strlen_sse2) jz L(exit_tail14) cmpb $0, 15(%edx) jz L(exit_tail15) + pxor %xmm0, %xmm0 - mov %edx, %eax - lea 16(%edx), %ecx + lea 16(%edx), %eax + mov %eax, %ecx and $-16, %eax - add $16, %eax + +# ifdef USE_AS_STRNLEN + and $15, %edx + add %edx, %edi + sub $64, %edi + jbe L(len_less64) +# endif pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx @@ -97,7 +137,6 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) - pcmpeqb (%eax), %xmm2 pmovmskb %xmm2, %edx pxor %xmm3, %xmm3 @@ -111,6 +150,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -135,6 +179,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -159,6 +208,11 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %edx test %edx, %edx @@ -183,8 +237,20 @@ ENTRY (__strlen_sse2) lea 16(%eax), %eax jnz L(exit) +# ifdef USE_AS_STRNLEN + mov %eax, %edx + and $63, %edx + add %edx, %edi +# endif + and $-0x40, %eax -L(aligned_64): + + .p2align 4 +L(aligned_64_loop): +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif movaps (%eax), %xmm0 movaps 16(%eax), %xmm1 movaps 32(%eax), %xmm2 @@ -196,7 +262,7 @@ L(aligned_64): pmovmskb %xmm2, %edx test %edx, %edx lea 64(%eax), %eax - jz L(aligned_64) + jz L(aligned_64_loop) pcmpeqb -64(%eax), %xmm3 pmovmskb %xmm3, %edx @@ -223,56 +289,348 @@ L(exit): sub %ecx, %eax test %dl, %dl jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_8) test $0x01, %dl jnz L(exit_tail0) - test $0x02, %dl jnz L(exit_tail1) - test $0x04, %dl jnz L(exit_tail2) + add $3, %eax + RETURN - test $0x08, %dl - jnz L(exit_tail3) - + .p2align 4 +L(exit_8): test $0x10, %dl jnz L(exit_tail4) - 
test $0x20, %dl jnz L(exit_tail5) - test $0x40, %dl jnz L(exit_tail6) add $7, %eax -L(exit_tail0): RETURN + .p2align 4 L(exit_high): - add $8, %eax + mov %dh, %ch + and $15, %ch + jz L(exit_high_8) test $0x01, %dh + jnz L(exit_tail8) + test $0x02, %dh + jnz L(exit_tail9) + test $0x04, %dh + jnz L(exit_tail10) + add $11, %eax + RETURN + + .p2align 4 +L(exit_high_8): + test $0x10, %dh + jnz L(exit_tail12) + test $0x20, %dh + jnz L(exit_tail13) + test $0x40, %dh + jnz L(exit_tail14) + add $15, %eax +L(exit_tail0): + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %edi + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + movl LEN(%esp), %eax + RETURN + + .p2align 4 +L(strnlen_exit): + sub %ecx, %eax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %edi + jb L(return_start_len) + lea 3(%eax), %eax + RETURN - test $0x02, %dh - jnz L(exit_tail1) + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %edi + jb L(return_start_len) + lea 7(%eax), %eax + RETURN + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz 
L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) + jnz L(strnlen_exit_tail10) + sub $12, %edi + jb L(return_start_len) + lea 11(%eax), %eax + RETURN + .p2align 4 +L(strnlen_exit_high_8): test $0x10, %dh - jnz L(exit_tail4) - + jnz L(strnlen_exit_tail12) test $0x20, %dh - jnz L(exit_tail5) - + jnz L(strnlen_exit_tail13) test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax + jnz L(strnlen_exit_tail14) + sub $16, %edi + jb L(return_start_len) + lea 15(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %edi + jb L(return_start_len) + lea 1(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %edi + jb L(return_start_len) + lea 2(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %edi + jb L(return_start_len) + lea 4(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %edi + jb L(return_start_len) + lea 5(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %edi + jb L(return_start_len) + lea 6(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %edi + jb L(return_start_len) + lea 8(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %edi + jb L(return_start_len) + lea 9(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %edi + jb L(return_start_len) + lea 10(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %edi + jb L(return_start_len) + lea 12(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %edi + jb L(return_start_len) + lea 13(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %edi + jb L(return_start_len) + lea 14(%eax), %eax + RETURN + + .p2align 4 +L(return_start_len): + movl LEN(%esp), %eax + RETURN + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + xor %eax, %eax + + add $4, %edi + jz L(exit_tail0) + + cmpb $0, (%edx) + jz L(exit_tail0) + cmp 
$1, %edi + je L(exit_tail1) + + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmp $2, %edi + je L(exit_tail2) + + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmp $3, %edi + je L(exit_tail3) + + cmpb $0, 3(%edx) + jz L(exit_tail3) + mov $4, %eax RETURN .p2align 4 +L(len_less8_prolog): + add $4, %edi + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmp $1, %edi + je L(exit_tail5) + + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmp $2, %edi + je L(exit_tail6) + + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmp $3, %edi + je L(exit_tail7) + + cmpb $0, 7(%edx) + jz L(exit_tail7) + mov $8, %eax + RETURN + + + .p2align 4 +L(len_less12_prolog): + add $4, %edi + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmp $1, %edi + je L(exit_tail9) + + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmp $2, %edi + je L(exit_tail10) + + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmp $3, %edi + je L(exit_tail11) + + cmpb $0, 11(%edx) + jz L(exit_tail11) + mov $12, %eax + RETURN + + .p2align 4 +L(len_less16_prolog): + add $4, %edi + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmp $1, %edi + je L(exit_tail13) + + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmp $2, %edi + je L(exit_tail14) + + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmp $3, %edi + je L(exit_tail15) + + cmpb $0, 15(%edx) + jz L(exit_tail15) + mov $16, %eax + RETURN +# endif + + .p2align 4 L(exit_tail1): add $1, %eax RETURN @@ -332,7 +690,7 @@ L(exit_tail14): L(exit_tail15): add $15, %eax # ifndef USE_AS_STRCAT - ret -END (__strlen_sse2) + RETURN +END (STRLEN) # endif #endif diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen-c.c b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c new file mode 100644 index 000000000..567af2c81 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/strnlen-c.c @@ -0,0 +1,8 @@ +#ifndef NOT_IN_libc +# define STRNLEN __strnlen_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); +#endif + +#include "string/strnlen.c" diff --git 
a/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S new file mode 100644 index 000000000..56b6ae2a5 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/strnlen-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2 +#include "strlen-sse2.S" diff --git a/libc/sysdeps/i386/i686/multiarch/strnlen.S b/libc/sysdeps/i386/i686/multiarch/strnlen.S new file mode 100644 index 000000000..7e542d9b7 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/strnlen.S @@ -0,0 +1,56 @@ +/* Multiple versions of strnlen + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __strnlen_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __strnlen_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(__strnlen) + +weak_alias(__strnlen, strnlen) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S index 71cc69dfe..f46b17fd7 100644 --- a/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S +++ b/libc/sysdeps/i386/i686/multiarch/strrchr-sse2.S @@ -40,7 +40,7 @@ # define STR1 PARMS # define STR2 STR1+4 - .text + atom_text_section ENTRY (__strrchr_sse2) ENTRANCE diff --git a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S index 404a9a4d4..61c43c38d 100644 --- a/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S +++ b/libc/sysdeps/i386/i686/multiarch/wcscmp-sse2.S @@ -21,7 +21,6 @@ #ifndef NOT_IN_libc # include <sysdep.h> -# include "asm-syntax.h" # define CFI_PUSH(REG) \ cfi_adjust_cfa_offset (4); \ @@ -34,18 +33,16 @@ # define PUSH(REG) pushl REG; CFI_PUSH (REG) # define POP(REG) popl REG; CFI_POP (REG) -# ifndef STRCMP -# define STRCMP __wcscmp_sse2 -# endif - # define ENTRANCE PUSH(%esi); PUSH(%edi) # define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); # define PARMS 4 # define STR1 PARMS # define STR2 STR1+4 +/* Note: 
wcscmp uses signed comparison, not unsigned as in strcmp function. */ + .text -ENTRY (STRCMP) +ENTRY (__wcscmp_sse2) /* * This implementation uses SSE to compare up to 16 bytes at a time. */ @@ -264,20 +261,20 @@ L(continue_00_48): test %ecx, %ecx jnz L(less4_double_words1) - sub (%esi), %eax - jnz L(return) + cmp (%esi), %eax + jne L(nequal) mov 4(%edi), %eax - sub 4(%esi), %eax - jnz L(return) + cmp 4(%esi), %eax + jne L(nequal) mov 8(%edi), %eax - sub 8(%esi), %eax - jnz L(return) + cmp 8(%esi), %eax + jne L(nequal) mov 12(%edi), %eax - sub 12(%esi), %eax - jnz L(return) + cmp 12(%esi), %eax + jne L(nequal) movdqu 16(%esi), %xmm2 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ @@ -381,7 +378,7 @@ L(continue_32_48): movdqu 48(%esi), %xmm2 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ jnz L(less4_double_words_48) @@ -585,20 +582,20 @@ L(continue_48_00): test %ecx, %ecx jnz L(less4_double_words1) - sub (%esi), %eax - jnz L(return) + cmp (%esi), %eax + jne L(nequal) mov 4(%edi), %eax - sub 4(%esi), %eax - jnz L(return) + cmp 4(%esi), %eax + jne L(nequal) mov 8(%edi), %eax - sub 8(%esi), %eax - jnz L(return) + cmp 8(%esi), %eax + jne L(nequal) mov 12(%edi), %eax - sub 12(%esi), %eax - jnz L(return) + cmp 12(%esi), %eax + jne L(nequal) movdqu 16(%edi), %xmm1 pcmpeqd %xmm1, %xmm0 /* Any null double_word?
*/ @@ -839,142 +836,161 @@ L(less4_double_words1): test %ecx, %ecx jz L(equal) - mov 12(%esi), %edx - mov 12(%edi), %eax - sub %edx, %eax + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + xor %eax, %eax RETURN .p2align 4 L(less4_double_words): + xor %eax, %eax test %dl, %dl jz L(next_two_double_words) and $15, %dl jz L(second_double_word) - mov (%edi), %eax - sub (%esi), %eax + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) RETURN .p2align 4 L(second_double_word): - mov 4(%edi), %eax - sub 4(%esi), %eax + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) RETURN .p2align 4 L(next_two_double_words): and $15, %dh jz L(fourth_double_word) - mov 8(%edi), %eax - sub 8(%esi), %eax + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) RETURN .p2align 4 L(fourth_double_word): - mov 12(%edi), %eax - sub 12(%esi), %eax + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) RETURN .p2align 4 L(less4_double_words_16): + xor %eax, %eax test %dl, %dl jz L(next_two_double_words_16) and $15, %dl jz L(second_double_word_16) - mov 16(%edi), %eax - sub 16(%esi), %eax + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) RETURN .p2align 4 L(second_double_word_16): - mov 20(%edi), %eax - sub 20(%esi), %eax + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) RETURN .p2align 4 L(next_two_double_words_16): and $15, %dh jz L(fourth_double_word_16) - mov 24(%edi), %eax - sub 24(%esi), %eax + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) RETURN .p2align 4 L(fourth_double_word_16): - mov 28(%edi), %eax - sub 28(%esi), %eax + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) RETURN .p2align 4 L(less4_double_words_32): + xor %eax, %eax test %dl, %dl jz L(next_two_double_words_32) and $15, %dl jz L(second_double_word_32) - mov 32(%edi), %eax - sub 32(%esi), %eax + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) RETURN .p2align 4 L(second_double_word_32): - mov 36(%edi), %eax - sub 36(%esi), %eax + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + 
jne L(nequal) RETURN .p2align 4 L(next_two_double_words_32): and $15, %dh jz L(fourth_double_word_32) - mov 40(%edi), %eax - sub 40(%esi), %eax + mov 40(%esi), %ecx + cmp %ecx, 40(%edi) + jne L(nequal) RETURN .p2align 4 L(fourth_double_word_32): - mov 44(%edi), %eax - sub 44(%esi), %eax + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) RETURN .p2align 4 L(less4_double_words_48): + xor %eax, %eax test %dl, %dl jz L(next_two_double_words_48) and $15, %dl jz L(second_double_word_48) - mov 48(%edi), %eax - sub 48(%esi), %eax + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) RETURN .p2align 4 L(second_double_word_48): - mov 52(%edi), %eax - sub 52(%esi), %eax + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) RETURN .p2align 4 L(next_two_double_words_48): and $15, %dh jz L(fourth_double_word_48) - mov 56(%edi), %eax - sub 56(%esi), %eax + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) RETURN .p2align 4 L(fourth_double_word_48): - mov 60(%edi), %eax - sub 60(%esi), %eax - RETURN - - .p2align 4 -L(return): + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) RETURN .p2align 4 L(nequal): mov $1, %eax - ja L(nequal_bigger) + jg L(return) neg %eax + RETURN -L(nequal_bigger): + .p2align 4 +L(return): RETURN .p2align 4 @@ -988,7 +1004,7 @@ L(equal): .p2align 4 L(neq): mov $1, %eax - ja L(neq_bigger) + jg L(neq_bigger) neg %eax L(neq_bigger): @@ -999,5 +1015,5 @@ L(eq): xorl %eax, %eax ret -END (STRCMP) +END (__wcscmp_sse2) #endif diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-c.c b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c new file mode 100644 index 000000000..49f32a25e --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wcslen-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WCSLEN __wcslen_ia32 +#endif + +#include "wcsmbs/wcslen.c" diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S new file mode 100644 index 000000000..d41d62309 --- /dev/null +++ 
b/libc/sysdeps/i386/i686/multiarch/wcslen-sse2.S @@ -0,0 +1,194 @@ +/* wcslen with SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc +# include <sysdep.h> +# define STR 4 + + .text +ENTRY (__wcslen_sse2) + mov STR(%esp), %edx + + cmp $0, (%edx) + jz L(exit_tail0) + cmp $0, 4(%edx) + jz L(exit_tail1) + cmp $0, 8(%edx) + jz L(exit_tail2) + cmp $0, 12(%edx) + jz L(exit_tail3) + cmp $0, 16(%edx) + jz L(exit_tail4) + cmp $0, 20(%edx) + jz L(exit_tail5) + cmp $0, 24(%edx) + jz L(exit_tail6) + cmp $0, 28(%edx) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%edx), %eax + lea 16(%edx), %ecx + and $-16, %eax + + pcmpeqd (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): + movaps 
(%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqd -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %ecx, %eax + shr $2, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %eax + ret + + .p2align 4 +L(exit_1): + add $1, %eax + ret + + .p2align 4 +L(exit_3): + add $3, %eax + ret + + .p2align 4 +L(exit_tail0): + xor %eax, %eax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %eax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %eax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %eax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %eax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %eax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %eax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %eax + ret + +END (__wcslen_sse2) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/wcslen.S b/libc/sysdeps/i386/i686/multiarch/wcslen.S new file mode 100644 index 000000000..58670377e --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wcslen.S @@ -0,0 +1,56 @@ +/* Multiple versions of wcslen + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__wcslen) + .type __wcslen, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wcslen_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wcslen_sse2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4); + cfi_restore (ebx) + ret +END(__wcslen) + +weak_alias(__wcslen, wcslen) +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 000000000..94ff6151f --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_ia32 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git 
a/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 000000000..1a857c7e2 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 000000000..a41ef95fc --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/libc/sysdeps/i386/i686/multiarch/wmemcmp.S b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 000000000..5080c14ea --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
*/ + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function + __i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(wmemcmp) +#endif diff --git a/libc/sysdeps/i386/sysdep.h b/libc/sysdeps/i386/sysdep.h index efdc82dde..a8a9e571b 100644 --- a/libc/sysdeps/i386/sysdep.h +++ b/libc/sysdeps/i386/sysdep.h @@ -1,5 +1,5 @@ /* Assembler macros for i386. - Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006 + Copyright (C) 1991-93,95,96,98,2002,2003,2005,2006,2011 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -167,4 +167,6 @@ __i686.get_pc_thunk.reg: \ #endif #endif +#define atom_text_section .section ".text.atom", "ax" + #endif /* __ASSEMBLER__ */ |