author     ctice <ctice@138bc75d-0d04-0410-961f-82ee72b054a4>    2015-09-24 20:39:21 +0000
committer  ctice <ctice@138bc75d-0d04-0410-961f-82ee72b054a4>    2015-09-24 20:39:21 +0000
commit     91cfdb1db6b46c452db5f1ceb411fe6355b4ebb0 (patch)
tree       3a5e109099a823faa295796293079d514cfb6b24
parent     868c80ab78c96ba97f5fdad11c889aaecafe2e9e (diff)
parent     1bba763e8b31cd3e68de6aad23ad6a31f8b5b118 (diff)
Merge google/gcc-4_9 (220271:227810] into google/gcc-4_9-mobile branch.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/google/gcc-4_9-mobile@228096 138bc75d-0d04-0410-961f-82ee72b054a4
-rwxr-xr-xconfigure4
-rw-r--r--configure.ac4
-rw-r--r--contrib/testsuite-management/aarch64-grtev4-linux-gnu.xfail4
-rw-r--r--contrib/testsuite-management/powerpc64le-grtev4-linux-gnu.xfail4
-rw-r--r--contrib/testsuite-management/x86_64-grtev4-linux-gnu.xfail4
-rw-r--r--fixincludes/fixincl.x51
-rw-r--r--fixincludes/inclhack.def14
-rw-r--r--fixincludes/tests/base/linux/compiler.h14
-rw-r--r--gcc/ChangeLog57
-rw-r--r--gcc/auto-profile.c2
-rw-r--r--gcc/c-family/c-common.c24
-rw-r--r--gcc/calls.c12
-rw-r--r--gcc/cfgrtl.c2
-rw-r--r--gcc/common.opt9
-rw-r--r--gcc/common/config/mips/mips-common.c9
-rw-r--r--gcc/config.gcc161
-rw-r--r--gcc/config.in6
-rw-r--r--gcc/config/aarch64/aarch64-linux.h9
-rw-r--r--gcc/config/aarch64/aarch64.md18
-rw-r--r--gcc/config/alpha/alpha.c6
-rw-r--r--gcc/config/i386/arm_neon.h14081
-rw-r--r--gcc/config/i386/gnu-user.h4
-rw-r--r--gcc/config/i386/gnu-user64.h5
-rw-r--r--gcc/config/i386/i386.c232
-rw-r--r--gcc/config/i386/i386.h4
-rw-r--r--gcc/config/i386/i386.md44
-rw-r--r--gcc/config/i386/linux-common.h5
-rw-r--r--gcc/config/i386/predicates.md16
-rw-r--r--gcc/config/i386/sse.md29
-rw-r--r--gcc/config/i386/x86-tune.def13
-rw-r--r--gcc/config/ia64/ia64.c5
-rw-r--r--gcc/config/mips/android.h49
-rw-r--r--gcc/config/mips/constraints.md74
-rw-r--r--gcc/config/mips/gnu-user.h33
-rw-r--r--gcc/config/mips/gnu-user64.h52
-rw-r--r--gcc/config/mips/i6400.md335
-rw-r--r--gcc/config/mips/linux.h24
-rw-r--r--gcc/config/mips/linux64.h44
-rw-r--r--gcc/config/mips/loongson.md30
-rw-r--r--gcc/config/mips/mips-cpus.def19
-rw-r--r--gcc/config/mips/mips-dsp.md17
-rw-r--r--gcc/config/mips/mips-ftypes.def224
-rw-r--r--gcc/config/mips/mips-modes.def17
-rw-r--r--gcc/config/mips/mips-msa.md3264
-rw-r--r--gcc/config/mips/mips-opts.h6
-rw-r--r--gcc/config/mips/mips-protos.h80
-rw-r--r--gcc/config/mips/mips-tables.opt427
-rw-r--r--gcc/config/mips/mips.c3823
-rw-r--r--gcc/config/mips/mips.h491
-rw-r--r--gcc/config/mips/mips.md777
-rw-r--r--gcc/config/mips/mips.opt51
-rw-r--r--gcc/config/mips/msa.h1121
-rw-r--r--gcc/config/mips/mti-elf.h5
-rw-r--r--gcc/config/mips/mti-linux.h18
-rw-r--r--gcc/config/mips/netbsd.h25
-rw-r--r--gcc/config/mips/octeon.md133
-rw-r--r--gcc/config/mips/p5600.md351
-rw-r--r--gcc/config/mips/predicates.md286
-rw-r--r--gcc/config/mips/sync.md50
-rw-r--r--gcc/config/mips/t-img-elf38
-rw-r--r--gcc/config/mips/t-img-linux30
-rw-r--r--gcc/config/mips/t-isa32648
-rw-r--r--gcc/config/mips/t-linux-android6
-rw-r--r--gcc/config/mips/t-linux-android644
-rw-r--r--gcc/config/mips/t-mti-elf16
-rw-r--r--gcc/config/mips/t-mti-linux18
-rw-r--r--gcc/config/mips/t-sde2
-rw-r--r--gcc/config/mips/t-sdemtk1
-rw-r--r--gcc/config/rs6000/rs6000.c1205
-rw-r--r--gcc/config/rs6000/rs6000.opt4
-rw-r--r--gcc/config/sparc/linux.h6
-rw-r--r--gcc/config/sparc/linux64.h6
-rw-r--r--gcc/config/sparc/sparc.c3
-rw-r--r--gcc/config/sparc/sparc.h11
-rwxr-xr-xgcc/configure35
-rw-r--r--gcc/configure.ac11
-rw-r--r--gcc/cp/cp-lang.c4
-rw-r--r--gcc/cp/cp-tree.h5
-rw-r--r--gcc/cp/decl.c26
-rw-r--r--gcc/cp/decl2.c60
-rw-r--r--gcc/cp/parser.c10
-rw-r--r--gcc/df.h28
-rw-r--r--gcc/doc/extend.texi812
-rw-r--r--gcc/doc/install.texi26
-rw-r--r--gcc/doc/invoke.texi98
-rw-r--r--gcc/doc/md.texi6
-rw-r--r--gcc/doc/tm.texi17
-rw-r--r--gcc/doc/tm.texi.in4
-rw-r--r--gcc/dwarf2cfi.c4
-rw-r--r--gcc/dwarf2out.c64
-rw-r--r--gcc/explow.c4
-rw-r--r--gcc/final.c8
-rw-r--r--gcc/function.h3
-rw-r--r--gcc/ipa-devirt.c32
-rw-r--r--gcc/ipa-inline-analysis.c2
-rw-r--r--gcc/ipa-prop.c55
-rw-r--r--gcc/ipa-utils.h9
-rw-r--r--gcc/l-ipo.c27
-rw-r--r--gcc/langhooks-def.h4
-rw-r--r--gcc/langhooks.h4
-rw-r--r--gcc/lra-constraints.c208
-rw-r--r--gcc/lra-lives.c6
-rw-r--r--gcc/opts-global.c18
-rw-r--r--gcc/opts.c6
-rw-r--r--gcc/opts.h3
-rw-r--r--gcc/params.def9
-rw-r--r--gcc/regcprop.c7
-rw-r--r--gcc/simplify-got.c2
-rw-r--r--gcc/simplify-rtx.c6
-rw-r--r--gcc/system.h1
-rw-r--r--gcc/target.def24
-rw-r--r--gcc/targhooks.c13
-rw-r--r--gcc/targhooks.h1
-rw-r--r--gcc/testsuite/ChangeLog21
-rw-r--r--gcc/testsuite/g++.dg/abi/aarch64_guard1.C3
-rw-r--r--gcc/testsuite/g++.dg/ext/mv18.C7
-rw-r--r--gcc/testsuite/g++.dg/ext/mv19.C7
-rw-r--r--gcc/testsuite/g++.dg/ext/mv20.C7
-rw-r--r--gcc/testsuite/g++.dg/ext/mv21.C7
-rw-r--r--gcc/testsuite/g++.dg/ext/mv22.C7
-rw-r--r--gcc/testsuite/g++.dg/ext/mv23.C7
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C2
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls.h16
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls2.h15
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls2_0.C10
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls2_1.C31
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls_0.C10
-rw-r--r--gcc/testsuite/g++.dg/tree-prof/lipo/tls_1.C38
-rw-r--r--gcc/testsuite/g++.dg/warn/Warray-bounds-6.C26
-rw-r--r--gcc/testsuite/gcc.dg/19277289.c21
-rw-r--r--gcc/testsuite/gcc.dg/Warray-bounds-12.c26
-rw-r--r--gcc/testsuite/gcc.dg/Warray-bounds-13.c18
-rw-r--r--gcc/testsuite/gcc.dg/memcpy-4.c8
-rw-r--r--gcc/testsuite/gcc.dg/pr64277.c23
-rw-r--r--gcc/testsuite/gcc.dg/torture/mips-hilo-2.c4
-rw-r--r--gcc/testsuite/gcc.dg/torture/pr19683-1.c4
-rw-r--r--gcc/testsuite/gcc.dg/tree-prof/cold_partition_label.c2
-rw-r--r--gcc/testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c2
-rw-r--r--gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c2
-rw-r--r--gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c2
-rw-r--r--gcc/testsuite/gcc.dg/tree-ssa/restrict-5.c24
-rw-r--r--gcc/testsuite/gcc.dg/vect/pr52252-ld.c30
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-26.c6
-rw-r--r--gcc/testsuite/gcc.dg/vect/tree-vect.h2
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect.exp95
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sisd-shft-neg_1.c38
-rw-r--r--gcc/testsuite/gcc.target/i386/bmi2-bzhi-2.c67
-rw-r--r--gcc/testsuite/gcc.target/i386/noplt-1.c13
-rw-r--r--gcc/testsuite/gcc.target/i386/noplt-2.c13
-rw-r--r--gcc/testsuite/gcc.target/i386/noplt-3.c12
-rw-r--r--gcc/testsuite/gcc.target/i386/noplt-4.c11
-rw-r--r--gcc/testsuite/gcc.target/i386/pr52252-atom.c29
-rw-r--r--gcc/testsuite/gcc.target/i386/pr52252-core.c29
-rw-r--r--gcc/testsuite/gcc.target/i386/pr61403.c27
-rw-r--r--gcc/testsuite/gcc.target/mips/20140928.c20
-rw-r--r--gcc/testsuite/gcc.target/mips/args-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/args-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/asm-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-10.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-11.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-12.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-13.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-4.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-5.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-6.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-7.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-8.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-9.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-cost-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/branch-cost-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/call-1.c16
-rw-r--r--gcc/testsuite/gcc.target/mips/call-2.c5
-rw-r--r--gcc/testsuite/gcc.target/mips/call-3.c3
-rw-r--r--gcc/testsuite/gcc.target/mips/call-4.c5
-rw-r--r--gcc/testsuite/gcc.target/mips/call-5.c16
-rw-r--r--gcc/testsuite/gcc.target/mips/call-6.c16
-rw-r--r--gcc/testsuite/gcc.target/mips/call-clobbered-1.c21
-rw-r--r--gcc/testsuite/gcc.target/mips/call-clobbered-2.c21
-rw-r--r--gcc/testsuite/gcc.target/mips/call-clobbered-3.c23
-rw-r--r--gcc/testsuite/gcc.target/mips/call-clobbered-4.c23
-rw-r--r--gcc/testsuite/gcc.target/mips/call-clobbered-5.c21
-rw-r--r--gcc/testsuite/gcc.target/mips/call-saved-4.c32
-rw-r--r--gcc/testsuite/gcc.target/mips/call-saved-5.c32
-rw-r--r--gcc/testsuite/gcc.target/mips/call-saved-6.c32
-rw-r--r--gcc/testsuite/gcc.target/mips/code-readable-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/code-readable-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/code-readable-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/code-readable-4.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/const-anchor-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/const-anchor-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/dmult-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/dsp-lhx.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/dsp-lsa.c11
-rw-r--r--gcc/testsuite/gcc.target/mips/dspr2-MULT.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/dspr2-MULTU.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/fpcmp-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/fpcmp-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/interrupt_handler.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/lazy-binding-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/madd-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/madd-9.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/maddu-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/memcpy-2.c13
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-1.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-10.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-11.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-12.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-13.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-14.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-15.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-16.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-2.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-3.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-4.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-5.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-6.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-7.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-8.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/main-9.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-nonpic/mips-nonpic.h4
-rw-r--r--gcc/testsuite/gcc.target/mips/mips-ps-type-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/mips.exp134
-rw-r--r--gcc/testsuite/gcc.target/mips/mips16e-extends.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/mips64-dsp-ldx.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/mips64-lsa.c11
-rw-r--r--gcc/testsuite/gcc.target/mips/movcc-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/movcc-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/movcc-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/movdf-1.c14
-rw-r--r--gcc/testsuite/gcc.target/mips/movdf-2.c14
-rw-r--r--gcc/testsuite/gcc.target/mips/movdf-3.c13
-rw-r--r--gcc/testsuite/gcc.target/mips/msa-builtins.c1083
-rw-r--r--gcc/testsuite/gcc.target/mips/msa.c630
-rw-r--r--gcc/testsuite/gcc.target/mips/msub-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/msubu-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/mulsize-2.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mulsize-4.c1
-rw-r--r--gcc/testsuite/gcc.target/mips/mulsize-5.c13
-rw-r--r--gcc/testsuite/gcc.target/mips/mulsize-6.c13
-rw-r--r--gcc/testsuite/gcc.target/mips/mult-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/near-far-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/near-far-2.c4
-rw-r--r--gcc/testsuite/gcc.target/mips/near-far-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/near-far-4.c4
-rw-r--r--gcc/testsuite/gcc.target/mips/neg-abs-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/octeon-bbit-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/octeon-seq-4.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-1.c13
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-2.c10
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-3.c10
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-4.c15
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-5.c11
-rw-r--r--gcc/testsuite/gcc.target/mips/oddspreg-6.c10
-rw-r--r--gcc/testsuite/gcc.target/mips/pr37362.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/timode-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/truncate-3.c3
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-lwp-1.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-lwp-2.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-lwp-3.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-lwp-4.c2
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-store16-1.c30
-rw-r--r--gcc/testsuite/gcc.target/mips/umips-store16-2.c22
-rw-r--r--gcc/testsuite/gcc.target/mips/unaligned-1.c2
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c35
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c42
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c53
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c56
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c54
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c43
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c51
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c57
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c15
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c41
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c43
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c45
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c45
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c32
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c38
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c40
-rw-r--r--gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c42
-rw-r--r--gcc/testsuite/lib/target-supports.exp200
-rw-r--r--gcc/toplev.c2
-rw-r--r--gcc/tree-core.h10
-rw-r--r--gcc/tree-data-ref.c9
-rw-r--r--gcc/tree-inline.c44
-rw-r--r--gcc/tree-inline.h4
-rw-r--r--gcc/tree-pretty-print.c15
-rw-r--r--gcc/tree-profile.c24
-rw-r--r--gcc/tree-ssa-alias.c31
-rw-r--r--gcc/tree-ssa-loop-ivopts.c127
-rw-r--r--gcc/tree-ssa-loop-niter.c17
-rw-r--r--gcc/tree-ssa-structalias.c203
-rw-r--r--gcc/tree-streamer-in.c13
-rw-r--r--gcc/tree-streamer-out.c11
-rw-r--r--gcc/tree-vect-data-refs.c746
-rw-r--r--gcc/tree-vect-stmts.c15
-rw-r--r--gcc/tree-vrp.c2
-rw-r--r--gcc/tree.h5
-rw-r--r--gcc/value-prof.c20
-rw-r--r--gcc/var-tracking.c3
-rw-r--r--gcc/varasm.c12
-rw-r--r--gcc/varasm.h2
-rw-r--r--gcc/web.c65
-rw-r--r--libgcc/config.host6
-rw-r--r--libgcc/config/i386/cpuinfo.c16
-rw-r--r--libgcc/config/i386/t-linux6
-rw-r--r--libgcc/config/mips/mips16.S46
-rw-r--r--libgcc/crtstuff.c1
-rw-r--r--libgcc/libgcov-driver.c46
-rw-r--r--libgcc/libgcov-profiler.c136
-rw-r--r--libgomp/config.h.in3
-rwxr-xr-xlibgomp/configure31
-rw-r--r--libgomp/configure.ac3
-rw-r--r--libgomp/libgomp.h2
-rw-r--r--libgomp/team.c6
-rwxr-xr-xlibjava/classpath/config.sub4
-rw-r--r--libstdc++-v3/include/bits/algorithmfwd.h12
-rw-r--r--libstdc++-v3/include/bits/atomic_base.h5
-rw-r--r--libstdc++-v3/include/bits/move.h5
-rw-r--r--libstdc++-v3/include/c_compatibility/complex.h28
-rw-r--r--libstdc++-v3/include/debug/array32
-rw-r--r--libstdc++-v3/include/std/array51
-rw-r--r--libstdc++-v3/include/std/atomic10
-rw-r--r--libstdc++-v3/include/std/complex14
-rw-r--r--libstdc++-v3/testsuite/20_util/forward/c_neg.cc2
-rw-r--r--libstdc++-v3/testsuite/20_util/forward/f_neg.cc2
-rw-r--r--libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc6
-rw-r--r--libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc2
-rw-r--r--libstdc++-v3/testsuite/25_algorithms/unique/11480.cc2
-rw-r--r--libstdc++-v3/testsuite/26_numerics/complex/value_operations/constexpr2.cc29
-rw-r--r--libstdc++-v3/testsuite/29_atomics/atomic/62259.cc59
-rw-r--r--libstdc++-v3/testsuite/29_atomics/atomic/65147.cc29
-rw-r--r--libstdc++-v3/testsuite/29_atomics/atomic_integral/65147.cc33
334 files changed, 30772 insertions, 5083 deletions
diff --git a/configure b/configure
index e779af1f4aa..a7b80ffa0ae 100755
--- a/configure
+++ b/configure
@@ -3781,7 +3781,7 @@ case "${target}" in
microblaze*)
noconfigdirs="$noconfigdirs gprof"
;;
- mips*-sde-elf* | mips*-mti-elf*)
+ mips*-sde-elf* | mips*-mti-elf* | mips*-img-elf*)
if test x$with_newlib = xyes; then
noconfigdirs="$noconfigdirs gprof"
fi
@@ -7045,7 +7045,7 @@ case "${target}" in
spu-*-*)
target_makefile_frag="config/mt-spu"
;;
- mips*-sde-elf* | mips*-mti-elf*)
+ mips*-sde-elf* | mips*-mti-elf* | mips*-img-elf*)
target_makefile_frag="config/mt-sde"
;;
mipsisa*-*-elfoabi*)
diff --git a/configure.ac b/configure.ac
index 9252aab0905..5dac5dae0cd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1109,7 +1109,7 @@ case "${target}" in
microblaze*)
noconfigdirs="$noconfigdirs gprof"
;;
- mips*-sde-elf* | mips*-mti-elf*)
+ mips*-sde-elf* | mips*-mti-elf* | mips*-img-elf*)
if test x$with_newlib = xyes; then
noconfigdirs="$noconfigdirs gprof"
fi
@@ -2377,7 +2377,7 @@ case "${target}" in
spu-*-*)
target_makefile_frag="config/mt-spu"
;;
- mips*-sde-elf* | mips*-mti-elf*)
+ mips*-sde-elf* | mips*-mti-elf* | mips*-img-elf*)
target_makefile_frag="config/mt-sde"
;;
mipsisa*-*-elfoabi*)
diff --git a/contrib/testsuite-management/aarch64-grtev4-linux-gnu.xfail b/contrib/testsuite-management/aarch64-grtev4-linux-gnu.xfail
index a2a027e5338..a459149ee01 100644
--- a/contrib/testsuite-management/aarch64-grtev4-linux-gnu.xfail
+++ b/contrib/testsuite-management/aarch64-grtev4-linux-gnu.xfail
@@ -18,6 +18,10 @@ FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 20)
FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 21)
FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 22)
+# Also xfailed in x86; Google b/20184248
+FAIL: 17_intro/headers/c++1998/complex.cc (test for excess errors)
+FAIL: 26_numerics/complex/c99.cc (test for excess errors)
+
# AArch64-specific; appear to be missing "loop turned into non-loop; it never loops" output.
FAIL: gcc.dg/unroll_1.c (test for warnings, line 14)
FAIL: gcc.dg/unroll_1.c (test for warnings, line 24)
diff --git a/contrib/testsuite-management/powerpc64le-grtev4-linux-gnu.xfail b/contrib/testsuite-management/powerpc64le-grtev4-linux-gnu.xfail
index 410cdb94311..d8995e0cd1f 100644
--- a/contrib/testsuite-management/powerpc64le-grtev4-linux-gnu.xfail
+++ b/contrib/testsuite-management/powerpc64le-grtev4-linux-gnu.xfail
@@ -18,6 +18,10 @@ FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 20)
FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 21)
FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 22)
+# Also xfailed in x86; Google b/20184248
+FAIL: 17_intro/headers/c++1998/complex.cc (test for excess errors)
+FAIL: 26_numerics/complex/c99.cc (test for excess errors)
+
# PPCle-specific. From PR33512, still failing in truck despite resolved PR.
FAIL: gcc.dg/and-1.c scan-assembler-not nand
diff --git a/contrib/testsuite-management/x86_64-grtev4-linux-gnu.xfail b/contrib/testsuite-management/x86_64-grtev4-linux-gnu.xfail
index a6eb7eca038..b4fa6933791 100644
--- a/contrib/testsuite-management/x86_64-grtev4-linux-gnu.xfail
+++ b/contrib/testsuite-management/x86_64-grtev4-linux-gnu.xfail
@@ -21,6 +21,10 @@ FAIL: gcc.dg/wself-assign-1.c (test for warnings, line 22)
# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60037
FAIL: ext/random/hypergeometric_distribution/operators/values.cc execution test
+# Google b/20184248
+FAIL: 17_intro/headers/c++1998/complex.cc (test for excess errors)
+FAIL: 26_numerics/complex/c99.cc (test for excess errors)
+
# Google b/14137212
FAIL: 29_atomics/atomic/cons/user_pod.cc (test for excess errors)
diff --git a/fixincludes/fixincl.x b/fixincludes/fixincl.x
index 10b4061f30a..75df492bcfa 100644
--- a/fixincludes/fixincl.x
+++ b/fixincludes/fixincl.x
@@ -2,11 +2,11 @@
*
* DO NOT EDIT THIS FILE (fixincl.x)
*
- * It has been AutoGen-ed December 8, 2013 at 12:24:14 PM by AutoGen 5.18.2
+ * It has been AutoGen-ed August 12, 2014 at 05:02:39 PM by AutoGen 5.18
* From the definitions inclhack.def
* and the template file fixincl
*/
-/* DO NOT SVN-MERGE THIS FILE, EITHER Sun Dec 8 12:24:14 PST 2013
+/* DO NOT SVN-MERGE THIS FILE, EITHER Tue Aug 12 17:02:39 MSK 2014
*
* You must regenerate it. Use the ./genfixes script.
*
@@ -15,7 +15,7 @@
* certain ANSI-incompatible system header files which are fixed to work
* correctly with ANSI C and placed in a directory that GNU C will search.
*
- * This file contains 229 fixup descriptions.
+ * This file contains 228 fixup descriptions.
*
* See README for more information.
*
@@ -2111,41 +2111,6 @@ int vfscanf(FILE *, const char *, __builtin_va_list) __asm__ (_BSD_STRING(__USER
/* * * * * * * * * * * * * * * * * * * * * * * * * *
*
- * Description of Complier_H_Tradcpp fix
- */
-tSCC zComplier_H_TradcppName[] =
- "complier_h_tradcpp";
-
-/*
- * File name selection pattern
- */
-tSCC zComplier_H_TradcppList[] =
- "linux/compiler.h\0";
-/*
- * Machine/OS name selection pattern
- */
-#define apzComplier_H_TradcppMachs (const char**)NULL
-
-/*
- * content selection pattern - do fix if pattern found
- */
-tSCC zComplier_H_TradcppSelect0[] =
- "#define __builtin_warning\\(x, y\\.\\.\\.\\) \\(1\\)";
-
-#define COMPLIER_H_TRADCPP_TEST_CT 1
-static tTestDesc aComplier_H_TradcppTests[] = {
- { TT_EGREP, zComplier_H_TradcppSelect0, (regex_t*)NULL }, };
-
-/*
- * Fix Command Arguments for Complier_H_Tradcpp
- */
-static const char* apzComplier_H_TradcppPatch[] = {
- "format",
- "/* __builtin_warning(x, y...) is obsolete */",
- (char*)NULL };
-
-/* * * * * * * * * * * * * * * * * * * * * * * * * *
- *
* Description of Ctrl_Quotes_Def fix
*/
tSCC zCtrl_Quotes_DefName[] =
@@ -9403,9 +9368,9 @@ static const char* apzX11_SprintfPatch[] = {
*
* List of all fixes
*/
-#define REGEX_COUNT 268
+#define REGEX_COUNT 267
#define MACH_LIST_SIZE_LIMIT 187
-#define FIX_COUNT 229
+#define FIX_COUNT 228
/*
* Enumerate the fixes
@@ -9458,7 +9423,6 @@ typedef enum {
BROKEN_CABS_FIXIDX,
BROKEN_NAN_FIXIDX,
BSD_STDIO_ATTRS_CONFLICT_FIXIDX,
- COMPLIER_H_TRADCPP_FIXIDX,
CTRL_QUOTES_DEF_FIXIDX,
CTRL_QUOTES_USE_FIXIDX,
CXX_UNREADY_FIXIDX,
@@ -9878,11 +9842,6 @@ tFixDesc fixDescList[ FIX_COUNT ] = {
BSD_STDIO_ATTRS_CONFLICT_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
aBsd_Stdio_Attrs_ConflictTests, apzBsd_Stdio_Attrs_ConflictPatch, 0 },
- { zComplier_H_TradcppName, zComplier_H_TradcppList,
- apzComplier_H_TradcppMachs,
- COMPLIER_H_TRADCPP_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
- aComplier_H_TradcppTests, apzComplier_H_TradcppPatch, 0 },
-
{ zCtrl_Quotes_DefName, zCtrl_Quotes_DefList,
apzCtrl_Quotes_DefMachs,
CTRL_QUOTES_DEF_TEST_CT, FD_MACH_ONLY | FD_SUBROUTINE,
diff --git a/fixincludes/inclhack.def b/fixincludes/inclhack.def
index 411300fb2e0..0a6fb80b1ad 100644
--- a/fixincludes/inclhack.def
+++ b/fixincludes/inclhack.def
@@ -1140,20 +1140,6 @@ fix = {
};
/*
- * Old Linux kernel's <compiler.h> header breaks Traditional CPP
- */
-fix = {
- hackname = complier_h_tradcpp;
- files = linux/compiler.h;
-
- select = "#define __builtin_warning\\(x, y\\.\\.\\.\\) \\(1\\)";
- c_fix = format;
- c_fix_arg = "/* __builtin_warning(x, y...) is obsolete */";
-
- test_text = "#define __builtin_warning(x, y...) (1)";
-};
-
-/*
* Fix various macros used to define ioctl numbers.
* The traditional syntax was:
*
diff --git a/fixincludes/tests/base/linux/compiler.h b/fixincludes/tests/base/linux/compiler.h
deleted file mode 100644
index 713527644bd..00000000000
--- a/fixincludes/tests/base/linux/compiler.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* DO NOT EDIT THIS FILE.
-
- It has been auto-edited by fixincludes from:
-
- "fixinc/tests/inc/linux/compiler.h"
-
- This had to be done to correct non-standard usages in the
- original, manufacturer supplied header file. */
-
-
-
-#if defined( COMPLIER_H_TRADCPP_CHECK )
-/* __builtin_warning(x, y...) is obsolete */
-#endif /* COMPLIER_H_TRADCPP_CHECK */
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3d828baa47e..58192a9dab3 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,54 @@
+2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ Backport of r214242, r214254, and bug fix patches from mainline
+ * config/rs6000/rs6000.c (context.h): New #include.
+ (tree-pass.h): Likewise.
+ (make_pass_analyze_swaps): New declaration.
+ (rs6000_option_override): Register swap-optimization pass.
+ (swap_web_entry): New class.
+ (special_handling_values): New enum.
+ (union_defs): New function.
+ (union_uses): Likewise.
+ (insn_is_load_p): Likewise.
+ (insn_is_store_p): Likewise.
+ (insn_is_swap_p): Likewise.
+ (rtx_is_swappable_p): Likewise.
+ (insn_is_swappable_p): Likewise.
+ (chain_purpose): New enum.
+ (chain_contains_only_swaps): New function.
+ (mark_swaps_for_removal): Likewise.
+ (swap_const_vector_halves): Likewise.
+ (adjust_subreg_index): Likewise.
+ (permute_load): Likewise.
+ (permute_store): Likewise.
+ (adjust_extract): Likewise.
+ (adjust_splat): Likewise.
+ (handle_special_swappables): Likewise.
+ (replace_swap_with_copy): Likewise.
+ (dump_swap_insn_table): Likewise.
+ (rs6000_analyze_swaps): Likewise.
+ (pass_data_analyze_swaps): New pass_data.
+ (pass_analyze_swaps): New class.
+ (pass_analyze_swaps::gate): New method.
+ (pass_analyze_swaps::execute): New method.
+ (make_pass_analyze_swaps): New function.
+ * config/rs6000/rs6000.opt (moptimize-swaps): New option.
+ * df.h (web_entry_base): New class, replacing struct web_entry.
+ (web_entry_base::pred): New method.
+ (web_entry_base::set_pred): Likewise.
+ (web_entry_base::unionfind_root): Likewise.
+ (web_entry_base::unionfind_union): Likewise.
+ (unionfind_root): Delete external reference.
+ (unionfind_union): Likewise.
+ (union_defs): Likewise.
+ * web.c (web_entry_base::unionfind_root): Convert to method.
+ (web_entry_base::unionfind_union): Likewise.
+ (web_entry): New class.
+ (union_match_dups): Convert to use class structure.
+ (union_defs): Likewise.
+ (entry_register): Likewise.
+ (web_main): Likewise.
+
2015-01-23 Jakub Jelinek <jakub@redhat.com>
PR middle-end/64734
@@ -624,6 +675,12 @@
PR target/60111
* config/sh/sh.c: Use signed char for signed field.
+2014-11-22 Uros Bizjak <ubizjak@gmail.com>
+
+ * params.def (PARAM_MAX_COMPLETELY_PEELED_INSNS): Increase to 200.
+ * config/i386/i386.c (ix86_option_override_internal): Do not increase
+ PARAM_MAX_COMPLETELY_PEELED_INSNS.
+
2014-11-21 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
PR target/63673
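
The 2015-03-26 ChangeLog entry above describes replacing struct web_entry with a
web_entry_base class whose unionfind_root / unionfind_union methods implement a
union-find over insn webs, reused by the new rs6000 swap-optimization pass.  A
minimal standalone C sketch of that pattern (pred links, path-compressed root
lookup, union of two chains); the names follow the ChangeLog, but the code is
illustrative, not GCC's actual implementation:

/* Each web entry points at a predecessor; a root has pred == 0.  */
struct web_entry
{
  struct web_entry *pred;
};

/* Find the root of ENTRY's chain, compressing the path on the way back.  */
struct web_entry *
unionfind_root (struct web_entry *entry)
{
  struct web_entry *first = entry, *next;

  while (entry->pred)
    entry = entry->pred;

  /* Path compression: point every visited entry directly at the root.  */
  while (first->pred)
    {
      next = first->pred;
      first->pred = entry;
      first = next;
    }
  return entry;
}

/* Merge the chains containing FIRST and SECOND.
   Return 1 if they were already in the same chain, 0 otherwise.  */
int
unionfind_union (struct web_entry *first, struct web_entry *second)
{
  first = unionfind_root (first);
  second = unionfind_root (second);
  if (first == second)
    return 1;
  second->pred = first;
  return 0;
}
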
diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c
index 31c7a568708..88115840c43 100644
--- a/gcc/auto-profile.c
+++ b/gcc/auto-profile.c
@@ -398,8 +398,6 @@ static unsigned
get_combined_location (location_t loc, tree decl)
{
/* TODO: allow more bits for line and less bits for discriminator. */
- if (LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl) >= (1<<16))
- warning_at (loc, OPT_Woverflow, "Offset exceeds 16 bytes.");
return ((LOCATION_LINE (loc) - DECL_SOURCE_LINE (decl)) << 16)
| get_discriminator_from_locus (loc);
}
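
get_combined_location above packs the line offset into the upper 16 bits and the
discriminator into the lower 16 bits of one value (the removed hunk dropped the
overflow warning for offsets of 1<<16 or more).  A small self-contained C
illustration of that packing, with made-up helper names:

#include <stdio.h>

/* Hypothetical illustration of the encoding used by get_combined_location:
   a 16-bit line offset in the high half, a 16-bit discriminator in the low
   half of a single unsigned value.  */
static unsigned int
combine_location (unsigned int line_offset, unsigned int discriminator)
{
  return (line_offset << 16) | (discriminator & 0xffff);
}

int
main (void)
{
  unsigned int packed = combine_location (42, 3);
  /* Prints: offset=42 discriminator=3 */
  printf ("offset=%u discriminator=%u\n", packed >> 16, packed & 0xffff);
  return 0;
}
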
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index 6184f913e81..ec1785d8f4d 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -336,6 +336,7 @@ static tree handle_mode_attribute (tree *, tree, tree, int, bool *);
static tree handle_section_attribute (tree *, tree, tree, int, bool *);
static tree handle_aligned_attribute (tree *, tree, tree, int, bool *);
static tree handle_weak_attribute (tree *, tree, tree, int, bool *) ;
+static tree handle_noplt_attribute (tree *, tree, tree, int, bool *) ;
static tree handle_alias_ifunc_attribute (bool, tree *, tree, tree, bool *);
static tree handle_ifunc_attribute (tree *, tree, tree, int, bool *);
static tree handle_alias_attribute (tree *, tree, tree, int, bool *);
@@ -673,6 +674,8 @@ const struct attribute_spec c_common_attribute_table[] =
handle_aligned_attribute, false },
{ "weak", 0, 0, true, false, false,
handle_weak_attribute, false },
+ { "noplt", 0, 0, true, false, false,
+ handle_noplt_attribute, false },
{ "ifunc", 1, 1, true, false, false,
handle_ifunc_attribute, false },
{ "alias", 1, 1, true, false, false,
@@ -7668,6 +7671,25 @@ handle_weak_attribute (tree *node, tree name,
return NULL_TREE;
}
+/* Handle a "noplt" attribute; arguments as in
+ struct attribute_spec.handler. */
+
+static tree
+handle_noplt_attribute (tree *node, tree name,
+ tree ARG_UNUSED (args),
+ int ARG_UNUSED (flags),
+ bool * ARG_UNUSED (no_add_attrs))
+{
+ if (TREE_CODE (*node) != FUNCTION_DECL)
+ {
+ warning (OPT_Wattributes,
+ "%qE attribute is only applicable on functions", name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+ return NULL_TREE;
+}
+
/* Handle an "alias" or "ifunc" attribute; arguments as in
struct attribute_spec.handler, except that IS_ALIAS tells us
whether this is an alias as opposed to ifunc attribute. */
@@ -9178,7 +9200,7 @@ parse_optimize_options (tree args, bool attr_p)
&decoded_options_count);
decode_options (&global_options, &global_options_set,
decoded_options, decoded_options_count,
- input_location, global_dc);
+ input_location, global_dc, false);
targetm.override_options_after_change();
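
The c-common.c hunks above register the new "noplt" function attribute and warn
when it is applied to anything other than a function declaration.  A minimal
usage sketch, assuming the unit is compiled with -fPIC; the attribute name comes
from the patch, while ext_fn and caller are made-up names for illustration:

/* With -fPIC, calls to ext_fn bypass the PLT and use the address loaded
   from the GOT instead; -fno-plt (added later in this patch) requests the
   same behavior for every external call.  */
extern void ext_fn (int) __attribute__ ((noplt));

void
caller (int x)
{
  ext_fn (x);   /* emitted as an indirect call through the GOT */
}
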
diff --git a/gcc/calls.c b/gcc/calls.c
index f0c92ddc018..5d7df8b6331 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -184,6 +184,18 @@ prepare_call_address (tree fndecl, rtx funexp, rtx static_chain_value,
&& targetm.small_register_classes_for_mode_p (FUNCTION_MODE))
? force_not_mem (memory_address (FUNCTION_MODE, funexp))
: memory_address (FUNCTION_MODE, funexp));
+ else if (flag_pic
+ && fndecl
+ && TREE_CODE (fndecl) == FUNCTION_DECL
+ && (!flag_plt
+ || lookup_attribute ("noplt", DECL_ATTRIBUTES (fndecl)))
+ && !targetm.binds_local_p (fndecl))
+ {
+ /* This is done only for PIC code. There is no easy interface to force the
+ function address into GOT for non-PIC case. non-PIC case needs to be
+ handled specially by the backend. */
+ funexp = force_reg (Pmode, funexp);
+ }
else if (! sibcallp)
{
#ifndef NO_FUNCTION_CSE
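
The calls.c hunk above bypasses the PLT only under a specific combination of
conditions.  A hypothetical restatement of that predicate as standalone C, with
illustrative parameter names (GCC itself tests flag_pic, flag_plt, the "noplt"
attribute, and targetm.binds_local_p directly):

/* Sketch of the guard added to prepare_call_address: the callee address is
   forced into a register (i.e. loaded from the GOT) only for PIC code, when
   the call either carries the "noplt" attribute or -fno-plt is in effect,
   and the callee does not bind locally.  */
static int
call_should_bypass_plt (int pic, int plt_enabled, int has_noplt_attr,
                        int binds_locally)
{
  return pic && (!plt_enabled || has_noplt_attr) && !binds_locally;
}
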
diff --git a/gcc/cfgrtl.c b/gcc/cfgrtl.c
index 3ae6ce98e64..ed1c37cd4bd 100644
--- a/gcc/cfgrtl.c
+++ b/gcc/cfgrtl.c
@@ -1461,7 +1461,7 @@ emit_barrier_after_bb (basic_block bb)
rtx footer_tail = BB_FOOTER (bb);
while (NEXT_INSN (footer_tail))
- footer_tail = NEXT_INSN (insn);
+ footer_tail = NEXT_INSN (footer_tail);
if (!BARRIER_P (footer_tail))
{
NEXT_INSN (footer_tail) = insn;
diff --git a/gcc/common.opt b/gcc/common.opt
index b95edad93af..c7151ba3aeb 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1217,6 +1217,11 @@ fdwarf2-cfi-asm
Common Report Var(flag_dwarf2_cfi_asm) Init(HAVE_GAS_CFI_DIRECTIVE)
Enable CFI tables via GAS assembler directives.
+ftwo-level-all-subprogs
+Common Report Var(flag_two_level_all_subprogs) Init(0)
+When generating two-level line tables in DWARF (experimental),
+add linkage names for all functions (not just inlined functions).
+
ftwo-level-line-tables
Common Report Var(flag_two_level_line_tables) Init(0)
Use two-level line tables in DWARF (experimental).
@@ -1810,6 +1815,10 @@ fpie
Common Report Var(flag_pie,1) Negative(fPIC)
Generate position-independent code for executables if possible (small mode)
+fplt
+Common Report Var(flag_plt) Init(1) Optimization
+Use PLT for PIC calls (-fno-plt: load the address from GOT at call site)
+
fplugin=
Common Joined RejectNegative Var(common_deferred_options) Defer
Specify a plugin to load
diff --git a/gcc/common/config/mips/mips-common.c b/gcc/common/config/mips/mips-common.c
index 7dd8d2d56a8..a140d559473 100644
--- a/gcc/common/config/mips/mips-common.c
+++ b/gcc/common/config/mips/mips-common.c
@@ -42,6 +42,15 @@ mips_handle_option (struct gcc_options *opts,
opts->x_mips_cache_flush_func = NULL;
return true;
+ case OPT_mfp32:
+ case OPT_mfp64:
+ opts->x_target_flags &= ~MASK_FLOATXX;
+ return true;
+
+ case OPT_mfpxx:
+ opts->x_target_flags &= ~MASK_FLOAT64;
+ return true;
+
default:
return true;
}
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 235a9c2e5a1..c0fdd2c116d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -422,7 +422,7 @@ microblaze*-*-*)
mips*-*-*)
cpu_type=mips
need_64bit_hwint=yes
- extra_headers="loongson.h"
+ extra_headers="loongson.h msa.h"
extra_options="${extra_options} g.opt mips/mips-tables.opt"
;;
nds32*)
@@ -1977,70 +1977,90 @@ mips*-*-netbsd*) # NetBSD/mips, either endian.
tm_file="elfos.h ${tm_file} mips/elf.h netbsd.h netbsd-elf.h mips/netbsd.h"
extra_options="${extra_options} netbsd.opt netbsd-elf.opt"
;;
+mips*-img-linux*)
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h mips/mti-linux.h"
+ extra_options="${extra_options} linux-android.opt"
+ tmake_file="${tmake_file} mips/t-img-linux"
+ tm_defines="${tm_defines} MIPS_ISA_DEFAULT=37 MIPS_ABI_DEFAULT=ABI_32"
+ gnu_ld=yes
+ gas=yes
+ ;;
mips*-mti-linux*)
- tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/gnu-user64.h mips/linux64.h mips/linux-common.h mips/mti-linux.h"
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h mips/mti-linux.h"
extra_options="${extra_options} linux-android.opt"
tmake_file="${tmake_file} mips/t-mti-linux"
tm_defines="${tm_defines} MIPS_ISA_DEFAULT=33 MIPS_ABI_DEFAULT=ABI_32"
gnu_ld=yes
gas=yes
;;
-mips64*-*-linux* | mipsisa64*-*-linux*)
- tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/gnu-user64.h mips/linux64.h mips/linux-common.h"
+mips*-*-linux*) # Linux MIPS, either endian.
+ tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h mips/linux-common.h"
extra_options="${extra_options} linux-android.opt"
- tmake_file="${tmake_file} mips/t-linux64"
case ${target} in
- *android*)
- # Default to ABI_64 for MIPS64 Android
- tm_defines="${tm_defines} MIPS_ABI_DEFAULT=ABI_64"
+ mips64*android*)
+ default_mips_arch=mips64r6
+ default_mips_abi=64
+ tm_file="${tm_file} mips/android.h"
+ tmake_file="${tmake_file} mips/t-linux-android64"
;;
- *)
- tm_defines="${tm_defines} MIPS_ABI_DEFAULT=ABI_N32"
+ mips*android*)
+ default_mips_arch=mips32
+ tm_file="${tm_file} mips/android.h"
+ tmake_file="$tmake_file mips/t-linux-android"
+ ;;
+ mipsisa32r6*)
+ default_mips_arch=mips32r6
+ ;;
+ mipsisa32r2*)
+ default_mips_arch=mips32r2
+ ;;
+ mipsisa32*)
+ default_mips_arch=mips32
;;
- esac
- case ${target} in
mips64el-st-linux-gnu)
+ default_mips_abi=n32
tm_file="${tm_file} mips/st.h"
tmake_file="${tmake_file} mips/t-st"
+ enable_mips_multilibs="yes"
;;
mips64octeon*-*-linux*)
+ default_mips_abi=n32
tm_defines="${tm_defines} MIPS_CPU_STRING_DEFAULT=\\\"octeon\\\""
target_cpu_default=MASK_SOFT_FLOAT_ABI
+ enable_mips_multilibs="yes"
+ ;;
+ mipsisa64r6*-*-linux*)
+ default_mips_abi=n32
+ default_mips_arch=mips64r6
+ enable_mips_multilibs="yes"
;;
mipsisa64r2*-*-linux*)
- tm_defines="${tm_defines} MIPS_ISA_DEFAULT=65"
+ default_mips_abi=n32
+ default_mips_arch=mips64r2
+ enable_mips_multilibs="yes"
+ ;;
+ mips64*-*-linux* | mipsisa64*-*-linux*)
+ default_mips_abi=n32
+ enable_mips_multilibs="yes"
;;
esac
- gnu_ld=yes
- gas=yes
- ;;
-mips*-*-linux*) # Linux MIPS, either endian.
- tm_file="dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file} mips/gnu-user.h mips/linux.h"
- extra_options="${extra_options} linux-android.opt"
if test x$enable_targets = xall; then
- tm_file="${tm_file} mips/gnu-user64.h mips/linux64.h"
+ enable_mips_multilibs="yes"
+ fi
+ if test x$enable_mips_multilibs = xyes; then
tmake_file="${tmake_file} mips/t-linux64"
fi
- tm_file="${tm_file} mips/linux-common.h"
- case ${target} in
- mipsisa32r2*)
- tm_defines="${tm_defines} MIPS_ISA_DEFAULT=33"
- ;;
- mipsisa32*)
- tm_defines="${tm_defines} MIPS_ISA_DEFAULT=32"
- esac
- case ${target} in
- *android*)
- # Default to little-endian for MIPS Android
- # tm_defines="${tm_defines} TARGET_ENDIAN_DEFAULT=0"
- tmake_file="$tmake_file mips/t-linux-android"
- esac
- ;;
+ ;;
mips*-mti-elf*)
tm_file="elfos.h newlib-stdint.h ${tm_file} mips/elf.h mips/n32-elf.h mips/sde.h mips/mti-elf.h"
tmake_file="mips/t-mti-elf"
tm_defines="${tm_defines} MIPS_ISA_DEFAULT=33 MIPS_ABI_DEFAULT=ABI_32"
;;
+mips*-img-elf*)
+ tm_file="elfos.h newlib-stdint.h ${tm_file} mips/elf.h mips/n32-elf.h mips/sde.h mips/mti-elf.h"
+ tmake_file="mips/t-img-elf"
+ tm_defines="${tm_defines} MIPS_ISA_DEFAULT=37 MIPS_ABI_DEFAULT=ABI_32"
+ ;;
mips*-sde-elf*)
tm_file="elfos.h newlib-stdint.h ${tm_file} mips/elf.h mips/n32-elf.h mips/sde.h"
tmake_file="mips/t-sde"
@@ -2061,12 +2081,18 @@ mips*-sde-elf*)
;;
esac
case ${target} in
+ mipsisa32r6*)
+ tm_defines="MIPS_ISA_DEFAULT=37 MIPS_ABI_DEFAULT=ABI_32"
+ ;;
mipsisa32r2*)
tm_defines="MIPS_ISA_DEFAULT=33 MIPS_ABI_DEFAULT=ABI_32"
;;
mipsisa32*)
tm_defines="MIPS_ISA_DEFAULT=32 MIPS_ABI_DEFAULT=ABI_32"
;;
+ mipsisa64r6*)
+ tm_defines="MIPS_ISA_DEFAULT=69 MIPS_ABI_DEFAULT=ABI_N32"
+ ;;
mipsisa64r2*)
tm_defines="MIPS_ISA_DEFAULT=65 MIPS_ABI_DEFAULT=ABI_N32"
;;
@@ -2077,17 +2103,25 @@ mips*-sde-elf*)
;;
mipsisa32-*-elf* | mipsisa32el-*-elf* | \
mipsisa32r2-*-elf* | mipsisa32r2el-*-elf* | \
+mipsisa32r6-*-elf* | mipsisa32r6el-*-elf* | \
mipsisa64-*-elf* | mipsisa64el-*-elf* | \
-mipsisa64r2-*-elf* | mipsisa64r2el-*-elf*)
+mipsisa64r2-*-elf* | mipsisa64r2el-*-elf* | \
+mipsisa64r6-*-elf* | mipsisa64r6el-*-elf*)
tm_file="elfos.h newlib-stdint.h ${tm_file} mips/elf.h"
tmake_file="mips/t-isa3264"
case ${target} in
+ mipsisa32r6*)
+ tm_defines="${tm_defines} MIPS_ISA_DEFAULT=37"
+ ;;
mipsisa32r2*)
tm_defines="${tm_defines} MIPS_ISA_DEFAULT=33"
;;
mipsisa32*)
tm_defines="${tm_defines} MIPS_ISA_DEFAULT=32"
;;
+ mipsisa64r6*)
+ tm_defines="${tm_defines} MIPS_ISA_DEFAULT=69"
+ ;;
mipsisa64r2*)
tm_defines="${tm_defines} MIPS_ISA_DEFAULT=65"
;;
@@ -3799,7 +3833,7 @@ case "${target}" in
;;
mips*-*-*)
- supported_defaults="abi arch arch_32 arch_64 float fpu nan tune tune_32 tune_64 divide llsc mips-plt synci"
+ supported_defaults="abi arch arch_32 arch_64 float fpu nan fp_32 odd_spreg_32 tune tune_32 tune_64 divide llsc mips-plt synci"
case ${with_float} in
"" | soft | hard)
@@ -3831,6 +3865,32 @@ case "${target}" in
;;
esac
+ case ${with_fp_32} in
+ "" | 32 | xx | 64)
+ # OK
+ ;;
+ *)
+ echo "Unknown FP mode used in --with-fp-32=$with_fp_32" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case ${with_odd_spreg_32} in
+ yes)
+ with_odd_spreg_32="odd-spreg"
+ ;;
+ no)
+ with_odd_spreg_32="no-odd-spreg"
+ ;;
+ "")
+ # OK
+ ;;
+ *)
+ echo "Unknown odd-spreg-32 type used in --with-odd-spreg-32=$with_odd_spreg_32" 1>&2
+ exit 1
+ ;;
+ esac
+
case ${with_abi} in
"" | 32 | o64 | n32 | 64 | eabi)
# OK
@@ -4171,6 +4231,31 @@ case ${target} in
tm_defines="TARGET_ENDIAN_DEFAULT=0 $tm_defines"
;;
esac
+ if test x$with_arch != x; then
+ default_mips_arch=$with_arch
+ fi
+ if test x$with_abi != x; then
+ default_mips_abi=$with_abi
+ fi
+ case ${default_mips_arch} in
+ mips1) tm_defines="$tm_defines MIPS_ISA_DEFAULT=1" ;;
+ mips2) tm_defines="$tm_defines MIPS_ISA_DEFAULT=2" ;;
+ mips3) tm_defines="$tm_defines MIPS_ISA_DEFAULT=3" ;;
+ mips4) tm_defines="$tm_defines MIPS_ISA_DEFAULT=4" ;;
+ mips32) tm_defines="$tm_defines MIPS_ISA_DEFAULT=32" ;;
+ mips32r2) tm_defines="$tm_defines MIPS_ISA_DEFAULT=33" ;;
+ mips32r6) tm_defines="$tm_defines MIPS_ISA_DEFAULT=37" ;;
+ mips64) tm_defines="$tm_defines MIPS_ISA_DEFAULT=64" ;;
+ mips64r2) tm_defines="$tm_defines MIPS_ISA_DEFAULT=65" ;;
+ mips64r6) tm_defines="$tm_defines MIPS_ISA_DEFAULT=69" ;;
+ esac
+ case ${default_mips_abi} in
+ 32) tm_defines="$tm_defines MIPS_ABI_DEFAULT=ABI_32" ;;
+ o64) tm_defines="$tm_defines MIPS_ABI_DEFAULT=ABI_O64" ;;
+ n32) tm_defines="$tm_defines MIPS_ABI_DEFAULT=ABI_N32" ;;
+ 64) tm_defines="$tm_defines MIPS_ABI_DEFAULT=ABI_64" ;;
+ eabi) tm_defines="$tm_defines MIPS_ABI_DEFAULT=ABI_EABI" ;;
+ esac
tmake_file="mips/t-mips $tmake_file"
;;
@@ -4233,7 +4318,7 @@ case ${target} in
esac
t=
-all_defaults="abi cpu cpu_32 cpu_64 arch arch_32 arch_64 tune tune_32 tune_64 schedule float mode fpu nan divide llsc mips-plt synci tls"
+all_defaults="abi cpu cpu_32 cpu_64 arch arch_32 arch_64 tune tune_32 tune_64 schedule float mode fpu nan fp_32 odd_spreg_32 divide llsc mips-plt synci tls"
for option in $all_defaults
do
eval "val=\$with_"`echo $option | sed s/-/_/g`
diff --git a/gcc/config.in b/gcc/config.in
index 90b1dbfe6c2..83c51ce68fc 100644
--- a/gcc/config.in
+++ b/gcc/config.in
@@ -459,6 +459,12 @@
#endif
+/* Define if the assembler understands .module. */
+#ifndef USED_FOR_TARGET
+#undef HAVE_AS_DOT_MODULE
+#endif
+
+
/* Define if your assembler supports the -no-mul-bug-abort option. */
#ifndef USED_FOR_TARGET
#undef HAVE_AS_NO_MUL_BUG_ABORT_OPTION
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index 5b61cdc91dd..b77becd2359 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -25,10 +25,12 @@
#define RUNTIME_ROOT_PREFIX ""
#endif
#define GLIBC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/lib/ld-linux-aarch64%{mbig-endian:_be}.so.1"
+#define BIONIC_DYNAMIC_LINKER RUNTIME_ROOT_PREFIX "/system/bin/linker64"
+
#define CPP_SPEC "%{pthread:-D_REENTRANT}"
-#define LINUX_TARGET_LINK_SPEC "%{h*} \
+#define LINUX_TARGET_LINK_SPEC0 "%{h*} \
%{static:-Bstatic} \
%{shared:-shared} \
%{symbolic:-Bsymbolic} \
@@ -46,8 +48,9 @@
" %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}"
#endif
-#define LINK_SPEC LINUX_TARGET_LINK_SPEC \
- CA53_ERR_835769_SPEC
+#define LINUX_TARGET_LINK_SPEC LINUX_TARGET_LINK_SPEC0 CA53_ERR_835769_SPEC
+
+#define LINK_SPEC LINUX_TARGET_LINK_SPEC
#define TARGET_OS_CPP_BUILTINS() \
do \
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f5b6a867bb3..05f5e1b351d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -2786,7 +2786,7 @@
;; Logical right shift using SISD or Integer instruction
(define_insn "*aarch64_lshr_sisd_or_int_<mode>3"
- [(set (match_operand:GPI 0 "register_operand" "=w,w,r")
+ [(set (match_operand:GPI 0 "register_operand" "=w,&w,r")
(lshiftrt:GPI
(match_operand:GPI 1 "register_operand" "w,w,r")
(match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,w,rUs<cmode>")))]
@@ -2805,11 +2805,13 @@
(match_operand:DI 1 "aarch64_simd_register")
(match_operand:QI 2 "aarch64_simd_register")))]
"TARGET_SIMD && reload_completed"
- [(set (match_dup 2)
+ [(set (match_dup 3)
(unspec:QI [(match_dup 2)] UNSPEC_SISD_NEG))
(set (match_dup 0)
- (unspec:DI [(match_dup 1) (match_dup 2)] UNSPEC_SISD_USHL))]
- ""
+ (unspec:DI [(match_dup 1) (match_dup 3)] UNSPEC_SISD_USHL))]
+ {
+ operands[3] = gen_lowpart (QImode, operands[0]);
+ }
)
(define_split
@@ -2818,11 +2820,13 @@
(match_operand:SI 1 "aarch64_simd_register")
(match_operand:QI 2 "aarch64_simd_register")))]
"TARGET_SIMD && reload_completed"
- [(set (match_dup 2)
+ [(set (match_dup 3)
(unspec:QI [(match_dup 2)] UNSPEC_SISD_NEG))
(set (match_dup 0)
- (unspec:SI [(match_dup 1) (match_dup 2)] UNSPEC_USHL_2S))]
- ""
+ (unspec:SI [(match_dup 1) (match_dup 3)] UNSPEC_USHL_2S))]
+ {
+ operands[3] = gen_lowpart (QImode, operands[0]);
+ }
)
;; Arithmetic right shift using SISD or Integer instruction
diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c
index d5c7908beb0..19ae3665a21 100644
--- a/gcc/config/alpha/alpha.c
+++ b/gcc/config/alpha/alpha.c
@@ -9918,12 +9918,6 @@ alpha_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START alpha_va_start
-/* The Alpha architecture does not require sequential consistency. See
- http://www.cs.umd.edu/~pugh/java/memoryModel/AlphaReordering.html
- for an example of how it can be violated in practice. */
-#undef TARGET_RELAXED_ORDERING
-#define TARGET_RELAXED_ORDERING true
-
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE alpha_option_override
diff --git a/gcc/config/i386/arm_neon.h b/gcc/config/i386/arm_neon.h
index 5f56cbdae9c..869215199a4 100644
--- a/gcc/config/i386/arm_neon.h
+++ b/gcc/config/i386/arm_neon.h
@@ -30,59 +30,62 @@
//*****************************************************************************************
//!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual.
-//!!!!!!! Please pay attention at #define USE_SSSE3 and USE_SSE4 below - you need to define them for newest Intel platforms for
-//!!!!!!! greater performance. It can be done by -mssse3 or -msse4.2 (which also implies -mssse3) compiler switch.
+//!!!!!!! Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for
+//!!!!!!! greater performance. It can be done by -msse4.2 compiler switch.
#ifndef NEON2SSE_H
#define NEON2SSE_H
#ifndef USE_SSE4
- #if defined(__SSE4_2__)
- #define USE_SSE4
- #define USE_SSSE3
- #endif
+#if defined(__SSE4_2__)
+ #define USE_SSE4
#endif
-
-#ifndef USE_SSSE3
- #if defined(__SSSE3__)
- #define USE_SSSE3
- #endif
#endif
#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
#include <pmmintrin.h> //SSE3
+#include <tmmintrin.h> //SSSE3
+#ifdef USE_SSE4
+#include <smmintrin.h> //SSE4.1
+#include <nmmintrin.h> //SSE4.2
+#endif
+
-#ifdef USE_SSSE3
- #include <tmmintrin.h> //SSSE3
+//*************** functions and data attributes, compiler dependent *********************************
+//***********************************************************************************
+#ifdef __GNUC__
+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#if _GCC_VERSION < 40500
+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
+#else
+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
+#endif
+#if defined(__x86_64__)
+ #define _NEON2SSE_64BIT __x86_64__
+#endif
+#else
+#define _NEON2SSE_ALIGN_16 __declspec(align(16))
+#define _NEON2SSE_INLINE __inline
+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
+ #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
+#if defined(_M_X64)
+ #define _NEON2SSE_64BIT _M_X64
+#endif
#else
-# warning "Some functions require SSSE3 or higher."
+ #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
+#endif
#endif
-#ifdef USE_SSE4
- #include <smmintrin.h> //SSE4.1
- #include <nmmintrin.h> //SSE4.2
+#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
+ #define _NEON2SSE_64BIT_SSE4
#endif
/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
-
-typedef __m128 float32x4_t;
-
-typedef __m128 float16x8_t; //not supported by IA, for compartibility
-
-typedef __m128i int8x16_t;
-typedef __m128i int16x8_t;
-typedef __m128i int32x4_t;
-typedef __m128i int64x2_t;
-typedef __m128i uint8x16_t;
-typedef __m128i uint16x8_t;
-typedef __m128i uint32x4_t;
-typedef __m128i uint64x2_t;
-typedef __m128i poly8x16_t;
-typedef __m128i poly16x8_t;
-
#if defined(_MSC_VER) && (_MSC_VER < 1300)
typedef signed char int8_t;
typedef unsigned char uint8_t;
@@ -100,18 +103,59 @@ typedef __m128i poly16x8_t;
typedef signed __int32 int32_t;
typedef unsigned __int32 uint32_t;
-typedef signed long long int64_t;
-typedef unsigned long long uint64_t;
+ typedef signed long long int64_t;
+ typedef unsigned long long uint64_t;
#else
- #include <stdint.h>
- #include <limits.h>
+#include <stdint.h>
+#include <limits.h>
#endif
+
+typedef union __m64_128 {
+ uint64_t m64_u64[1];
+ float m64_f32[2];
+ int8_t m64_i8[8];
+ int16_t m64_i16[4];
+ int32_t m64_i32[2];
+ int64_t m64_i64[1];
+ uint8_t m64_u8[8];
+ uint16_t m64_u16[4];
+ uint32_t m64_u32[2];
+} __m64_128;
+
+typedef __m64_128 int8x8_t;
+typedef __m64_128 uint8x8_t;
+typedef __m64_128 int16x4_t;
+typedef __m64_128 uint16x4_t;
+typedef __m64_128 int32x2_t;
+typedef __m64_128 uint32x2_t;
+typedef __m64_128 int64x1_t;
+typedef __m64_128 uint64x1_t;
+typedef __m64_128 poly8x8_t;
+typedef __m64_128 poly16x4_t;
+
+typedef __m64_128 float32x2_t;
+typedef __m128 float32x4_t;
+
+typedef __m128 float16x4_t; //not supported by IA, for compatibility
+typedef __m128 float16x8_t; //not supported by IA, for compatibility
+
+typedef __m128i int8x16_t;
+typedef __m128i int16x8_t;
+typedef __m128i int32x4_t;
+typedef __m128i int64x2_t;
+typedef __m128i uint8x16_t;
+typedef __m128i uint16x8_t;
+typedef __m128i uint32x4_t;
+typedef __m128i uint64x2_t;
+typedef __m128i poly8x16_t;
+typedef __m128i poly16x8_t;
+
#if defined(_MSC_VER)
-#define SINT_MIN (-2147483647 - 1) /* min signed int value */
-#define SINT_MAX 2147483647 /* max signed int value */
+ #define SINT_MIN (-2147483647 - 1) /* min signed int value */
+ #define SINT_MAX 2147483647 /* max signed int value */
#else
-#define SINT_MIN INT_MIN /* min signed int value */
-#define SINT_MAX INT_MAX /* max signed int value */
+ #define SINT_MIN INT_MIN /* min signed int value */
+ #define SINT_MAX INT_MAX /* max signed int value */
#endif
typedef float float32_t;
@@ -120,10 +164,9 @@ typedef float __fp16;
typedef uint8_t poly8_t;
typedef uint16_t poly16_t;
+
//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types
-
-//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
struct int8x16x2_t {
int8x16_t val[2];
};
@@ -136,16 +179,29 @@ struct int32x4x2_t {
struct int64x2x2_t {
int64x2_t val[2];
};
+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
+struct int8x8x2_t {
+ int8x8_t val[2];
+};
+struct int16x4x2_t {
+ int16x4_t val[2];
+};
+struct int32x2x2_t {
+ int32x2_t val[2];
+};
+struct int64x1x2_t {
+ int64x1_t val[2];
+};
+
+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
-typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
-typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
-typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
-typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
-//to avoid pointers conversion
-typedef int8x16x2_t int8x8x2_t;
-typedef int16x8x2_t int16x4x2_t;
-typedef int32x4x2_t int32x2x2_t;
-typedef int64x2x2_t int64x1x2_t;
+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
typedef struct int8x16x2_t uint8x16x2_t;
@@ -155,12 +211,12 @@ typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;
-typedef int8x8x2_t uint8x8x2_t;
-typedef int16x4x2_t uint16x4x2_t;
-typedef int32x2x2_t uint32x2x2_t;
-typedef int64x1x2_t uint64x1x2_t;
-typedef int8x8x2_t poly8x8x2_t;
-typedef int16x4x2_t poly16x4x2_t;
+typedef struct int8x8x2_t uint8x8x2_t;
+typedef struct int16x4x2_t uint16x4x2_t;
+typedef struct int32x2x2_t uint32x2x2_t;
+typedef struct int64x1x2_t uint64x1x2_t;
+typedef struct int8x8x2_t poly8x8x2_t;
+typedef struct int16x4x2_t poly16x4x2_t;
//float
struct float32x4x2_t {
@@ -169,9 +225,13 @@ struct float32x4x2_t {
struct float16x8x2_t {
float16x8_t val[2];
};
-typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
-typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
-typedef float32x4x2_t float32x2x2_t;
+struct float32x2x2_t {
+ float32x2_t val[2];
+};
+
+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
+typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;
//4
@@ -188,22 +248,36 @@ struct int64x2x4_t {
int64x2_t val[4];
};
-typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
-typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
-typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
-typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
-typedef int8x16x4_t int8x8x4_t;
-typedef int16x8x4_t int16x4x4_t;
-typedef int32x4x4_t int32x2x4_t;
-typedef int64x2x4_t int64x1x4_t;
+struct int8x8x4_t {
+ int8x8_t val[4];
+};
+struct int16x4x4_t {
+ int16x4_t val[4];
+};
+struct int32x2x4_t {
+ int32x2_t val[4];
+};
+struct int64x1x4_t {
+ int64x1_t val[4];
+};
+
+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
+
+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-typedef int8x8x4_t uint8x8x4_t;
-typedef int16x4x4_t uint16x4x4_t;
-typedef int32x2x4_t uint32x2x4_t;
-typedef int64x1x4_t uint64x1x4_t;
-typedef uint8x8x4_t poly8x8x4_t;
-typedef uint16x4x4_t poly16x4x4_t;
+typedef struct int8x8x4_t uint8x8x4_t;
+typedef struct int16x4x4_t uint16x4x4_t;
+typedef struct int32x2x4_t uint32x2x4_t;
+typedef struct int64x1x4_t uint64x1x4_t;
+typedef struct int8x8x4_t poly8x8x4_t;
+typedef struct int16x4x4_t poly16x4x4_t;
typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
@@ -218,10 +292,13 @@ struct float32x4x4_t {
struct float16x8x4_t {
float16x8_t val[4];
};
+struct float32x2x4_t {
+ float32x2_t val[4];
+};
-typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
-typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
-typedef float32x4x4_t float32x2x4_t;
+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
+typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;
//3
@@ -238,14 +315,28 @@ struct int8x16x3_t {
int8x16_t val[3];
};
-typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
-typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
-typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
-typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
-typedef int16x8x3_t int16x4x3_t;
-typedef int32x4x3_t int32x2x3_t;
-typedef int64x2x3_t int64x1x3_t;
-typedef int8x16x3_t int8x8x3_t;
+struct int16x4x3_t {
+ int16x4_t val[3];
+};
+struct int32x2x3_t {
+ int32x2_t val[3];
+};
+struct int64x1x3_t {
+ int64x1_t val[3];
+};
+struct int8x8x3_t {
+ int8x8_t val[3];
+};
+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
+
+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
+
/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
typedef struct int8x16x3_t uint8x16x3_t;
@@ -254,33 +345,55 @@ typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
-typedef int8x8x3_t uint8x8x3_t;
-typedef int16x4x3_t uint16x4x3_t;
-typedef int32x2x3_t uint32x2x3_t;
-typedef int64x1x3_t uint64x1x3_t;
-typedef int8x8x3_t poly8x8x3_t;
-typedef int16x4x3_t poly16x4x3_t;
+typedef struct int8x8x3_t uint8x8x3_t;
+typedef struct int16x4x3_t uint16x4x3_t;
+typedef struct int32x2x3_t uint32x2x3_t;
+typedef struct int64x1x3_t uint64x1x3_t;
+typedef struct int8x8x3_t poly8x8x3_t;
+typedef struct int16x4x3_t poly16x4x3_t;
//float
struct float32x4x3_t {
float32x4_t val[3];
};
+struct float32x2x3_t {
+ float32x2_t val[3];
+};
struct float16x8x3_t {
float16x8_t val[3];
};
-typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
-typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
-typedef float32x4x3_t float32x2x3_t;
+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;
+
//****************************************************************************
//****** Porting auxiliary macros ********************************************
-#define _M128i(a) (*(__m128i*)&(a))
-#define _M128d(a) (*(__m128d*)&(a))
-#define _M128(a) (*(__m128*)&(a))
+
+//** floating point related macros **
+#define _M128i(a) _mm_castps_si128(a)
+#define _M128(a) _mm_castsi128_ps(a)
+//here the most efficient implementation depends on the compiler and on whether the build is 32-bit or 64-bit
+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
+
+ #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
+ #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
+ #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
+#else
+ //for 32-bit gcc and Microsoft compiler builds
+ #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
+ #define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
+ #define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
+#endif
+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
+
+#define return64(a) _M64(res64,a); return res64;
+#define return64f(a) _M64f(res64,a); return res64;
+
#define _Ui64(a) (*(uint64_t*)&(a))
-#define _UNSIGNED_T(a) u##a
+#define _UNSIGNED_T(a) u ## a
#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
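//A minimal sketch (not code from this patch) of how the 64-bit "d"-register helper
//macros above are typically combined; vadd_s8_sketch and its use of _mm_add_epi8 are
//illustrative assumptions, and int8x8_t is assumed to carry the payload implied by _M64.
static int8x8_t vadd_s8_sketch(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;                                     // destination expected by return64
    __m128i sum = _mm_add_epi8(_pM128i(a), _pM128i(b)); // promote both 64-bit values to xmm and add
    return64(sum);                                      // store the low 64 bits into res64 and return it
}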
@@ -289,1145 +402,1854 @@ typedef float16x8x3_t float16x4x3_t;
#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
-//*************** functions attributes ********************************************
-//***********************************************************************************
-#ifdef __GNUC__
- #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
- #define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
- #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- #if _GCC_VERSION < 40500
- #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
- #else
- #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
- #endif
-#elif defined(_MSC_VER)|| defined (__INTEL_COMPILER)
- #define _NEON2SSE_ALIGN_16 __declspec(align(16))
- #define _NEON2SSE_INLINE __inline
- #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
-#else
- #define _NEON2SSE_ALIGN_16 __declspec(align(16))
- #define _NEON2SSE_INLINE inline
- #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
-#endif
-
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
//*************************************************************************
//*************************************************************************
//********* Functions declarations as declared in original arm_neon.h *****
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
-
-int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
-int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
-int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
-int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
-float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
-uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
-uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
-uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
-uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-
+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
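//For illustration only (not part of the patch): a plain-C reference of the widening add
//described above, independent of this header's vector types; vaddl_s8_reference is an
//illustrative name and <stdint.h> is assumed for the fixed-width types.
static void vaddl_s8_reference(const int8_t a[8], const int8_t b[8], int16_t r[8])
{
    int i;
    for (i = 0; i < 8; i++)
        r[i] = (int16_t)((int16_t)a[i] + (int16_t)b[i]); // 100 + 100 stays 200 instead of wrapping
}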
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
-
+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
-
-int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
-int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
-int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
-uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
-
-int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
-uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
-uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
-
-int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
-int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
-int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
-int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
-uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
-uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
-uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
-uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
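//A scalar sketch of the sat<size>() clamping named above, using int8 lanes as an example;
//sat_add_s8_reference is illustrative (assumes <stdint.h> and the standard NEON saturation).
static int8_t sat_add_s8_reference(int8_t a, int8_t b)
{
    int sum = (int)a + (int)b;     // compute at full precision first
    if (sum > 127)  sum = 127;     // clamp instead of wrapping
    if (sum < -128) sum = -128;
    return (int8_t)sum;
}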
//Vector add high half: vaddhn -> Vr[i]:=(Va[i]+Vb[i])>>(size/2), i.e. the high half of each sum, narrowed
-
+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
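//A scalar sketch of the narrowing "high half" semantics for the s16 variant; assumes the
//standard NEON definition (keep bits 15..8 of each wrapped 16-bit sum) and <stdint.h>.
static void vaddhn_s16_reference(const int16_t a[8], const int16_t b[8], int8_t r[8])
{
    int i;
    for (i = 0; i < 8; i++) {
        uint16_t sum = (uint16_t)((uint16_t)a[i] + (uint16_t)b[i]); // wraps modulo 2^16
        r[i] = (int8_t)(uint8_t)(sum >> 8);                         // keep the high byte of each sum
    }
}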
//Vector rounding add high half: vraddhn
-
+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-
-int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
-int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
-uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
-uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+//multiply lane
+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
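//Usage sketch for the lane forms above: the last argument is declared __constrange, so it
//must be an integer constant in the stated range; scale_by_lane2 is illustrative only.
static int16x4_t scale_by_lane2(int16x4_t v, int16x4_t coeffs)
{
    return vmul_lane_s16(v, coeffs, 2); // multiplies every lane of v by coeffs[2]
}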
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-
-int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
-uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-
+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
-
-int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
-uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
-
+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
-
-int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
-
-int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
//Vector saturating doubling multiply accumulate long
-
+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
-
+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
-
+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
-
+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Subtraction
//Vector subtract
-
-int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
-int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
-int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
-int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
-float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
-uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
-uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
-uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
-uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
-
+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
-
+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
-
-int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
-int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
-int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
-int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
-uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
-uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
-
-int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
-uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
-
+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
-
+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Comparison
//Vector compare equal
-
-uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
-uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
-uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
-uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
-uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
-uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
-uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
-uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
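//Note: the compare intrinsics return per-lane masks, not booleans; a one-lane scalar
//sketch of that convention (assumes standard NEON behaviour and <stdint.h>; name is illustrative).
static uint8_t vceq_s8_lane_reference(int8_t a, int8_t b)
{
    return (uint8_t)((a == b) ? 0xFF : 0x00); // all ones when the compare holds, else zero
}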
//Vector compare greater-than or equal
-
-uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
-
-uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
-uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
-uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare greater-than
-
-uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
-
-uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
-uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
-
-uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
-
-uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
-
-uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
-
-uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector test bits
-
-uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
-uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
-uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
-uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
-
-int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
-uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
-float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
//Absolute difference - long
-
+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
-
-int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
-uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
-uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
-
+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//Max/Min
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
-
-int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
-int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
-uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
-uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
-
-int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
-int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
-uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
-uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
//Pairwise addition
//Pairwise add
-
+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
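//A scalar sketch of the pairwise result layout above (assumes the standard NEON ordering
//and <stdint.h>; vpadd_s8_reference is illustrative only).
static void vpadd_s8_reference(const int8_t a[8], const int8_t b[8], int8_t r[8])
{
    int i;
    for (i = 0; i < 4; i++) {
        r[i]     = (int8_t)(a[2 * i] + a[2 * i + 1]); // low half: adjacent pairs of a
        r[i + 4] = (int8_t)(b[2 * i] + b[2 * i + 1]); // high half: adjacent pairs of b
    }
}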
//Long pairwise add
-
-int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
-uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
//Long pairwise add and accumulate
-
-int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
-uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
-uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
//Folding maximum vpmax -> takes maximum of adjacent pairs
-
+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
//Folding minimum vpmin -> takes minimum of adjacent pairs
-
+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
//Reciprocal/Sqrt
-
-float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-
-float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
//Shifts by signed variable
//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
-
-int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
-int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
-int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
-int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
-uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
-uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
-uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
-uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
//Vector saturating shift left: (negative values shift right)
-
-int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
-int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
-int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
-int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
-uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
-uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
-uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
-uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
//Vector rounding shift left: (negative values shift right)
-
-int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
-int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
-int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
-int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
-uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
-uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
-uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
-uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
//Vector saturating rounding shift left: (negative values shift right)
-
-int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
-int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
-int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
-int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
-uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
-uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
-uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
-uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
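// Illustrative usage sketch, not part of the patch; the helper name and data below are
// made up. The register-count shifts above take a signed per-lane count in the second
// operand, so negative lanes shift right; the VQ* forms saturate and the VR* forms
// round right shifts to nearest.
#include <arm_neon.h>

void shift_by_vector_demo(int16_t out_plain[8], int16_t out_sat[8], int16_t out_rnd[8])
{
    const int16_t val[8]   = {100, 100, 100, 100, -100, -100, -100, -100};
    const int16_t count[8] = {1, -1, 12, -12, 1, -1, 12, -12};  // negative => shift right
    int16x8_t a = vld1q_s16(val);
    int16x8_t b = vld1q_s16(count);
    vst1q_s16(out_plain, vshlq_s16(a, b));   // plain shift: 100 << 12 wraps
    vst1q_s16(out_sat,   vqshlq_s16(a, b));  // saturating: clamps to 32767 / -32768
    vst1q_s16(out_rnd,   vrshlq_s16(a, b));  // right shifts are rounded to nearest
}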
//Shifts by a constant
//Vector shift right by constant
-
-int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
-int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
-int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
-uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
-uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
-uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
//Vector shift left by constant
-
-int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
-uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
-uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
-uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
-uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
//Vector rounding shift right by constant
-
-int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
-uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
-uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
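// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// _n_ forms above need a compile-time count inside the listed __constrange; signed
// right shifts are arithmetic, unsigned ones are logical, and the VRSHR variants add
// 1 << (n-1) before shifting so the result is rounded to nearest.
#include <arm_neon.h>

void shift_by_const_demo(const int16_t s_in[8], const uint16_t u_in[8],
                         int16_t s_out[8], uint16_t u_out[8], uint16_t r_out[8])
{
    int16x8_t  s = vld1q_s16(s_in);
    uint16x8_t u = vld1q_u16(u_in);
    vst1q_s16(s_out, vshrq_n_s16(s, 3));   // arithmetic: keeps the sign bit
    vst1q_u16(u_out, vshrq_n_u16(u, 3));   // logical: fills with zeros
    vst1q_u16(r_out, vrshrq_n_u16(u, 3));  // rounded: (x + 4) >> 3
}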
//Vector shift right by constant and accumulate
-
-int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
-uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
//Vector rounding shift right by constant and accumulate
-
-int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
-uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
-uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
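// Illustrative usage sketch, not part of the patch; the helper name is made up. VSRA
// adds the shifted second operand into the first, folding a scale-and-accumulate into
// one instruction; VRSRA does the same with a rounded shift.
#include <arm_neon.h>

uint32x4_t sra_accumulate(uint32x4_t acc, const uint32_t in[4])
{
    uint32x4_t x = vld1q_u32(in);
    acc = vsraq_n_u32(acc, x, 4);   // acc += x >> 4 (truncating)
    acc = vrsraq_n_u32(acc, x, 4);  // acc += (x + 8) >> 4 (rounded)
    return acc;
}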
//Vector saturating shift left by constant
-
-int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
-uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
-uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
//Vector signed->unsigned saturating shift left by constant
-
-uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
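// Illustrative usage sketch, not part of the patch; the helper name is made up.
// VQSHL #imm saturates lanes whose shifted value no longer fits, and VQSHLU takes a
// signed input but produces an unsigned result, clamping negative lanes to zero.
#include <arm_neon.h>

void qshl_const_demo(const int8_t in[8], int8_t sat_out[8], uint8_t usat_out[8])
{
    int8x8_t a = vld1_s8(in);
    vst1_s8(sat_out,  vqshl_n_s8(a, 3));   // e.g. 100 << 3 saturates to 127
    vst1_u8(usat_out, vqshlu_n_s8(a, 3));  // negative lanes clamp to 0, max 255
}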
//Vector narrowing shift right by constant
-
+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
//Vector signed->unsigned narrowing saturating shift right by constant
-
+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
//Vector signed->unsigned rounding narrowing saturating shift right by constant
-
+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
//Vector narrowing saturating shift right by constant
-
+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
//Vector rounding narrowing shift right by constant
-
+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
//Vector rounding narrowing saturating shift right by constant
-
+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
//Vector widening shift left by constant
-
+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
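// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// narrowing shifts return the half-width element type (Q register in, D register out)
// and VSHLL widens the other way, so a widen/compute/narrow pipeline keeps extra
// precision in the middle.
#include <arm_neon.h>

void widen_narrow_demo(const uint8_t in[8], uint8_t out[8])
{
    uint16x8_t wide = vshll_n_u8(vld1_u8(in), 4);  // u8 -> u16, also << 4
    // ... 16-bit arithmetic on `wide` would go here ...
    uint8x8_t narrow = vqrshrn_n_u16(wide, 4);     // round, saturate, >> 4, back to u8
    vst1_u8(out, narrow);
}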
//Shifts with insert
//Vector shift right and insert
-
-int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
-poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
//Vector shift left and insert
-
-int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
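// Illustrative usage sketch, not part of the patch; the helper name and field layout
// are made up. VSLI/VSRI only overwrite the bits that the shift moves in, so two
// vectors can be packed into bit fields without a separate mask-and-or step.
#include <arm_neon.h>

// Pack a 4-bit high field and a 4-bit low field into one byte per lane
// (both inputs assumed already reduced to the range 0..15).
void bitfield_pack_demo(const uint8_t hi[8], const uint8_t lo[8], uint8_t out[8])
{
    uint8x8_t h = vld1_u8(hi);
    uint8x8_t l = vld1_u8(lo);
    vst1_u8(out, vsli_n_u8(l, h, 4));  // per lane: (h << 4) | (l & 0x0F)
}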
//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
//Load a single vector from memory
-uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
-float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
-poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
-poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
-
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
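// Illustrative usage sketch, not part of the patch; the helper name is made up. VLD1
// loads one whole vector of contiguous elements, and the __transfersize annotation
// above records how many elements each call reads (16 bytes for a q-form u8 load).
#include <arm_neon.h>

// Copy n bytes, 16 at a time (n assumed to be a multiple of 16 for brevity).
void copy16_demo(const uint8_t *src, uint8_t *dst, int n)
{
    for (int i = 0; i < n; i += 16) {
        uint8x16_t v = vld1q_u8(src + i);  // reads 16 bytes
        vst1q_u8(dst + i, v);              // writes 16 bytes
    }
}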
//Load a single lane from memory
-uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
-int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
-float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
-int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
-poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
-poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
-
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
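// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// lane forms read a single element and leave every other lane of the given vector
// untouched, which is the usual way to patch one slot of a vector already in a register.
#include <arm_neon.h>

uint8x16_t replace_first_lane(uint8x16_t v, const uint8_t *p)
{
    return vld1q_lane_u8(p, v, 0);  // lane 0 <- *p, lanes 1..15 unchanged
}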
//Load all lanes of vector with same value from memory
-uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
-int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
-float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
-float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
-poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
-poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
-
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
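// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// _dup forms broadcast a single element from memory into every lane, which is handy
// for loading a scalar coefficient once per kernel.
#include <arm_neon.h>

float32x4_t load_coefficient(const float32_t *coef)
{
    return vld1q_dup_f32(coef);  // all four lanes = *coef
}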
//Store a single vector or lane. Stores all lanes or a single lane of a vector.
//Store a single vector into memory
-void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
-void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
-void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
-void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
-void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
-void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
-void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
-void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
-void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
-void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
-void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
-void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
-
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
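// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// d-register stores write half of a q register (8 bytes of u8 data here), which is
// convenient for the tail of a loop that otherwise stores 16 bytes per iteration.
#include <arm_neon.h>

void store_tail8(uint8_t *dst, uint8x8_t tail)
{
    vst1_u8(dst, tail);  // writes exactly 8 bytes
}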
//Store a lane of a vector into memory
//Loads of an N-element structure
//Load N-element structure from memory
-uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
-float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
-poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
-poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
-uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
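// Illustrative usage sketch, not part of the patch; the helper name is made up. The
// vldN forms de-interleave an N-element structure on load, so packed RGB bytes come
// back as three separate planes, one per .val[] member of the returned struct type.
#include <arm_neon.h>

// Extract the green channel of 16 packed RGB pixels (reads 48 bytes).
void green_channel_demo(const uint8_t *rgb, uint8_t green[16])
{
    uint8x16x3_t px = vld3q_u8(rgb);  // px.val[0] = R, px.val[1] = G, px.val[2] = B
    vst1q_u8(green, px.val[1]);
}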
//Load all lanes of N-element structure with same value from memory
-uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
-int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
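/* Usage sketch for the vldN_dup loads declared above: each of the N elements
   read from memory is broadcast across every lane of its own vector.  The
   wrapper below is illustrative only and assumes this header is visible as
   <arm_neon.h>. */
#include <arm_neon.h>

static uint8x8x2_t broadcast_pair(const uint8_t pair[2])
{
    /* all lanes of .val[0] hold pair[0]; all lanes of .val[1] hold pair[1] */
    return vld2_dup_u8(pair);
}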
//Load a single lane of N-element structure from memory
//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
-uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
-poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
-uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
-int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
-poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
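/* Usage sketch for the _ptr lane loads above: as the earlier C2719 note says,
   these forms take the existing vectors through a pointer and replace only the
   requested lane of each one.  Illustrative wrapper, assuming <arm_neon.h>. */
#include <arm_neon.h>

static uint16x4x2_t load_into_lane1(const uint16_t src2[2], uint16x4x2_t cur)
{
    /* lane 1 of cur.val[0] becomes src2[0], lane 1 of cur.val[1] becomes
       src2[1]; all other lanes are returned unchanged */
    return vld2_lane_u16_ptr(src2, &cur, 1);
}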
//Store N-element structure to memory
-void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
-void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
-void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
-void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
-void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
-void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
-void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
-void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
-void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
-void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
-void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
-void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
-void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
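/* Usage sketch for the _ptr stores above: the vectors in val are written back
   interleaved (val[0] lane 0, val[1] lane 0, val[0] lane 1, ...), so a
   two-vector byte store needs a 16-byte destination.  Illustrative wrapper,
   assuming <arm_neon.h>. */
#include <arm_neon.h>

static void store_interleaved(uint8_t dst[16], uint8x8x2_t pair)
{
    vst2_u8_ptr(dst, &pair);    /* VST2.8 {d0, d1}, [r0] */
}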
//Store a single lane of N-element structure to memory
-void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
-void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
-void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
-void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
-void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
-void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
-void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
-void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
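/* Usage sketch for the lane stores above: only the chosen lane of each vector
   is written, so a two-vector lane store emits exactly two elements.
   Illustrative wrapper, assuming <arm_neon.h>. */
#include <arm_neon.h>

static void store_lane0(uint16_t dst2[2], uint16x4x2_t pair)
{
    /* writes pair.val[0] lane 0 followed by pair.val[1] lane 0 */
    vst2_lane_u16_ptr(dst2, &pair, 0);
}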
//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
-
-uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
-int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
-int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
-poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
-float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
-
-int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
-uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
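/* Usage sketch for the lane extracts above: the lane argument must be a
   compile-time constant inside the __constrange bounds.  Illustrative wrapper,
   assuming <arm_neon.h>. */
#include <arm_neon.h>

static uint32_t third_lane(uint32x4_t v)
{
    return vgetq_lane_u32(v, 2);    /* lanes of a uint32x4_t are 0..3 */
}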
//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
-
-uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
-poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
-float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
-
-int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
-uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
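/* Usage sketch for the lane inserts above: the vector is returned with one
   lane replaced; the argument itself is not modified in place.  Illustrative
   wrapper, assuming <arm_neon.h>. */
#include <arm_neon.h>

static float32x4_t zero_lane0(float32x4_t v)
{
    return vsetq_lane_f32(0.0f, v, 0);    /* VMOV.32 d0[0],r0 */
}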
//Initialize a vector from a literal bit pattern.
-
+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
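/* Usage sketch for vcreate above: the 64-bit literal supplies the raw bit
   pattern of the vector, with the least significant bits landing in lane 0.
   Illustrative wrapper, assuming <arm_neon.h>. */
#include <arm_neon.h>

static uint8x8_t byte_ramp(void)
{
    /* lanes 0..7 become 0x00, 0x11, ..., 0x77 */
    return vcreate_u8(0x7766554433221100ULL);
}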
//Set all lanes to same value
//Load all lanes of vector to the same literal value
-
-uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
-uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
-uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
-int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
-int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
-int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
-poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
-poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
-float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
-
-int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
-uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
-
-uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
-uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
-uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
-int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
-int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
-int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
-poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
-poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
-float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
-
-int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
-uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
//Load all lanes of the vector to the value of a lane of a vector
-
+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
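A small sketch of the broadcast forms listed above (vdup*_n_* duplicates a scalar, vdupq_lane_* duplicates one lane of a D vector across a Q vector); the helper name is illustrative and the header is assumed usable as <arm_neon.h>:

#include <arm_neon.h>

int16x8_t scale_all(int16x4_t coeffs)
{
    /* Broadcast lane 2 of 'coeffs' across all eight lanes of a Q register. */
    int16x8_t c = vdupq_lane_s16(coeffs, 2);
    /* Broadcast an immediate scalar the same way. */
    int16x8_t bias = vdupq_n_s16(100);
    return vaddq_s16(c, bias);
}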
//Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
-
+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
//Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors.
-
+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
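A brief sketch of the split/combine round trip (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Add the high and low halves of a 128-bit vector, then rebuild a Q vector
   whose two halves both hold that sum. */
uint16x8_t fold_halves(uint16x8_t v)
{
    uint16x4_t lo  = vget_low_u16(v);
    uint16x4_t hi  = vget_high_u16(v);
    uint16x4_t sum = vadd_u16(lo, hi);
    return vcombine_u16(sum, sum);
}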
//Converting vectors. These intrinsics are used to convert vectors.
//Convert from float
-
-int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-
-int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
//Convert to float
-
-float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
-float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-
-float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
-float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
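The _n_ conversion forms treat the integer operand as fixed point with the given number of fractional bits; a minimal sketch of Q16.16 handling (helper names are illustrative, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Convert Q16.16 fixed-point samples to float and back. */
float32x4_t q16_to_float(int32x4_t fixed)
{
    return vcvtq_n_f32_s32(fixed, 16);   /* value / 2^16 */
}

int32x4_t float_to_q16(float32x4_t f)
{
    return vcvtq_n_s32_f32(f, 16);       /* value * 2^16, rounded toward zero */
}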
//Convert between floats
-
+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Vector narrow integer
-
+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
//Vector long move
-
+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
//Vector saturating narrow integer
-
+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
//Vector saturating narrow integer signed->unsigned
-
+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
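These widen/narrow pairs are the usual way to do 8-bit arithmetic at 16-bit precision and clamp back down; a small sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Sum two u8 vectors at 16-bit width, then saturate back to u8. */
uint8x8_t widen_add_narrow(uint8x8_t a, uint8x8_t b)
{
    uint16x8_t wa  = vmovl_u8(a);          /* widen to 16 bits */
    uint16x8_t wb  = vmovl_u8(b);
    uint16x8_t sum = vaddq_u16(wa, wb);    /* cannot overflow at 16 bits */
    return vqmovn_u16(sum);                /* saturating narrow back to u8 */
}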
//Table look up
-
+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
//Extended table look up intrinsics
-
+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
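A sketch of a byte permutation using the one-register table form (assuming <arm_neon.h>); indices outside 0..7 yield zero in the result, which is often used to blank lanes:

#include <arm_neon.h>

/* Reverse the byte order of a 64-bit vector with a table lookup. */
uint8x8_t reverse_bytes(uint8x8_t data)
{
    const uint8x8_t idx = vcreate_u8(0x0001020304050607ULL); /* lanes {7,6,...,0} */
    return vtbl1_u8(data, idx);
}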
//Operations with a scalar value
//Vector multiply accumulate with scalar
-
+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
//Vector widening multiply accumulate with scalar
-
+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
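A sketch of the by-lane widening multiply-accumulate: one D register of coefficients can feed several accumulations without reloading (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* acc[i] += samples[i] * coeffs[lane 0], accumulated at 32-bit width. */
int32x4_t mac_by_coeff0(int32x4_t acc, int16x4_t samples, int16x4_t coeffs)
{
    return vmlal_lane_s16(acc, samples, coeffs, 0);
}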
//Vector widening saturating doubling multiply accumulate with scalar
-
+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
//Vector multiply subtract with scalar
-
+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
//Vector widening multiply subtract with scalar
-
+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
-
+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
//Vector multiply by scalar
-
-int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
//Vector long multiply with scalar
-
+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
//Vector long multiply by scalar
-
+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
//Vector saturating doubling long multiply with scalar
-
+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling long multiply by scalar
-
+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
//Vector saturating doubling multiply high with scalar
-
-int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating doubling multiply high by scalar
-
+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
//Vector saturating rounding doubling multiply high with scalar
-
-int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
//Vector rounding saturating doubling multiply high by scalar
-
+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
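The doubling-multiply-high forms above are the standard Q15/Q31 fixed-point scaling primitives; a minimal sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Multiply Q15 samples by a Q15 gain: (2*a*gain + 0x8000) >> 16, with
   saturation (0x8000 * 0x8000 would otherwise overflow). */
int16x8_t scale_q15(int16x8_t samples, int16_t gain_q15)
{
    return vqrdmulhq_n_s16(samples, gain_q15);
}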
//Vector multiply accumulate with scalar
-
-int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
-uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
-float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
//Vector widening multiply accumulate with scalar
-
+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply accumulate with scalar
-
+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
//Vector multiply subtract with scalar
-
-int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
//Vector widening multiply subtract with scalar
-
+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
//Vector widening saturating doubling multiply subtract with scalar
-
+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
//Vector extract
-
-int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
-int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
-int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
-int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
-uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
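VEXT concatenates two vectors and extracts a shifted window, which is how unaligned neighbours are formed in sliding-window code; a small sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Given two consecutive 16-byte blocks, return bytes 3..18, i.e. the window
   starting three elements into 'lo'. */
uint8x16_t shifted_window(uint8x16_t lo, uint8x16_t hi)
{
    return vextq_u8(lo, hi, 3);
}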
//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-
-int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
-uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
-uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
-poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
-poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
-
-int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
-uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
-poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
-
-int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
-poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
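VREV is the usual lane-wise byte or halfword swap, e.g. endianness conversion of packed 32-bit words; a sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Byte-swap each 32-bit word held in the vector. */
uint8x16_t bswap32_lanes(uint8x16_t words_as_bytes)
{
    return vrev32q_u8(words_as_bytes);
}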
//Other single operand arithmetic
//Absolute: Vd[i] = |Va[i]|
-
-int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
-int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
-int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
-float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
//Saturating absolute: Vd[i] = sat(|Va[i]|)
-
-int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
//Negate: Vd[i] = - Va[i]
-
-int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
-int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
-int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
-float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
//Saturating Negate: sat(Vd[i] = - Va[i])
-
-int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
-int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
-int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
//Count leading sign bits
-
-int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
-int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
-int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
//Count leading zeros
-
-int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
-int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
-int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
-uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
-uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
-uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
//Count number of set bits
-
-uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
-int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
-poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
//Reciprocal estimate
-
-float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
//Reciprocal square root estimate
-
-float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
//Logical operations
//Bitwise not
-
-int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
-uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
-uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
-poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
//Bitwise and
-
-int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
-int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
-int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
-int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
-uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
-uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
-uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
-uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
//Bitwise or
-
-int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
-int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
-int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
-int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
-uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
-uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
-uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
-uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
//Bitwise exclusive or (EOR or XOR)
-
-int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
-int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
-int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
-int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
-uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
-uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
-uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
-uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
//Bit Clear
-
-int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
//Bitwise OR complement
-
-int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
-uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
//Bitwise Select
-
-int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
-int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
-int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
-uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
-uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
-uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
-uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
-float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
-poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
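VBSL takes bits from its second operand where the mask is 1 and from the third where it is 0, so together with the compare intrinsics it gives a branch-free select; a minimal sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* Per-lane maximum of two float vectors, written as compare + bitwise select. */
float32x4_t select_max(float32x4_t a, float32x4_t b)
{
    uint32x4_t a_ge_b = vcgeq_f32(a, b);   /* all-ones where a >= b */
    return vbslq_f32(a_ge_b, a, b);        /* take a there, b elsewhere */
}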
//Transposition operations
//Transpose elements
-
-int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
-uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
-uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
-float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
-poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
//Interleave elements
-
-int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
-uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
-uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
-float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
-poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
//De-Interleave elements
-int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
-uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
-uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
-float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
-poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
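ZIP and UZP are inverses: ZIP interleaves two vectors element by element and UZP separates even and odd elements again, which is how packed records such as interleaved stereo samples are split; a small sketch (hypothetical helper, assuming <arm_neon.h>):

#include <arm_neon.h>

/* De-interleave 8 stereo frames (L0 R0 L1 R1 ...) held in two Q registers. */
void split_stereo(int16x8_t in_lo, int16x8_t in_hi,
                  int16x8_t *left, int16x8_t *right)
{
    int16x8x2_t ch = vuzpq_s16(in_lo, in_hi);
    *left  = ch.val[0];   /* even lanes: L0..L7 */
    *right = ch.val[1];   /* odd lanes:  R0..R7 */
    /* vzipq_s16(*left, *right) would re-interleave them. */
}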
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// The following macros work around the "immediate parameter required" restriction of some x86 intrinsics. For a release build this is not strictly necessary,
//but for a debug build it is needed for the code to compile at all, instead of failing with the "Intrinsic parameter must be an immediate value" error.
//
-#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined __INTEL_COMPILER )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
- #if defined(USE_SSSE3)
- #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
- #endif
+ #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
#define _MM_EXTRACT_EPI16 _mm_extract_epi16
#define _MM_INSERT_EPI16 _mm_insert_epi16
- #ifdef USE_SSE4
+#ifdef USE_SSE4
#define _MM_EXTRACT_EPI8 _mm_extract_epi8
#define _MM_EXTRACT_EPI32 _mm_extract_epi32
#define _MM_EXTRACT_PS _mm_extract_ps
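For release builds the _MM_* wrappers above map straight to the SSE intrinsics; the #else branch below instead builds them from switch dispatchers so every intrinsic call site passes a literal lane number. A standalone sketch of that dispatch idea (not the header's exact macro; assumes only SSE2's <emmintrin.h>):

#include <emmintrin.h>   /* SSE2: _mm_insert_epi16 */

/* Sketch of the switch-dispatch idea behind _MM_INSERT_EPI16: each case hands
   the intrinsic a literal immediate, so the code still compiles when 'lane'
   is not folded to a compile-time constant in a non-inlined debug build. */
static __m128i insert_epi16_dispatch(__m128i vec, int p, int lane)
{
    switch (lane) {
    case 0: return _mm_insert_epi16(vec, p, 0);
    case 1: return _mm_insert_epi16(vec, p, 1);
    case 2: return _mm_insert_epi16(vec, p, 2);
    case 3: return _mm_insert_epi16(vec, p, 3);
    case 4: return _mm_insert_epi16(vec, p, 4);
    case 5: return _mm_insert_epi16(vec, p, 5);
    case 6: return _mm_insert_epi16(vec, p, 6);
    default: return _mm_insert_epi16(vec, p, 7);
    }
}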
@@ -1435,11 +2257,11 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
#define _MM_INSERT_EPI8 _mm_insert_epi8
#define _MM_INSERT_EPI32 _mm_insert_epi32
#define _MM_INSERT_PS _mm_insert_ps
- #ifdef _M_X64
+#ifdef _NEON2SSE_64BIT
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
- #endif
- #endif //SSE4
+#endif
+#endif //SSE4
#else
#define _NEON2SSE_COMMA ,
#define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
@@ -1488,12 +2310,10 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
default: return NAME(vec p,case0); \
}
- #if defined(USE_SSSE3)
_NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
{
_NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
}
- #endif
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
{
@@ -1505,7 +2325,7 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
}
- #ifdef USE_SSE4
+#ifdef USE_SSE4
_NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
@@ -1530,11 +2350,12 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
{
_NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
}
- #ifdef _M_X64
+
+#ifdef _NEON2SSE_64BIT
+ //the special case of functions available only for SSE4 and 64-bit build.
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
{
- switch(LANE)
- {
+ switch(LANE) {
case 0:
return _mm_insert_epi64(vec, p, 0);
case 1:
@@ -1549,13 +2370,14 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
if (LANE ==0) return _mm_extract_epi64(val, 0);
else return _mm_extract_epi64(val, 1);
}
- #endif
+#endif
+
_NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
{
_NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
}
- #endif //USE_SSE4
+#endif //USE_SSE4
#endif //#ifdef NDEBUG
@@ -1587,6 +2409,8 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
#define _MM_MULLO_EPI32 _mm_mullo_epi32
#define _MM_MUL_EPI32 _mm_mul_epi32
+
+ #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
#else //no SSE4 !!!!!!
_NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
{
@@ -1655,8 +2479,8 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
__m128i vec_masked, p_masked;
pvec[LANE] = p;
mask[LANE] = 0x0;
- vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
- p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
return _mm_or_si128(vec_masked, p_masked);
}
@@ -1667,8 +2491,8 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
__m128i vec_masked, p_masked;
pvec[LANE] = (int8_t)p;
mask[LANE] = 0x0;
- vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
- p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
return _mm_or_si128(vec_masked, p_masked);
}
@@ -1676,9 +2500,9 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
{
_NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
__m128 tmp, vec_masked, p_masked;
- mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it
- vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
- p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
+ mask[LANE >> 4] = 0x0; //here the LANE is not the actual lane, need to deal with it
+ vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
+ p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
tmp = _mm_or_ps(vec_masked, p_masked);
return tmp;
}
@@ -1704,11 +2528,11 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
{
__m128i c8000, b_s, a_s, cmp;
- c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
- c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
b_s = _mm_sub_epi16 (b, c8000);
a_s = _mm_sub_epi16 (a, c8000);
- cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
+ cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
@@ -1717,11 +2541,11 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
{
__m128i c80000000, b_s, a_s, cmp;
- c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
- c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
b_s = _mm_sub_epi32 (b, c80000000);
a_s = _mm_sub_epi32 (a, c80000000);
- cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
+ cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
@@ -1748,11 +2572,11 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
{
__m128i c8000, b_s, a_s, cmp;
- c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
- c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+ c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+ c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
b_s = _mm_sub_epi16 (b, c8000);
a_s = _mm_sub_epi16 (a, c8000);
- cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
+ cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
@@ -1761,25 +2585,25 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
{
__m128i c80000000, b_s, a_s, cmp;
- c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
- c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+ c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+ c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
b_s = _mm_sub_epi32 (b, c80000000);
a_s = _mm_sub_epi32 (a, c80000000);
- cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
+ cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
a_s = _mm_and_si128 (cmp,a);
b_s = _mm_andnot_si128 (cmp,b);
return _mm_or_si128(a_s, b_s);
}
- _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below
- { //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
+ _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below
+ {
+ //it assumes the mask is always either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the most significant bit of each mask byte matters.
__m128i a_masked, b_masked;
- b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
+ b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
a_masked = _mm_andnot_si128 (mask,a);
return _mm_or_si128(a_masked, b_masked);
}
- #if defined(USE_SSSE3)
_NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
{
_NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
@@ -1787,29 +2611,27 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
zero = _mm_setzero_si128();
a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
- res = _mm_unpacklo_epi64(a16, b16); //result without saturation
- reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
- cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
- res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
- cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
- return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
+ res = _mm_unpacklo_epi64(a16, b16); //result without saturation
+ reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+ res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+ return _mm_or_si128(res, cmp); //if cmp is positive the result exceeds 16 bits and needs to saturate to 0xffff
}
- #endif
- #if defined(USE_SSSE3)
_NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
{
_NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
__m128i a16, res, reshi,cmp, zero;
zero = _mm_setzero_si128();
a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
- reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
- cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
- res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
- cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
- return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
+ reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
+ cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+ res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
+ cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+ return _mm_or_si128(res, cmp); //if cmp is positive the result exceeds 16 bits and needs to saturate to 0xffff
}
- #endif
+
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
{
@@ -1825,61 +2647,71 @@ poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
return _mm_load_si128((__m128i*)res);
}
- #if defined(USE_SSSE3)
_NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
{
__m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
sign = _mm_xor_si128 (a, b);
- sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
+ sign = _mm_srai_epi32 (sign, 31); //propagate the sign bit to all bits: all ones if negative, all zeros if positive
zero = _mm_setzero_si128();
- a_neg = _mm_abs_epi32 (a); //negate a and b
- b_neg = _mm_abs_epi32 (b); //negate a and b
- mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
+ a_neg = _mm_abs_epi32 (a); //absolute value of a
+ b_neg = _mm_abs_epi32 (b); //absolute value of b
+ mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses data lanes 0 and 2 (absolute values), the multiplication gives a 64-bit result
mul_us_neg = _mm_sub_epi64(zero, mul_us);
mul_us_neg = _mm_and_si128(sign, mul_us_neg);
mul_us = _mm_andnot_si128(sign, mul_us);
return _mm_or_si128 (mul_us, mul_us_neg);
}
- #endif
-#endif //SSE4
-#ifndef _MM_INSERT_EPI64 //special case of SSE4 and _M_X64
- _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
- {
- _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
- _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff,0xffffffffffffffff};
- __m128i vec_masked, p_masked;
- pvec[LANE] = p;
- mask[LANE] = 0x0;
- vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
- p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
- return _mm_or_si128(vec_masked, p_masked);
- }
-#endif
-#ifndef _MM_EXTRACT_EPI64 //special case of SSE4 and _M_X64
- _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
+ _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
{
- _NEON2SSE_ALIGN_16 int64_t tmp[2];
- _mm_store_si128((__m128i*)tmp, val);
- return tmp[LANE];
+ __m128i res;
+ res = _mm_cmpeq_epi32 (a, b);
+ return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
}
+#endif //SSE4
+
+//special case: fallback implementations for builds without 64-bit SSE4 support
+_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
+{
+ _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
+ _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
+ __m128i vec_masked, p_masked;
+ pvec[LANE] = p;
+ mask[LANE] = 0x0;
+ vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+ p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+ return _mm_or_si128(vec_masked, p_masked);
+}
+
+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
+{
+ _NEON2SSE_ALIGN_16 int64_t tmp[2];
+ _mm_store_si128((__m128i*)tmp, val);
+ return tmp[LANE];
+}
+
+#ifndef _NEON2SSE_64BIT_SSE4
+ #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
+ #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
#endif
-int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
+int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
-{ //Overflow happens only if a and sum have the opposite signs
+{
+ //Overflow happens only if a and sum have the opposite signs
__m128i c7fffffff, res, res_sat, res_xor_a;
c7fffffff = _mm_set1_epi32(0x7fffffff);
- res = _mm_slli_epi32 (a, 1); // res = a*2
+ res = _mm_slli_epi32 (a, 1); // res = a*2
res_sat = _mm_srli_epi32(a, 31);
res_sat = _mm_add_epi32(res_sat, c7fffffff);
res_xor_a = _mm_xor_si128(res, a);
- res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
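
/* Scalar model of the vqd_s32 selection logic above (hypothetical helper, not part
   of the header, two's complement wraparound assumed): saturate only when doubling
   flips the sign of a, to INT32_MAX for non-negative a and INT32_MIN for negative a. */
#include <stdint.h>
static int32_t vqd_s32_scalar(int32_t a)
{
    int32_t res = (int32_t)((uint32_t)a << 1); /* a*2 with wraparound */
    int32_t sat = (int32_t)(((uint32_t)a >> 31) + 0x7fffffffu); /* INT32_MAX or INT32_MIN */
    return ((res ^ a) < 0) ? sat : res; /* a sign flip means overflow */
}
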
+
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//*************************************************************************
//*************************************************************************
@@ -1892,6 +2724,7 @@ _NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
#ifdef ARM
#define vector_addq_s32 _mm_add_epi32
#else //if we have IA
+#define vector_addq_s32 vadd_s32
#endif
********************************************************************************************
@@ -1902,54 +2735,264 @@ Each NEON intrinsic function has one of the following options:
2. x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
3. the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
4. for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
-the serial implementation is provided along with the corresponding compiler warnin//these functions are on your app critical path
+the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path
- please consider such functions removal from your code.
*/
//***********************************************************************
//************************ Vector add *****************************
//***********************************************************************
+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res64;
+ res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
+ return res64;
+}
+
+
+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
+{
+ __m128 res;
+ __m64_128 res64;
+ res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
+ _M64f(res64, res);
+ return res64;
+}
+
+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+#define vadd_u8 vadd_s8
+
+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+#define vadd_u16 vadd_s16
-int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+#define vadd_u32 vadd_s32
+
+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
+{
+ uint64x1_t res64;
+ res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
+ return res64;
+}
+
+
+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_s8 _mm_add_epi8
-int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_s16 _mm_add_epi16
-int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_s32 _mm_add_epi32
-int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_s64 _mm_add_epi64
-float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
#define vaddq_f32 _mm_add_ps
-uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_u8 _mm_add_epi8
-uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_u16 _mm_add_epi16
-uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_u32 _mm_add_epi32
-uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_u64 _mm_add_epi64
//**************************** Vector long add *****************************:
//***********************************************************************
//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
+{
+ __m128i a16, b16;
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_add_epi16 (a16, b16);
+}
+
+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
+{
+ __m128i a32, b32;
+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi32 (a32, b32);
+}
+
+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
+{
+ //may be not optimal
+ __m128i a64, b64;
+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi64 ( a64, b64);
+}
+
+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
+{
+ __m128i a16, b16;
+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi16 (a16, b16);
+}
+
+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
+{
+ __m128i a32, b32;
+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi32 (a32, b32);
+}
+
+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
+{
+ //may be not optimal
+ __m128i a64, b64;
+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi64 (a64, b64);
+}
//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
//*************** *********************************************************************
+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
+{
+ __m128i b16;
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_add_epi16 (a, b16);
+}
+
+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
+{
+ __m128i b32;
+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
+ return _mm_add_epi32 (a, b32);
+}
+
+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
+{
+ __m128i b64;
+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi64 (a, b64);
+}
+
+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
+{
+ __m128i b16;
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi16 (a, b16);
+}
+
+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
+{
+ __m128i b32;
+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi32 (a, b32);
+}
+
+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
+{
+ __m128i b64;
+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_add_epi64 (a, b64);
+}
//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
//*************************************************************************************************************************
+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
-int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
-{ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+{
+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
@@ -1957,9 +3000,10 @@ _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
return _mm_add_epi8(tmp1,tmp2);
}
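
/* Scalar sketch of the identity used above (hypothetical helper, not part of the
   header): x + y == 2*(x & y) + (x ^ y), so the truncated halving add (x + y) >> 1
   equals (x & y) + ((x ^ y) >> 1) and never needs the 9-bit intermediate sum. */
#include <stdint.h>
static int8_t hadd_s8_scalar(int8_t x, int8_t y)
{
    return (int8_t)((x & y) + ((x ^ y) >> 1)); /* e.g. hadd_s8_scalar(127, 127) == 127 */
}
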
-int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
-{ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+{
+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
@@ -1967,9 +3011,10 @@ _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
return _mm_add_epi16(tmp1,tmp2);
}
-int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
-{ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
+{
+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
@@ -1977,31 +3022,32 @@ _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD
return _mm_add_epi32(tmp1,tmp2);
}
-uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
{
__m128i c1, sum, res;
c1 = _mm_set1_epi8(1);
- sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
- res = _mm_xor_si128(a, b); //for rounding compensation
- res = _mm_and_si128(res,c1); //for rounding compensation
- return _mm_sub_epi8 (sum, res); //actual rounding compensation
+ sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
+ res = _mm_xor_si128(a, b); //for rounding compensation
+ res = _mm_and_si128(res,c1); //for rounding compensation
+ return _mm_sub_epi8 (sum, res); //actual rounding compensation
}
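
/* Scalar sketch of the rounding compensation above (hypothetical helper, not part of
   the header): _mm_avg_epu8 yields (a + b + 1) >> 1, while VHADD.U8 needs the
   truncated (a + b) >> 1; they differ by one exactly when a + b is odd, i.e. when
   bit 0 of (a ^ b) is set, so that bit is subtracted back. */
#include <stdint.h>
static uint8_t hadd_u8_scalar(uint8_t a, uint8_t b)
{
    uint8_t rounded = (uint8_t)(((unsigned)a + b + 1u) >> 1); /* per-byte _mm_avg_epu8 result */
    return (uint8_t)(rounded - ((a ^ b) & 1u)); /* truncated halving add */
}
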
-uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
{
__m128i sum, res;
- sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
- res = _mm_xor_si128(a, b); //for rounding compensation
- res = _mm_slli_epi16 (res,15); //shift left then back right to
- res = _mm_srli_epi16 (res,15); //get 1 or zero
- return _mm_sub_epi16 (sum, res); //actual rounding compensation
+ sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
+ res = _mm_xor_si128(a, b); //for rounding compensation
+ res = _mm_slli_epi16 (res,15); //shift left then back right to
+ res = _mm_srli_epi16 (res,15); //get 1 or zero
+ return _mm_sub_epi16 (sum, res); //actual rounding compensation
}
-uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
-{ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
+{
+ //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
__m128i tmp1, tmp2;
tmp1 = _mm_and_si128(a,b);
tmp2 = _mm_xor_si128(a,b);
@@ -2011,77 +3057,202 @@ _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VH
//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
//*****************************************************************************************************************************
+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
+}
-//SSE, result rounding!!!
-//SSE, result rounding!!!
+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
-int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
-{ //no signed average in x86 SIMD, go to unsigned
+
+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
+{
+ //no signed average in x86 SIMD, go to unsigned
__m128i c128, au, bu, sum;
- c128 = _mm_set1_epi8(128);
- au = _mm_add_epi8(a, c128);
- bu = _mm_add_epi8(b, c128);
+ c128 = _mm_set1_epi8(0x80); //-128
+ au = _mm_sub_epi8(a, c128); //add 128
+ bu = _mm_sub_epi8(b, c128); //add 128
sum = _mm_avg_epu8(au, bu);
- return _mm_sub_epi8 (sum, c128);
+ return _mm_add_epi8 (sum, c128); //sub 128
}
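
/* Scalar sketch of the bias trick above (hypothetical helper, not part of the header):
   SSE has no signed byte average, so both inputs are shifted by 128 into the unsigned
   range (the code does this by subtracting -128), averaged with rounding, then
   shifted back. */
#include <stdint.h>
static int8_t rhadd_s8_scalar(int8_t a, int8_t b)
{
    unsigned au = (uint8_t)a ^ 0x80u; /* bias by +128 into [0, 255] */
    unsigned bu = (uint8_t)b ^ 0x80u;
    unsigned avg = (au + bu + 1u) >> 1; /* unsigned rounding average */
    return (int8_t)(uint8_t)(avg ^ 0x80u); /* remove the bias */
}
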
-int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
-{ //no signed average in x86 SIMD, go to unsigned
+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
+{
+ //no signed average in x86 SIMD, go to unsigned
__m128i cx8000, au, bu, sum;
- cx8000 = _mm_set1_epi16(0x8000);
- au = _mm_add_epi16(a, cx8000);
- bu = _mm_add_epi16(b, cx8000);
+ cx8000 = _mm_set1_epi16(0x8000); // - 32768
+ au = _mm_sub_epi16(a, cx8000); //add 32768
+ bu = _mm_sub_epi16(b, cx8000); //add 32768
sum = _mm_avg_epu16(au, bu);
- return _mm_sub_epi16 (sum, cx8000);
+ return _mm_add_epi16 (sum, cx8000); //sub 32768
}
-int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
-{ //need to avoid overflow
+{
+ //need to avoid overflow
__m128i a2, b2, res, sum;
- a2 = _mm_srai_epi32(a,1); //a2=a/2;
- b2 = _mm_srai_epi32(b,1); // b2=b/2;
- res = _mm_or_si128(a,b); //for rounding
- res = _mm_slli_epi32 (res,31); //shift left then back right to
- res = _mm_srli_epi32 (res,31); //get 1 or zero
+ a2 = _mm_srai_epi32(a,1); //a2=a/2;
+ b2 = _mm_srai_epi32(b,1); // b2=b/2;
+ res = _mm_or_si128(a,b); //for rounding
+ res = _mm_slli_epi32 (res,31); //shift left then back right to
+ res = _mm_srli_epi32 (res,31); //get 1 or zero
sum = _mm_add_epi32(a2,b2);
return _mm_add_epi32(sum,res);
}
-uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
-#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
+
+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
-uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
-#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
-uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
-{ //need to avoid overflow
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
+{
+ //need to avoid overflow
__m128i a2, b2, res, sum;
- a2 = _mm_srli_epi32(a,1); //a2=a/2;
- b2 = _mm_srli_epi32(b,1); // b2=b/2;
- res = _mm_or_si128(a,b); //for rounding
- res = _mm_slli_epi32 (res,31); //shift left then back right to
- res = _mm_srli_epi32 (res,31); //get 1 or zero
+ a2 = _mm_srli_epi32(a,1); //a2=a/2;
+ b2 = _mm_srli_epi32(b,1); // b2=b/2;
+ res = _mm_or_si128(a,b); //for rounding
+ res = _mm_slli_epi32 (res,31); //shift left then back right to
+ res = _mm_srli_epi32 (res,31); //get 1 or zero
sum = _mm_add_epi32(a2,b2);
return _mm_add_epi32(sum,res);
}
//****************** VQADD: Vector saturating add ************************
//************************************************************************
+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int64x1_t res;
+ uint64_t a64, b64;
+ a64 = a.m64_u64[0];
+ b64 = b.m64_u64[0];
+ res.m64_u64[0] = a64 + b64;
+ a64 = (a64 >> 63) + (~_SIGNBIT64);
+ if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
+ res.m64_u64[0] = a64;
+ }
+ return res;
+}
+
+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
+}
-int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+
+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ _NEON2SSE_ALIGN_16 uint64_t a64, b64;
+ uint64x1_t res;
+ a64 = a.m64_u64[0];
+ b64 = b.m64_u64[0];
+ res.m64_u64[0] = a64 + b64;
+ if (res.m64_u64[0] < a64) {
+ res.m64_u64[0] = ~(uint64_t)0;
+ }
+ return res;
+}
+
+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
#define vqaddq_s8 _mm_adds_epi8
-int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
#define vqaddq_s16 _mm_adds_epi16
-int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
-{ //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
+{
+ //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
__m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
c7fffffff = _mm_set1_epi32(0x7fffffff);
res = _mm_add_epi32(a, b);
@@ -2090,13 +3261,13 @@ _NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
res_xor_a = _mm_xor_si128(res, a);
b_xor_a_ = _mm_xor_si128(b, a);
res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
- res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
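
/* Scalar sketch of the saturation rule above (hypothetical helper, not part of the
   header): signed overflow occurs only when a and b share a sign and the wrapped sum
   has the opposite sign; the result then saturates towards the sign of a. */
#include <stdint.h>
static int32_t qadd_s32_scalar(int32_t a, int32_t b)
{
    int32_t sum = (int32_t)((uint32_t)a + (uint32_t)b); /* wrapping two's complement add */
    if (((a ^ b) >= 0) && ((sum ^ a) < 0)) /* same sign in, sign flip out */
        return (a >= 0) ? INT32_MAX : INT32_MIN;
    return sum;
}
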
-int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
@@ -2117,13 +3288,13 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a,
return _mm_load_si128((__m128i*)res);
}
-uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
#define vqaddq_u8 _mm_adds_epu8
-uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
#define vqaddq_u16 _mm_adds_epu16
-uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
{
__m128i c80000000, cmp, subsum, suba, sum;
@@ -2131,11 +3302,11 @@ _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
sum = _mm_add_epi32 (a, b);
subsum = _mm_sub_epi32 (sum, c80000000);
suba = _mm_sub_epi32 (a, c80000000);
- cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
- return _mm_or_si128 (sum, cmp); //saturation
+ cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
+ return _mm_or_si128 (sum, cmp); //saturation
}
-uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
{
@@ -2144,8 +3315,8 @@ uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
sum = _mm_add_epi64 (a, b);
subsum = _mm_sub_epi64 (sum, c80000000);
suba = _mm_sub_epi64 (a, c80000000);
- cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
- return _mm_or_si128 (sum, cmp); //saturation
+ cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
+ return _mm_or_si128 (sum, cmp); //saturation
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
@@ -2161,11 +3332,142 @@ uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
}
#endif
+
//******************* Vector add high half (truncated) ******************
//************************************************************************
+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+ int8x8_t res64;
+ __m128i sum;
+ sum = _mm_add_epi16 (a, b);
+ sum = _mm_srai_epi16 (sum, 8);
+ sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
+ return64(sum);
+}
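
/* Scalar sketch of the narrowing above (hypothetical helper, not part of the header):
   after the arithmetic shift by 8 every 16-bit lane holds a value in [-128, 127], so
   the signed pack cannot saturate and simply keeps the low bytes. */
#include <stdint.h>
static int8_t addhn_s16_scalar(int16_t a, int16_t b)
{
    int16_t sum = (int16_t)((uint16_t)a + (uint16_t)b); /* wrapping 16-bit add */
    return (int8_t)(sum >> 8); /* high half, truncated */
}
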
+
+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+ int16x4_t res64;
+ __m128i sum;
+ sum = _mm_add_epi32 (a, b);
+ sum = _mm_srai_epi32(sum, 16);
+ sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
+ return64(sum);
+}
+
+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
+{
+ int32x2_t res64;
+ __m128i sum;
+ sum = _mm_add_epi64 (a, b);
+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
+ return64(sum);
+}
+
+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+ uint8x8_t res64;
+ __m128i sum;
+ sum = _mm_add_epi16 (a, b);
+ sum = _mm_srli_epi16 (sum, 8);
+ sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
+ return64(sum);
+}
+
+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+ uint16x4_t res64;
+ __m128i sum;
+ sum = _mm_add_epi32 (a, b);
+ sum = _mm_srli_epi32 (sum, 16);
+ sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
+ return64(sum);
+}
+
+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+#define vaddhn_u64 vaddhn_s64
//*********** Vector rounding add high half: vraddhn_<type> ******************.
//***************************************************************************
+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+ int8x8_t res64;
+ __m128i sum, mask1;
+ sum = _mm_add_epi16 (a, b);
+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
+ sum = _mm_srai_epi16 (sum, 8); //get high half
+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
+ sum = _mm_packs_epi16 (sum, sum);
+ return64(sum);
+}
+
+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
+{
+ //SIMD may be not optimal, serial may be faster
+ int16x4_t res64;
+ __m128i sum, mask1;
+ sum = _mm_add_epi32 (a, b);
+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
+ sum = _mm_srai_epi32 (sum, 16); //get high half
+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
+ sum = _mm_packs_epi32 (sum, sum);
+ return64(sum);
+}
+
+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
+{
+ //SIMD may be not optimal, serial may be faster
+ int32x2_t res64;
+ __m128i sum, mask1;
+ sum = _mm_add_epi64 (a, b);
+ mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
+ sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
+ sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
+ return64(sum);
+}
+
+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+ uint8x8_t res64;
+ __m128i sum, mask1;
+ sum = _mm_add_epi16 (a, b);
+ mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
+ sum = _mm_srai_epi16 (sum, 8); //get high half
+ sum = _mm_add_epi16 (sum, mask1); //actual rounding
+ sum = _mm_packus_epi16 (sum, sum);
+ return64(sum);
+}
+
+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
+{
+ //SIMD may be not optimal, serial may be faster
+ uint16x4_t res64;
+ __m128i sum, mask1;
+ sum = _mm_add_epi32 (a, b);
+ mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
+ sum = _mm_srai_epi32 (sum, 16); //get high half
+ sum = _mm_add_epi32 (sum, mask1); //actual rounding
+ sum = _MM_PACKUS1_EPI32 (sum);
+ return64(sum);
+}
+
+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+#define vraddhn_u64 vraddhn_s64
//**********************************************************************************
//********* Multiplication *************************************
@@ -2173,79 +3475,163 @@ uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//As we don't go to wider result functions are equal to "multiply low" in x86
+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
+{
+ // no 8 bit simd multiply, need to go to 16 bits in SSE
+ int8x8_t res64;
+ __m128i a128, b128, res;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+ res = _mm_mullo_epi16 (a128, b128);
+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
+ return64(res);
+}
+
+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
+#define vmul_s16 vmul_u16
-#if defined(USE_SSSE3)
-int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
-{ // no 8 bit simd multiply, need to go to 16 bits
- //solution may be not optimal
+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
+#define vmul_s32 vmul_u32
+
+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
+{
+ float32x4_t tmp;
+ __m64_128 res64;
+ tmp = _mm_mul_ps(_pM128(a),_pM128(b));
+ _M64f(res64, tmp); //use low 64 bits
+ return res64;
+}
+
+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
+{
+ // no 8 bit simd multiply, need to go to 16 bits in SSE
+ uint8x8_t res64;
+ __m128i mask, a128, b128, res;
+ mask = _mm_set1_epi16(0xff);
+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
+ res = _mm_mullo_epi16 (a128, b128);
+ res = _mm_and_si128(res, mask); //to avoid saturation
+ res = _mm_packus_epi16 (res,res); //use only low 64 bits
+ return64(res);
+}
+
+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint32x2_t res;
+ res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
+ res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
+ return res;
+}
+
+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
+{
+ //may be optimized
+ poly8x8_t res64;
+ __m128i a64, b64, c1, res, tmp, bmasked;
+ int i;
+ a64 = _pM128i(a);
+ b64 = _pM128i(b);
+ c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
+ c1 = vshrq_n_u8(c1,7); //0x1
+ bmasked = _mm_and_si128(b64, c1); //0x1
+ res = vmulq_u8(a64, bmasked);
+ for(i = 1; i<8; i++) {
+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+ bmasked = _mm_and_si128(b64, c1); //0x1
+ tmp = vmulq_u8(a64, bmasked);
+ res = _mm_xor_si128(res, tmp);
+ }
+ return64 (res);
+}
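
/* Scalar sketch of the carry-less (GF(2)) multiply modeled above (hypothetical helper,
   not part of the header): the polynomial product is the XOR of copies of a shifted by
   each set bit position of b, truncated to 8 bits, which is what the masked
   multiply/XOR loop accumulates. */
#include <stdint.h>
static uint8_t pmul_p8_scalar(uint8_t a, uint8_t b)
{
    uint8_t res = 0;
    int i;
    for (i = 0; i < 8; i++)
        if (b & (1u << i))
            res ^= (uint8_t)(a << i); /* partial products combine with XOR, no carries */
    return res;
}
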
+
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
+{
+ // no 8 bit simd multiply, need to go to 16 bits
+ //solution may be not optimal
__m128i a16, b16, r16_1, r16_2;
_NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
- a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
- b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+ a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
r16_1 = _mm_mullo_epi16 (a16, b16);
//swap hi and low part of a and b to process the remaining data
a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
- a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
- b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2
+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2
r16_2 = _mm_mullo_epi16 (a16, b16);
- r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
- r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+ r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
return _mm_unpacklo_epi64(r16_1, r16_2);
}
-#endif
-int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_s16 _mm_mullo_epi16
-int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
-#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
-float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
#define vmulq_f32 _mm_mul_ps
-uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
-{ // no 8 bit simd multiply, need to go to 16 bits
- //solution may be not optimal
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
+{
+ // no 8 bit simd multiply, need to go to 16 bits
+ //solution may be not optimal
__m128i maskff, a16, b16, r16_1, r16_2;
maskff = _mm_set1_epi16(0xff);
- a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
- b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+ a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
r16_1 = _mm_mullo_epi16 (a16, b16);
- r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
+ r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
//swap hi and low part of a and b to process the remaining data
a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
- a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
- b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+ a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (a16, b16);
- r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
+ r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
return _mm_packus_epi16 (r16_1, r16_2);
}
-uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_u16 _mm_mullo_epi16
-uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
-#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
-poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
-{ //may be optimized
+{
+ //may be optimized
__m128i c1, res, tmp, bmasked;
int i;
- c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
- c1 = vshrq_n_u8(c1,7); //0x1
- bmasked = _mm_and_si128(b, c1); //0x1
+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+ c1 = vshrq_n_u8(c1,7); //0x1
+ bmasked = _mm_and_si128(b, c1); //0x1
res = vmulq_u8(a, bmasked);
for(i = 1; i<8; i++) {
- c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
- bmasked = _mm_and_si128(b, c1); //0x1
+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+ bmasked = _mm_and_si128(b, c1); //0x1
tmp = vmulq_u8(a, bmasked);
res = _mm_xor_si128(res, tmp);
}
@@ -2254,111 +3640,404 @@ _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
//************************* Vector long multiply ***********************************
//****************************************************************************
+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
+{
+ //no 8 bit simd multiply, need to go to 16 bits
+ __m128i a16, b16;
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
+{
+ #ifdef USE_SSE4
+ __m128i a16, b16;
+ a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+ #else
+ __m128i low, hi, a128,b128;
+ a128 = _pM128i(a);
+ b128 = _pM128i(b);
+ low = _mm_mullo_epi16(a128,b128);
+ hi = _mm_mulhi_epi16(a128,b128);
+ return _mm_unpacklo_epi16(low,hi);
+ #endif
+}
+
+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
+{
+ __m128i ab, ba, a128, b128;
+ a128 = _pM128i(a);
+ b128 = _pM128i(b);
+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+ return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
+}
+
+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
+{
+ //no 8 bit simd multiply, need to go to 16 bits
+ __m128i a16, b16;
+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+ return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
+{
+ #ifdef USE_SSE4
+ __m128i a16, b16;
+ a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
+ return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+ #else
+ __m128i a128,b128,low, hi;
+ a128 = _pM128i(a);
+ b128 = _pM128i(b);
+ low = _mm_mullo_epi16(a128,b128);
+ hi = _mm_mulhi_epu16(a128,b128);
+ return _mm_unpacklo_epi16(low,hi);
+ #endif
+}
+
+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
+{
+ ///may be not optimal compared with serial implementation
+ __m128i ab, ba, a128, b128;
+ a128 = _pM128i(a);
+ b128 = _pM128i(b);
+ ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+ ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+ return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
+}
+
+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
+{
+ //may be optimized
+ __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
+ int i;
+ a128 = _pM128i(a);
+ b128 = _pM128i(b);
+ c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
+ c1 = vshrq_n_u8(c1,7); //0x1
+ bmasked = _mm_and_si128(b128, c1); //0x1
+
+ a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+ res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
+ for(i = 1; i<8; i++) {
+ c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+ bmasked = _mm_and_si128(b128, c1); //0x1
+ bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+ tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
+ res = _mm_xor_si128(res, tmp);
+ }
+ return res;
+}
//****************Vector saturating doubling long multiply **************************
//*****************************************************************
+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
+{
+ //the serial solution may be faster due to saturation
+ __m128i res;
+ res = vmull_s16(a, b);
+ return vqd_s32(res);
+}
+
+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //the serial solution may be faster due to saturation
+ __m128i res;
+ res = vmull_s32(a,b);
+ return vqaddq_s64(res,res); //slow serial function!!!!
+}
//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
//******************************************************************************************
+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
+{
+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
+ int8x8_t res64;
+ __m128i b128, c128, res;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+ c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+ res = _mm_mullo_epi16 (c128, b128);
+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+ return64(res);
+}
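+//mask8_16_even_odd gathers the even-numbered bytes (the low byte of every 16-bit product) into
+//the lower half of the register, so after the shuffle the low 64 bits hold the eight truncated
+//8-bit products that VMLA.I8 needs. Worked example (illustrative): 16-bit products
+//{0x0102, 0x0304, ...} become bytes {0x02, 0x04, ...} in the low half.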
+
+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
+{
+ int16x4_t res64;
+ return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
+ res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
+ return64(res);
+}
+
+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+ //fma is coming soon, but right now:
+ __m128 res;
+ __m64_128 res64;
+ res = _mm_mul_ps (_pM128(c), _pM128(b));
+ res = _mm_add_ps (_pM128(a), res);
+ _M64f(res64, res);
+ return res64;
+}
+
+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
+{
+ // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
+ uint8x8_t res64;
+ __m128i mask, b128, c128, res;
+ mask = _mm_set1_epi16(0xff);
+ b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+ c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+ res = _mm_mullo_epi16 (c128, b128);
+ res = _mm_and_si128(res, mask); //to avoid saturation
+ res = _mm_packus_epi16 (res, res);
+ res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+ return64(res);
+}
-#if defined(USE_SSSE3)
-int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
-{ //solution may be not optimal
- // no 8 bit simd multiply, need to go to 16 bits
+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+#define vmla_u16 vmla_s16
+
+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+#define vmla_u32 vmla_s32
+
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
+{
+ //the solution may not be optimal
+ // no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2,r16_2;
_NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
- b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
- c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
- r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_add_epi8 (r16_1, a);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
- b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
- c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_add_epi8(r16_2, a_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
-#endif
-int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
{
__m128i res;
res = _mm_mullo_epi16 (c, b);
return _mm_add_epi16 (res, a);
}
-int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
{
__m128i res;
- res = _MM_MULLO_EPI32 (c, b); //SSE4.1
+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
return _mm_add_epi32 (res, a);
}
-float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
-{ //fma is coming soon, but right now:
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
+{
+ //fma is coming soon, but right now:
__m128 res;
res = _mm_mul_ps (c, b);
return _mm_add_ps (a, res);
}
-#if defined(USE_SSSE3)
-uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
-{ //solution may be not optimal
- // no 8 bit simd multiply, need to go to 16 bits
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
+{
+ //the solution may not be optimal
+ // no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
_NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
- b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
- c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
- r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_add_epi8 (r16_1, a);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
- b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
- c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_add_epi8(r16_2, a_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
-#endif
-uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
#define vmlaq_u16 vmlaq_s16
-uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
#define vmlaq_u32 vmlaq_s32
//********************** Vector widening multiply accumulate (long multiply accumulate):
// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
//********************************************************************************************
+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
+{
+ int16x8_t res;
+ res = vmull_s8(b, c);
+ return _mm_add_epi16 (res, a);
+}
+
+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ int32x4_t res;
+ res = vmull_s16(b, c);
+ return _mm_add_epi32 (res, a);
+}
+
+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ int64x2_t res;
+ res = vmull_s32( b, c);
+ return _mm_add_epi64 (res, a);
+}
+
+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
+{
+ uint16x8_t res;
+ res = vmull_u8(b, c);
+ return _mm_add_epi16 (res, a);
+}
+
+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ uint32x4_t res;
+ res = vmull_u16(b, c);
+ return _mm_add_epi32 (res, a);
+}
+
+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ uint64x2_t res;
+ res = vmull_u32( b,c);
+ return _mm_add_epi64 (res, a);
+}
//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
//********************************************************************************************
+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
+{
+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
+ int8x8_t res64;
+ __m128i res;
+ res64 = vmul_s8(b,c);
+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+ return64(res);
+}
+
+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
+{
+ int16x4_t res64;
+ return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
+ res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
+ return64(res);
+}
+
+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+ __m128 res;
+ __m64_128 res64;
+ res = _mm_mul_ps (_pM128(c), _pM128(b));
+ res = _mm_sub_ps (_pM128(a), res);
+ _M64f(res64, res);
+ return res64;
+}
+
+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+ // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
+ uint8x8_t res64;
+ __m128i res;
+ res64 = vmul_u8(b,c);
+ res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+ return64(res);
+}
+
+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+#define vmls_u16 vmls_s16
+
+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+#define vmls_u32 vmls_s32
-#if defined(USE_SSSE3)
-int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
-{ //solution may be not optimal
- // no 8 bit simd multiply, need to go to 16 bits
+
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
+{
+ //the solution may not be optimal
+ // no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
_NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
- b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
- c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+ c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
r16_1 = _mm_sub_epi8 (a, r16_1);
@@ -2366,205 +4045,573 @@ _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c)
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
- b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
- c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+ c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_sub_epi8 (a_2, r16_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
-#endif
-int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
{
__m128i res;
res = _mm_mullo_epi16 (c, b);
return _mm_sub_epi16 (a, res);
}
-int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
{
__m128i res;
- res = _MM_MULLO_EPI32 (c, b); //SSE4.1
+ res = _MM_MULLO_EPI32 (c, b); //SSE4.1
return _mm_sub_epi32 (a, res);
}
-float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
{
__m128 res;
res = _mm_mul_ps (c, b);
return _mm_sub_ps (a, res);
}
-#if defined(USE_SSSE3)
-uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
-{ //solution may be not optimal
- // no 8 bit simd multiply, need to go to 16 bits
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
+{
+ //the solution may not be optimal
+ // no 8 bit simd multiply, need to go to 16 bits
__m128i b16, c16, r16_1, a_2, r16_2;
_NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
- b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
- c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+ c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
r16_1 = _mm_mullo_epi16 (b16, c16);
- r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+ r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
r16_1 = _mm_sub_epi8 (a, r16_1);
//swap hi and low part of a, b and c to process the remaining data
a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
- b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
- c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+ c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
r16_2 = _mm_mullo_epi16 (b16, c16);
r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
r16_2 = _mm_sub_epi8(a_2, r16_2);
return _mm_unpacklo_epi64(r16_1,r16_2);
}
-#endif
-uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
#define vmlsq_u16 vmlsq_s16
-uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
#define vmlsq_u32 vmlsq_s32
//******************** Vector multiply subtract long (widening multiply subtract) ************************************
//*************************************************************************************************************
+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
+{
+ int16x8_t res;
+ res = vmull_s8(b, c);
+ return _mm_sub_epi16 (a, res);
+}
+
+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ int32x4_t res;
+ res = vmull_s16(b, c);
+ return _mm_sub_epi32 (a, res);
+}
+
+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ int64x2_t res;
+ res = vmull_s32( b,c);
+ return _mm_sub_epi64 (a, res);
+}
+
+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
+{
+ uint16x8_t res;
+ res = vmull_u8(b, c);
+ return _mm_sub_epi16 (a, res);
+}
+
+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ uint32x4_t res;
+ res = vmull_u16(b, c);
+ return _mm_sub_epi32 (a, res);
+}
+
+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
+{
+ //may not be optimal compared with the serial implementation
+ uint64x2_t res;
+ res = vmull_u32( b,c);
+ return _mm_sub_epi64 (a, res);
+}
//****** Vector saturating doubling multiply high **********************
//*************************************************************************
-//For some ARM implementations if the multiply high result is all 0xffffffff then it is not doubled. We do the same here
+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int16x4_t res;
+ int32_t a32, b32, i;
+ for (i = 0; i<4; i++) {
+ a32 = (int32_t) a.m64_i16[i];
+ b32 = (int32_t) b.m64_i16[i];
+ a32 = (a32 * b32) >> 15;
+ res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
+ }
+ return res;
+}
+
+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
+{
+ //may not be optimal compared with a serial solution
+ int32x2_t res64;
+ __m128i mask;
+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ int64x2_t mul;
+ mul = vmull_s32(a,b);
+ mul = _mm_slli_epi64(mul,1); //double the result
+ //at this point start treating 2 64-bit numbers as 4 32-bit
+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
+ return64(mul);
+}
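+//The shuffle immediate 1 | (3 << 2) | (0 << 4) | (2 << 6) selects dwords {1, 3, 0, 2}, i.e. it
+//moves the high 32 bits of both doubled 64-bit products into the low half of the register, which
+//is exactly the "multiply high" part that vqdmulh_s32 has to return.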
-int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
{
- __m128i res_sat, cffff, mask, res;
+ __m128i res, res_lo, mask;
+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
res = _mm_mulhi_epi16 (a, b);
- cffff = _mm_cmpeq_epi16(res,res); //0xffff
- mask = _mm_cmpeq_epi16(res, cffff); //if ffff need to saturate
- res_sat = _mm_adds_epi16(res, res); //res *= 2 and saturate
- return _mm_or_si128(mask, res_sat);
+ res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
+ res_lo = _mm_mullo_epi16 (a, b);
+ res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
+ res = _mm_add_epi16(res, res_lo); //combine results
+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}
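+//The sequence above forms (a*b) >> 15, the high half of the doubled product, without a 32-bit
+//intermediate: _mm_mulhi_epi16 shifted left by one contributes bits 16..30 of a*b and bit 15 of
+//_mm_mullo_epi16 contributes the remaining low bit. The only value that overflows int16,
+//a = b = -32768 giving 0x8000, is folded to 0x7fff by the cmpeq/xor pair. Scalar reference of
+//the same formula (illustrative sketch):
+//  int16_t vqdmulhq_s16_ref(int16_t x, int16_t y) {
+//      int32_t p = ((int32_t)x * y) >> 15;             // == (2*x*y) >> 16
+//      return (p == 0x8000) ? 0x7fff : (int16_t)p;     // saturate the single overflow case
+//  }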
-#if defined(USE_SSSE3)
-int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
- __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1;
- ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
- ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
- mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
- ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
- ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
- mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
- mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
- mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+{
+ // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
+ __m128i ab, ba, mask, mul, mul1;
+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+ mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+ mul = _mm_slli_epi64(mul,1); //double the result
+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+ mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+ mul1 = _mm_slli_epi64(mul1,1); //double the result
+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
mul = _mm_unpacklo_epi64(mul, mul1);
- cffffffff = _mm_cmpeq_epi32(mul,mul); //0xffffffff
- mask = _mm_cmpeq_epi32(mul, cffffffff); //if ffffffff need to saturate
- res_sat = vqd_s32(mul);
- return _mm_or_si128(mask, res_sat);
+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
-#endif
//********* Vector saturating rounding doubling multiply high ****************
//****************************************************************************
//If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order
-
-#if defined(USE_SSSE3)
-int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
-{
- __m128i res_sat, cffff, mask, res;
+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
+}
+
+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ //may not be optimal compared with a serial solution
+ int32x2_t res64;
+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ __m128i res_sat, mask, mask1;
+ int64x2_t mul;
+ mul = vmull_s32(a,b);
+ res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+ mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
+ mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
+ //at this point start treating 2 64-bit numbers as 4 32-bit
+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+ mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
+ return64(mul);
+}
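+//Rounding is done by adding bit 31 of the doubled 64-bit product to its high half: shifting the
+//product left by 32 and back right by 31 parks that bit at bit 0 of the upper dword, so the
+//32-bit add bumps the high half by the rounding bit (the junk this leaves in the lower dword is
+//discarded by the later shuffle). This matches the NEON definition (2*a*b + (1 << 31)) >> 32,
+//with the single 0x80000000 result pattern folded back to 0x7fffffff by the cmpeq/xor pair.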
+
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
+{
+ __m128i mask, res;
+ _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
res = _mm_mulhrs_epi16 (a, b);
- cffff = _mm_cmpeq_epi16(res,res); //0xffff
- mask = _mm_cmpeq_epi16(res, cffff); //if ffff need to saturate
- res_sat = _mm_adds_epi16(res, res); //res *= 2 and saturate
- return _mm_or_si128(mask, res_sat);
+ mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+ return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{ // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
- __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1, mask1;
- ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
- ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
- mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
- ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
- ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
- mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
- mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
- mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+{
+ // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
+ __m128i ab, ba, mask, mul, mul1, mask1;
+ _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+ ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+ mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+ mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+ mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
+ mul = _mm_add_epi32 (mul, mask1); //actual rounding
+
+ ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+ ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+ mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
+ mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
+ mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
+ mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
+ mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
+ //at this point start treating 2 64-bit numbers as 4 32-bit
+ mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+ mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
mul = _mm_unpacklo_epi64(mul, mul1);
- cffffffff = _mm_cmpeq_epi32(mul,mul); //0xffffffff
- mask1 = _mm_slli_epi32(mul, 17); //shift left then back right to
- mask1 = _mm_srli_epi32(mul,31); //get 15-th bit 1 or zero
- mul = _mm_add_epi32 (mul, mask1); //actual rounding
- mask = _mm_cmpeq_epi32(mul, cffffffff); //if ffffffff need to saturate
- res_sat = vqd_s32(mul);
- return _mm_or_si128(mask, res_sat);
+ mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+ return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
-#endif
//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
//*************************************************************************************************************************
+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
+{
+ //not an optimal SIMD solution, serial may be faster
+ __m128i res32;
+ res32 = vmull_s16(b, c);
+ res32 = vqd_s32(res32); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi32 (res, 1);
+ return vqaddq_s32(res32, a); //saturation
+}
+
+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+ __m128i res64;
+ res64 = vmull_s32(b,c);
+ res64 = vqaddq_s64(res64, res64); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi64 (res, 1);
+ return vqaddq_s64(res64, a); //saturation
+}
//************************************************************************************
//****************** Vector subtract ***********************************************
//************************************************************************************
+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res64;
+ res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
+ return res64;
+}
+
+
+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
+{
+ float32x2_t res;
+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
+ return res;
+}
+
+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+#define vsub_u8 vsub_s8
+
+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+#define vsub_u16 vsub_s16
-int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+#define vsub_u32 vsub_s32
+
+
+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
+{
+ uint64x1_t res64;
+ res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
+ return res64;
+}
+
+
+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_s8 _mm_sub_epi8
-int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_s16 _mm_sub_epi16
-int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_s32 _mm_sub_epi32
-int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_s64 _mm_sub_epi64
-float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
#define vsubq_f32 _mm_sub_ps
-uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_u8 _mm_sub_epi8
-uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_u16 _mm_sub_epi16
-uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_u32 _mm_sub_epi32
-uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_u64 _mm_sub_epi64
//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
//***********************************************************************************
//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
+{
+ __m128i a16, b16;
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi16 (a16, b16);
+}
+
+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
+{
+ __m128i a32, b32;
+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi32 (a32, b32);
+}
+
+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
+{
+ //may not be optimal
+ __m128i a64, b64;
+ a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi64 (a64, b64);
+}
+
+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
+{
+ __m128i a16, b16;
+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi16 (a16, b16);
+}
+
+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
+{
+ __m128i a32, b32;
+ a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi32 (a32, b32);
+}
+
+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
+{
+ //may not be optimal
+ __m128i a64, b64;
+ a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi64 (a64, b64);
+}
//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
//*****************************************************************************************************
+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
+{
+ __m128i b16;
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi16 (a, b16);
+}
+
+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
+{
+ __m128i b32;
+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi32 (a, b32);
+}
+
+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
+{
+ __m128i b64;
+ b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_sub_epi64 (a, b64);
+}
+
+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
+{
+ __m128i b16;
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi16 (a, b16);
+}
+
+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
+{
+ __m128i b32;
+ b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+ return _mm_sub_epi32 (a, b32);
+}
+
+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
+{
+ __m128i b64;
+ b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+ return _mm_sub_epi64 (a, b64);
+}
//************************Vector saturating subtract *********************************
//*************************************************************************************
+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
+}
-int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+
+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
+{
+ uint64x1_t res;
+ uint64_t a64,b64;
+ a64 = a.m64_u64[0];
+ b64 = b.m64_u64[0];
+ res.m64_u64[0] = a64 - b64;
+
+ a64 = (a64 >> 63) + (~_SIGNBIT64);
+ if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
+ res.m64_u64[0] = a64;
+ }
+ return res;
+}
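+//The test ((a ^ b) & (a ^ res)) < 0 is the standard signed-overflow check for subtraction: the
+//difference can only overflow when a and b have opposite signs and the raw result ends up with
+//the sign opposite to a. Worked example (illustrative): a = INT64_MIN, b = 1 gives
+//res = 0x7fffffffffffffff, both XOR terms have the sign bit set, so the result is replaced by
+//(a64 >> 63) + ~_SIGNBIT64 = 1 + 0x7fffffffffffffff = INT64_MIN, the correct saturation value.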
+
+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint64x1_t res;
+ uint64_t a64, b64;
+ a64 = _Ui64(a);
+ b64 = _Ui64(b);
+ if (a64 > b64) {
+ res.m64_u64[0] = a64 - b64;
+ } else {
+ res.m64_u64[0] = 0;
+ }
+ return res;
+}
+
+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
#define vqsubq_s8 _mm_subs_epi8
-int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
#define vqsubq_s16 _mm_subs_epi16
-int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
-{ //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
+{
+ //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the subtraction result has the sign opposite to a
__m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
c7fffffff = _mm_set1_epi32(0x7fffffff);
res = _mm_sub_epi32(a, b);
@@ -2573,14 +4620,14 @@ _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
res_xor_a = _mm_xor_si128(res, a);
b_xor_a = _mm_xor_si128(b, a);
res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
- res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+ res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
res_sat = _mm_and_si128(res_xor_a, res_sat);
res = _mm_andnot_si128(res_xor_a, res);
return _mm_or_si128(res, res_sat);
}
-int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
{
_NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
_NEON2SSE_ALIGN_16 uint64_t res[2];
@@ -2597,23 +4644,23 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a,
return _mm_load_si128((__m128i*)res);
}
-uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
#define vqsubq_u8 _mm_subs_epu8
-uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
#define vqsubq_u16 _mm_subs_epu16
-uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
{
__m128i min, mask, sub;
- min = _MM_MIN_EPU32(a, b); //SSE4.1
+ min = _MM_MIN_EPU32(a, b); //SSE4.1
mask = _mm_cmpeq_epi32 (min, b);
sub = _mm_sub_epi32 (a, b);
return _mm_and_si128 ( sub, mask);
}
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
{
@@ -2622,8 +4669,8 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t
sub = _mm_sub_epi64 (a, b);
suba = _mm_sub_epi64 (a, c80000000);
subb = _mm_sub_epi64 (b, c80000000);
- cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
- return _mm_and_si128 (sub, cmp); //saturation
+ cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
+ return _mm_and_si128 (sub, cmp); //saturation
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
@@ -2639,10 +4686,62 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t
//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
//****************************************************************
+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
+{
+ //no 8 bit shift available, internal overflow is possible, so let's go to 16 bits
+ int8x8_t res64;
+ __m128i r16;
+ int8x8_t r;
+ r = vsub_s8 (a, b);
+ r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
+ r16 = _mm_srai_epi16 (r16, 1); //SSE2
+ r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
+ return64(r16);
+}
+
+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+
+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
+}
-int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
-{ // //need to deal with the possibility of internal overflow
+
+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
+}
+
+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
+{
+ //need to deal with the possibility of internal overflow
__m128i c128, au,bu;
c128 = _mm_set1_epi8 (128);
au = _mm_add_epi8( a, c128);
@@ -2650,9 +4749,10 @@ _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.
return vhsubq_u8(au,bu);
}
-int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
-{ //need to deal with the possibility of internal overflow
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
+{
+ //need to deal with the possibility of internal overflow
__m128i c8000, au,bu;
c8000 = _mm_set1_epi16(0x8000);
au = _mm_add_epi16( a, c8000);
@@ -2660,9 +4760,10 @@ _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB
return vhsubq_u16(au,bu);
}
-int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
-{//need to deal with the possibility of internal overflow
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
+{
+ //need to deal with the possibility of internal overflow
__m128i a2, b2,r, b_1;
a2 = _mm_srai_epi32 (a,1);
b2 = _mm_srai_epi32 (b,1);
@@ -2673,25 +4774,26 @@ _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB
return _mm_sub_epi32(r,b_1);
}
-uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
{
__m128i avg;
avg = _mm_avg_epu8 (a, b);
return _mm_sub_epi8(a, avg);
}
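+//Halving subtract via averages: _mm_avg_epu8 computes the rounded-up average (a + b + 1) >> 1,
+//and a - ceil((a + b)/2) equals floor((a - b)/2), which is exactly the (a - b) >> 1 that VHSUB
+//requires (the wrap-around of the 8-bit lanes matches the truncation to the result type).
+//Worked example (illustrative): a = 2, b = 5 gives avg = 4 and 2 - 4 = 0xfe, i.e. (2 - 5) >> 1
+//reinterpreted as uint8_t.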
-uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
{
__m128i avg;
avg = _mm_avg_epu16 (a, b);
return _mm_sub_epi16(a, avg);
}
-uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
-{//need to deal with the possibility of internal overflow
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
+{
+ //need to deal with the possibility of internal overflow
__m128i a2, b2,r, b_1;
a2 = _mm_srli_epi32 (a,1);
b2 = _mm_srli_epi32 (b,1);
@@ -2704,44 +4806,260 @@ _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VH
//******* Vector subtract high half (truncated) ** ************
//************************************************************
+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+ int8x8_t res64;
+ __m128i sum, sum8;
+ sum = _mm_sub_epi16 (a, b);
+ sum8 = _mm_srai_epi16 (sum, 8);
+ sum8 = _mm_packs_epi16(sum8,sum8);
+ return64(sum8);
+}
+
+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+ int16x4_t res64;
+ __m128i sum, sum16;
+ sum = _mm_sub_epi32 (a, b);
+ sum16 = _mm_srai_epi32 (sum, 16);
+ sum16 = _mm_packs_epi32(sum16,sum16);
+ return64(sum16);
+}
+
+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
+{
+ int32x2_t res64;
+ __m128i sub;
+ sub = _mm_sub_epi64 (a, b);
+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
+ return64(sub);
+}
+
+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+ uint8x8_t res64;
+ __m128i sum, sum8;
+ sum = _mm_sub_epi16 (a, b);
+ sum8 = _mm_srli_epi16 (sum, 8);
+ sum8 = _mm_packus_epi16(sum8,sum8);
+ return64(sum8);
+}
+
+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+ uint16x4_t res64;
+ __m128i sum, sum16;
+ sum = _mm_sub_epi32 (a, b);
+ sum16 = _mm_srli_epi32 (sum, 16);
+ sum16 = _MM_PACKUS1_EPI32(sum16);
+ return64(sum16);
+}
+
+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+#define vsubhn_u64 vsubhn_s64
//************ Vector rounding subtract high half *********************
//*********************************************************************
+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+ int8x8_t res64;
+ __m128i sub, mask1;
+ sub = _mm_sub_epi16 (a, b);
+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
+ sub = _mm_srai_epi16 (sub, 8); //get high half
+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
+ sub = _mm_packs_epi16 (sub, sub);
+ return64(sub);
+}
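+//Rounding is obtained by adding bit 7 of the 16-bit difference before taking its high byte, i.e.
+//each lane becomes (a - b + 0x80) >> 8. Worked example (illustrative): a - b = 0x0180 (384) has
+//bit 7 set, so the narrowed lane is 0x01 + 1 = 0x02, i.e. 384/256 = 1.5 rounded up to 2.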
+
+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+ //SIMD may not be optimal, serial may be faster
+ int16x4_t res64;
+ __m128i sub, mask1;
+ sub = _mm_sub_epi32 (a, b);
+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
+ sub = _mm_srai_epi32 (sub, 16); //get high half
+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
+ sub = _mm_packs_epi32 (sub, sub);
+ return64(sub);
+}
+
+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
+{
+ //SIMD may not be optimal, serial may be faster
+ int32x2_t res64;
+ __m128i sub, mask1;
+ sub = _mm_sub_epi64 (a, b);
+ mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
+ mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
+ sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
+ sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
+ return64(sub);
+}
+
+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+ uint8x8_t res64;
+ __m128i sub, mask1;
+ sub = _mm_sub_epi16 (a, b);
+ mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+ mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
+ sub = _mm_srai_epi16 (sub, 8); //get high half
+ sub = _mm_add_epi16 (sub, mask1); //actual rounding
+ sub = _mm_packus_epi16 (sub, sub);
+ return64(sub);
+}
+
+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+ //SIMD may not be optimal, serial may be faster
+ uint16x4_t res64;
+ __m128i sub, mask1;
+ sub = _mm_sub_epi32 (a, b);
+ mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+ mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
+ sub = _mm_srai_epi32 (sub, 16); //get high half
+ sub = _mm_add_epi32 (sub, mask1); //actual rounding
+ sub = _MM_PACKUS1_EPI32 (sub);
+ return64(sub);
+}
+
+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+#define vrsubhn_u64 vrsubhn_s64
//*********** Vector saturating doubling multiply subtract long ********************
//************************************************************************************
+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
+{
+ //not an optimal SIMD solution, serial may be faster
+ __m128i res32, mask;
+ int32x4_t res;
+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ res = vmull_s16(b, c);
+ res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
+ mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
+ res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
+ return vqsubq_s32(a, res32); //saturation
+}
+
+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ __m128i res64, mask;
+ int64x2_t res;
+ _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
+ res = vmull_s32(b, c);
+ res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
+ mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
+ res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
+ return vqsubq_s64(a, res64); //saturation
+}
//****************** COMPARISON ***************************************
//******************* Vector compare equal *************************************
//****************************************************************************
+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
+{
+ uint32x2_t res64;
+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128 res;
+ res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
+ return64f(res);
+}
+
+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
-uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+#define vceq_p8 vceq_u8
+
+
+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_s8 _mm_cmpeq_epi8
-uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_s16 _mm_cmpeq_epi16
-uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_s32 _mm_cmpeq_epi32
-uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
res = _mm_cmpeq_ps(a,b);
- return *(__m128i*)&res;
+ return _M128i(res);
}
-uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_u8 _mm_cmpeq_epi8
-uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_u16 _mm_cmpeq_epi16
-uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_u32 _mm_cmpeq_epi32
-uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_p8 _mm_cmpeq_epi8
//******************Vector compare greater-than or equal*************************
@@ -2749,8 +5067,67 @@ uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
//in IA SIMD no greater-than-or-equal comparison for integers,
// there is greater-than available only, so we need the following tricks
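Editorial note (not part of the patch): the trick referred to above, restated in a minimal standalone form for the signed byte case -- greater-than-or-equal is composed from the compares SSE2 does provide, (a > b) OR (a == b). The function name is illustrative only:

    #include <emmintrin.h>

    static __m128i cmpge_epi8_sketch(__m128i a, __m128i b)
    {
        __m128i gt = _mm_cmpgt_epi8(a, b);   /* 0xFF in lanes where a >  b */
        __m128i eq = _mm_cmpeq_epi8(a, b);   /* 0xFF in lanes where a == b */
        return _mm_or_si128(gt, eq);         /* 0xFF in lanes where a >= b */
    }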
-uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128 res;
+ res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
+ return64f(res);
+}
+
+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
+{
+ //serial solution looks faster
+ uint32x2_t res64;
+ return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
+}
+
+
+
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi8 ( a, b);
@@ -2758,8 +5135,8 @@ _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S
return _mm_or_si128 ( m1, m2);
}
-uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi16 ( a, b);
@@ -2767,8 +5144,8 @@ _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.
return _mm_or_si128 ( m1,m2);
}
-uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
__m128i m1, m2;
m1 = _mm_cmpgt_epi32 (a, b);
@@ -2776,21 +5153,22 @@ _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.
return _mm_or_si128 (m1, m2);
}
-uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
- res = _mm_cmpge_ps(a,b); //use only 2 first entries
+ res = _mm_cmpge_ps(a,b);
return *(__m128i*)&res;
}
-uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
-{ //no unsigned chars comparison, only signed available,so need the trick
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+{
+ //no unsigned byte comparison, only signed compares are available, so we need a trick
#ifdef USE_SSE4
__m128i cmp;
cmp = _mm_max_epu8(a, b);
- return _mm_cmpeq_epi8(cmp, a); //a>=b
+ return _mm_cmpeq_epi8(cmp, a); //a>=b
#else
__m128i c128, as, bs, m1, m2;
c128 = _mm_set1_epi8 (128);
@@ -2802,13 +5180,14 @@ _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE
#endif
}
-uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
-{ //no unsigned shorts comparison, only signed available,so need the trick
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+{
+ //no unsigned 16-bit comparison, only signed compares are available, so we need a trick
#ifdef USE_SSE4
__m128i cmp;
cmp = _mm_max_epu16(a, b);
- return _mm_cmpeq_epi16(cmp, a); //a>=b
+ return _mm_cmpeq_epi16(cmp, a); //a>=b
#else
__m128i c8000, as, bs, m1, m2;
c8000 = _mm_set1_epi16 (0x8000);
@@ -2820,13 +5199,14 @@ _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCG
#endif
}
-uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
-{ //no unsigned ints comparison, only signed available,so need the trick
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+{
+ //no unsigned 32-bit comparison, only signed compares are available, so we need a trick
#ifdef USE_SSE4
__m128i cmp;
cmp = _mm_max_epu32(a, b);
- return _mm_cmpeq_epi32(cmp, a); //a>=b
+ return _mm_cmpeq_epi32(cmp, a); //a>=b
#else
//serial solution may be faster
__m128i c80000000, as, bs, m1, m2;
@@ -2843,34 +5223,78 @@ _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCG
//***************************************************************************************
//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
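Editorial note (not part of the patch): the less-than-or-equal trick used below, as a standalone sketch for the signed byte case -- a <= b is the complement of a > b, and the complement is formed with an all-ones mask and ANDNOT. The function name is illustrative only:

    #include <emmintrin.h>

    static __m128i cmple_epi8_sketch(__m128i a, __m128i b)
    {
        __m128i ones = _mm_cmpeq_epi8(a, a);     /* all bits set              */
        __m128i gt   = _mm_cmpgt_epi8(a, b);     /* 0xFF in lanes where a > b */
        return _mm_andnot_si128(gt, ones);       /* ~(a > b)  ==  (a <= b)    */
    }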
-uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vcleq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vcleq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vcleq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128 res;
+ res = _mm_cmple_ps(_pM128(a),_pM128(b));
+ return64f(res);
+}
+
+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+#define vcle_u8(a,b) vcge_u8(b,a)
+
+
+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
+#define vcle_u16(a,b) vcge_u16(b,a)
+
+
+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+#define vcle_u32(a,b) vcge_u32(b,a)
+
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
__m128i c1, res;
- c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+ c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi8 ( a, b);
- return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
+ return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
}
-uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
__m128i c1, res;
- c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
+ c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi16 ( a, b);
return _mm_andnot_si128 (res, c1);
}
-uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
__m128i c1, res;
- c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
+ c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
res = _mm_cmpgt_epi32 ( a, b);
return _mm_andnot_si128 (res, c1);
}
-uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
@@ -2878,67 +5302,129 @@ _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
return *(__m128i*)&res;
}
-uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
#ifdef USE_SSE4
- _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
- { //no unsigned chars comparison in SSE, only signed available,so need the trick
-
+ _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+ {
+ //no unsigned byte comparison in SSE, only signed compares are available, so we need a trick
__m128i cmp;
cmp = _mm_min_epu8(a, b);
- return _mm_cmpeq_epi8(cmp, a); //a<=b
+ return _mm_cmpeq_epi8(cmp, a); //a<=b
}
#else
#define vcleq_u8(a,b) vcgeq_u8(b,a)
#endif
-uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
#ifdef USE_SSE4
- _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
- { //no unsigned shorts comparison in SSE, only signed available,so need the trick
+ _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+ {
+ //no unsigned 16-bit comparison in SSE, only signed compares are available, so we need a trick
__m128i cmp;
cmp = _mm_min_epu16(a, b);
- return _mm_cmpeq_epi16(cmp, a); //a<=b
+ return _mm_cmpeq_epi16(cmp, a); //a<=b
}
#else
#define vcleq_u16(a,b) vcgeq_u16(b,a)
#endif
-uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
#ifdef USE_SSE4
- _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
- { //no unsigned chars comparison in SSE, only signed available,so need the trick
+ _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+ {
+ //no unsigned 32-bit comparison in SSE, only signed compares are available, so we need a trick
__m128i cmp;
cmp = _mm_min_epu32(a, b);
- return _mm_cmpeq_epi32(cmp, a); //a<=b
+ return _mm_cmpeq_epi32(cmp, a); //a<=b
}
#else
//solution may be not optimal compared with the serial one
#define vcleq_u32(a,b) vcgeq_u32(b,a)
#endif
+
//****** Vector compare greater-than ******************************************
//**************************************************************************
+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
+{
+ uint32x2_t res64;
+ return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128 res;
+ res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
+ return64f(res);
+}
+
+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
+}
-uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+
+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
#define vcgtq_s8 _mm_cmpgt_epi8
-uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
#define vcgtq_s16 _mm_cmpgt_epi16
-uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
#define vcgtq_s32 _mm_cmpgt_epi32
-uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
{
__m128 res;
- res = _mm_cmpgt_ps(a,b); //use only 2 first entries
+ res = _mm_cmpgt_ps(a,b);
return *(__m128i*)&res;
}
-uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
-{ //no unsigned chars comparison, only signed available,so need the trick
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
+{
+ //no unsigned byte comparison, only signed compares are available, so we need a trick
__m128i c128, as, bs;
c128 = _mm_set1_epi8 (128);
as = _mm_sub_epi8(a,c128);
@@ -2946,9 +5432,10 @@ _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT
return _mm_cmpgt_epi8 (as, bs);
}
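Editorial note (not part of the patch): the 0x80 bias in vcgtq_u8 above works because a wrap-around subtract of 128 per byte (the same remap as XOR with 0x80) maps unsigned byte order monotonically onto signed byte order, so a signed compare of the biased values yields the unsigned result. A brute-force check of that identity in portable scalar C:

    #include <assert.h>

    /* interpret a byte value 0..255 as a signed 8-bit quantity */
    static int as_s8(unsigned x) { return (x < 128u) ? (int)x : (int)x - 256; }

    int main(void)
    {
        for (unsigned a = 0; a < 256u; ++a)
            for (unsigned b = 0; b < 256u; ++b)
                /* unsigned order == signed order of the 0x80-biased values */
                assert((a > b) == (as_s8(a ^ 0x80u) > as_s8(b ^ 0x80u)));
        return 0;
    }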
-uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
-{ //no unsigned short comparison, only signed available,so need the trick
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
+{
+ //no unsigned 16-bit comparison, only signed compares are available, so we need a trick
__m128i c8000, as, bs;
c8000 = _mm_set1_epi16 (0x8000);
as = _mm_sub_epi16(a,c8000);
@@ -2956,9 +5443,10 @@ _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCG
return _mm_cmpgt_epi16 ( as, bs);
}
-uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
-{ //no unsigned int comparison, only signed available,so need the trick
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
+{
+ //no unsigned 32-bit comparison, only signed compares are available, so we need a trick
__m128i c80000000, as, bs;
c80000000 = _mm_set1_epi32 (0x80000000);
as = _mm_sub_epi32(a,c80000000);
@@ -2968,33 +5456,68 @@ _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCG
//********************* Vector compare less-than **************************
//*************************************************************************
+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
+
+
+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
+
+
+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
+
+
+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
-uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
-#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
-uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
-#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
-uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
-#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
-uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
-#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
-uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
-#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
-uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
-#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
-uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
-#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
+
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
+
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
+
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
//*****************Vector compare absolute greater-than or equal ************
//***************************************************************************
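Editorial note (not part of the patch): the absolute-compare family below clears the IEEE-754 sign bit (AND with 0x7fffffff) to form |x| and then reuses the ordinary compare. A scalar per-lane sketch of that idea, with an illustrative helper name (NaN handling follows whatever the underlying compare does):

    #include <stdint.h>
    #include <string.h>

    static uint32_t cage_f32_lane(float a, float b)
    {
        uint32_t ua, ub;
        float fa, fb;
        memcpy(&ua, &a, sizeof ua);            /* reinterpret the float bits    */
        memcpy(&ub, &b, sizeof ub);
        ua &= 0x7fffffffu;                     /* clear the sign bit: |a|       */
        ub &= 0x7fffffffu;
        memcpy(&fa, &ua, sizeof fa);
        memcpy(&fb, &ub, sizeof fb);
        return (fa >= fb) ? 0xffffffffu : 0u;  /* NEON-style all-ones/zero mask */
    }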
+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i c7fffffff;
+ __m128 a0, b0;
+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+ a0 = _mm_cmpge_ps ( a0, b0);
+ return64f(a0);
+}
-uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
@@ -3007,9 +5530,21 @@ _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) //
//********Vector compare absolute less-than or equal ******************
//********************************************************************
+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i c7fffffff;
+ __m128 a0, b0;
+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+ a0 = _mm_cmple_ps (a0, b0);
+ return64f(a0);
+}
-uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
@@ -3022,9 +5557,21 @@ _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) //
//******** Vector compare absolute greater-than ******************
//******************************************************************
+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i c7fffffff;
+ __m128 a0, b0;
+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+ a0 = _mm_cmpgt_ps (a0, b0);
+ return64f(a0);
+}
-uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
@@ -3037,9 +5584,21 @@ _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) //
//***************Vector compare absolute less-than ***********************
//*************************************************************************
+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i c7fffffff;
+ __m128 a0, b0;
+ c7fffffff = _mm_set1_epi32 (0x7fffffff);
+ a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+ b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+ a0 = _mm_cmplt_ps (a0, b0);
+ return64f(a0);
+}
-uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
__m128i c7fffffff;
__m128 a0, b0;
@@ -3048,7 +5607,6 @@ _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) //
b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
a0 = _mm_cmplt_ps (a0, b0);
return (*(__m128i*)&a0);
-
}
//*************************Vector test bits************************************
@@ -3058,90 +5616,169 @@ with the corresponding element of a second vector. If the result is not zero, th
corresponding element in the destination vector is set to all ones. Otherwise, it is set to
all zeros. */
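Editorial note (not part of the patch): a standalone restatement of the test-bits idiom the vtst* implementations below rely on -- AND the inputs, compare the result with zero, then invert the mask. The function name is illustrative only:

    #include <emmintrin.h>

    static __m128i vtst_epi8_sketch(__m128i a, __m128i b)
    {
        __m128i zero    = _mm_setzero_si128();
        __m128i ones    = _mm_cmpeq_epi8(zero, zero);   /* all bits set            */
        __m128i anded   = _mm_and_si128(a, b);
        __m128i is_zero = _mm_cmpeq_epi8(anded, zero);  /* 0xFF where (a & b) == 0 */
        return _mm_xor_si128(is_zero, ones);            /* invert: 0xFF where != 0 */
    }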
-uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(vtstq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vtstq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vtstq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_u8 vtst_s8
+
+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
+#define vtst_u16 vtst_s16
+
+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
+#define vtst_u32 vtst_s32
+
+
+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_p8 vtst_u8
+
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
- one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi8 (res, zero);
- return _mm_xor_si128(res, one); //invert result
+ return _mm_xor_si128(res, one); //invert result
}
-uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
- one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi16 (res, zero);
- return _mm_xor_si128(res, one); //invert result
+ return _mm_xor_si128(res, one); //invert result
}
-uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
{
__m128i zero, one, res;
zero = _mm_setzero_si128 ();
- one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+ one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
res = _mm_and_si128 (a, b);
res = _mm_cmpeq_epi32 (res, zero);
- return _mm_xor_si128(res, one); //invert result
+ return _mm_xor_si128(res, one); //invert result
}
-uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_u8 vtstq_s8
-uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
#define vtstq_u16 vtstq_s16
-uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
#define vtstq_u32 vtstq_s32
-uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_p8 vtstq_u8
//****************** Absolute difference ********************
//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
//************************************************************
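Editorial note (not part of the patch): for the unsigned variants, where no packed absolute value exists, one common standalone formulation -- equivalent to, though not literally, the compare-and-select sequence used further below -- relies on unsigned saturating subtraction:

    #include <emmintrin.h>

    /* |a - b| per unsigned byte lane: each saturating subtract is zero in the
       lanes where it would go negative, so OR-ing both directions keeps the
       non-negative difference everywhere. */
    static __m128i absdiff_epu8_sketch(__m128i a, __m128i b)
    {
        return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    }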
-#if defined(USE_SSSE3)
+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(vabdq_s8(_pM128i(a), _pM128i(b)));
+}
-#endif
+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vabdq_s16(_pM128i(a), _pM128i(b)));
+}
+
+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vabdq_s32(_pM128i(a), _pM128i(b)));
+}
+
+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(vabdq_u8(_pM128i(a), _pM128i(b)));
+}
+
+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(vabdq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ return64(vabdq_u32(_pM128i(a), _pM128i(b)));
+}
+
+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
+{
+ float32x4_t res;
+ __m64_128 res64;
+ res = vabdq_f32(_pM128(a), _pM128(b));
+ _M64f(res64, res);
+ return res64;
+}
-#if defined(USE_SSSE3)
-int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
{
__m128i res;
res = _mm_sub_epi8 (a, b);
return _mm_abs_epi8 (res);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
{
__m128i res;
res = _mm_sub_epi16 (a,b);
return _mm_abs_epi16 (res);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
{
__m128i res;
res = _mm_sub_epi32 (a,b);
return _mm_abs_epi32 (res);
}
-#endif
-uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
{
__m128i cmp, difab, difba;
cmp = vcgtq_u8(a,b);
@@ -3152,7 +5789,7 @@ _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no ab
return _mm_or_si128(difab, difba);
}
-uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
{
__m128i cmp, difab, difba;
@@ -3164,7 +5801,7 @@ _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
return _mm_or_si128(difab, difba);
}
-uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
{
__m128i cmp, difab, difba;
@@ -3176,8 +5813,8 @@ _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
return _mm_or_si128(difab, difba);
}
-float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
{
__m128i c1;
__m128 res;
@@ -3188,41 +5825,126 @@ _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) //
//************ Absolute difference - long **************************
//********************************************************************
+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
+{
+ __m128i a16, b16;
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ return vabdq_s16(a16, b16);
+
+}
+
+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
+{
+ __m128i a32, b32;
+ a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+ b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+ return vabdq_s32(a32, b32);
+}
+
+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //no optimal SIMD solution, serial looks faster
+ _NEON2SSE_ALIGN_16 int64_t res[2];
+ if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
+ else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
+ if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
+ else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
+ return _mm_load_si128((__m128i*)res);
+}
+
+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
+{
+ __m128i res;
+ res = vsubl_u8(a,b);
+ return _mm_abs_epi16(res);
+}
+
+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
+{
+ __m128i res;
+ res = vsubl_u16(a,b);
+ return _mm_abs_epi32(res);
+}
+
+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ _NEON2SSE_ALIGN_16 uint64_t res[2];
+ if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
+ else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
+ if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
+ else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
+ return _mm_load_si128((__m128i*)res);
+}
//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
//*********************************************************************************************
+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
+{
+ int8x8_t res64;
+ return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
+{
+ int16x4_t res64;
+ return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
+{
+ int32x2_t res64;
+ return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+#define vaba_u8 vaba_s8
+
+
+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
+#define vaba_u16 vaba_s16
+
+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
+{
+ uint32x2_t res64;
+ return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
-#if defined(USE_SSSE3)
-int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
{
int8x16_t sub;
sub = vabdq_s8(b, c);
return vaddq_s8( a, sub);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
{
int16x8_t sub;
sub = vabdq_s16(b, c);
return vaddq_s16( a, sub);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
{
int32x4_t sub;
sub = vabdq_s32(b, c);
return vaddq_s32( a, sub);
}
-#endif
-uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
uint8x16_t sub;
@@ -3230,7 +5952,7 @@ _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
return vaddq_u8( a, sub);
}
-uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
uint16x8_t sub;
@@ -3238,7 +5960,7 @@ _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
return vaddq_u16( a, sub);
}
-uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
uint32x4_t sub;
@@ -3248,84 +5970,411 @@ _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
//************** Absolute difference and accumulate - long ********************************
//*************************************************************************************
+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
+{
+ __m128i b16, c16, res;
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+ c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+ return _mm_add_epi16 (a, res);
+}
+
+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
+{
+ __m128i b32, c32, res;
+ b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
+ c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+ return _mm_add_epi32 (a, res);
+}
+
+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ __m128i res;
+ res = vabdl_s32(b,c);
+ return _mm_add_epi64(a, res);
+}
+
+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
+{
+ __m128i b16, c16, res;
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+ c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
+ res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+ return _mm_add_epi16 (a, res);
+}
+
+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
+{
+ __m128i b32, c32, res;
+ b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
+ c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
+ res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+ return _mm_add_epi32 (a, res);
+}
+
+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ __m128i res;
+ res = vabdl_u32(b,c);
+ return _mm_add_epi64(a, res);
+}
//***********************************************************************************
//**************** Maximum and minimum operations **********************************
//***********************************************************************************
//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
//***********************************************************************************
+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+ return64(res);
+}
+
+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
+}
-int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
-#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+ return64(res);
+}
+
+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
+}
-int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+
+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i res;
+ res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial version
+ return64(res);
+}
+
+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
+{
+ //a serial solution looks faster than the SIMD one
+ float32x2_t res;
+ res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+ res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+ return res;
+}
+
+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
+
+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
#define vmaxq_s16 _mm_max_epi16
-int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
-#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
-uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
#define vmaxq_u8 _mm_max_epu8
-uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
-#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
-uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
-#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
-float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
#define vmaxq_f32 _mm_max_ps
//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
//***********************************************************************************************************
+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+ return64(res);
+}
+
+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+ return64(res);
+}
+
+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
+}
-int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
-#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
-int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
+{
+ uint16x4_t res64;
+ return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
+{
+ uint32x2_t res64;
+ __m128i res;
+ res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may not be as efficient as a serial version
+ return64(res);
+}
+
+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
+{
+ //a serial solution looks faster than the SIMD one
+ float32x2_t res;
+ res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+ res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+ return res;
+}
+
+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
+
+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
#define vminq_s16 _mm_min_epi16
-int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
-#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
-uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
#define vminq_u8 _mm_min_epu8
-uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
-#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
-uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
-#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
-float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
#define vminq_f32 _mm_min_ps
//************* Pairwise addition operations. **************************************
//************************************************************************************
//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
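Editorial note (not part of the patch): a scalar reference of the pairwise-add layout the d-register variants below produce, shown for 16-bit lanes with wrap-around addition; the helper name is illustrative only:

    #include <stdint.h>

    /* r = { a0+a1, a2+a3, b0+b1, b2+b3 }, each sum truncated to 16 bits */
    static void padd_s16_ref(const int16_t a[4], const int16_t b[4], int16_t r[4])
    {
        r[0] = (int16_t)(a[0] + a[1]);
        r[1] = (int16_t)(a[2] + a[3]);
        r[2] = (int16_t)(b[0] + b[1]);
        r[3] = (int16_t)(b[2] + b[3]);
    }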
+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
+{
+ //no 8 bit hadd in IA32, need to go to 16 bit and then pack
+ int8x8_t res64;
+ __m128i a16, b16, res;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+ res = _mm_hadd_epi16 (a16, b16);
+ res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
+ return64(res);
+}
+
+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ __m128i hadd128;
+ hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+ return64(hadd128);
+}
+
+
+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ __m128i hadd128;
+ hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
+ hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+ return64(hadd128);
+}
+
+
+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
+{
+ // no 8 bit hadd in IA32, need to go to 16 bit and then pack
+ uint8x8_t res64;
+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit in 16-bit signed, so it works
+ __m128i mask8, a16, b16, res;
+ mask8 = _mm_set1_epi16(0xff);
+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+ b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+ res = _mm_hadd_epi16 (a16, b16);
+ res = _mm_and_si128(res, mask8); //to avoid saturation
+ res = _mm_packus_epi16 (res,res); //use low 64 bits
+ return64(res);
+}
+
+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
+{
+ // the solution may not be optimal; serial execution may be faster
+ // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
+ uint16x4_t res64;
+ __m128i c32767, cfffe, as, bs, res;
+ c32767 = _mm_set1_epi16 (32767);
+ cfffe = _mm_set1_epi16 (0xfffe);
+ as = _mm_sub_epi16 (_pM128i(a), c32767);
+ bs = _mm_sub_epi16 (_pM128i(b), c32767);
+ res = _mm_hadd_epi16 (as, bs);
+ res = _mm_add_epi16 (res, cfffe);
+ res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+ return64(res);
+}
+
+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
+{
+ //hadd doesn't work for unsigned values
+ uint32x2_t res64;
+ __m128i ab, ab_sh, res;
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
+ ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
+ res = _mm_add_epi32(ab, ab_sh);
+ res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+ return64(res);
+}
+
+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
+{
+ __m128 hadd128;
+ __m64_128 res64;
+ hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
+ hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
+ _M64f(res64, hadd128);
+ return res64;
+}
+
//************************** Long pairwise add **********************************
//*********************************************************************************
//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
// and places the final results in the destination vector.
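Editorial note (not part of the patch): a scalar reference of the long pairwise add for the signed byte case -- each adjacent pair is sign-extended and summed into a lane of twice the width; the helper name is illustrative only:

    #include <stdint.h>

    static void paddl_s8_ref(const int8_t a[8], int16_t r[4])
    {
        for (int i = 0; i < 4; ++i)
            r[i] = (int16_t)((int16_t)a[2 * i] + (int16_t)a[2 * i + 1]);  /* widen, then add */
    }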
-#if defined(USE_SSSE3)
-int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
-{ //no 8 bit hadd in IA32, need to go to 16 bit
+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
+{
+ //no 8 bit hadd in IA32, need to go to 16 bit anyway
+ __m128i a16;
+ int16x4_t res64;
+ a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
+ return64(a16);
+}
+
+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
+{
+ // the solution may not be optimal; serial execution may be faster
+ int32x2_t res64;
+ __m128i r32_1;
+ r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
+ r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
+ return64(r32_1);
+}
+
+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+ int64x1_t res;
+ res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
+ return res;
+}
+
+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
+{
+ // no 8 bit hadd in IA32, need to go to 16 bit
+// no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit in 16-bit signed, so it works
+ uint16x4_t res64;
+ __m128i a16;
+ a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+ a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
+ return64(a16);
+}
+
+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ uint32x2_t res;
+ res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
+ res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
+ return res;
+}
+
+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+ uint64x1_t res;
+ res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
+ return res;
+}
+
+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
+{
+ //no 8 bit hadd in IA32, need to go to 16 bit
__m128i r16_1, r16_2;
- r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+ r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
//swap hi and low part of r to process the remaining data
r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
return _mm_hadd_epi16 (r16_1, r16_2);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
-{ //no 8 bit hadd in IA32, need to go to 16 bit
+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
+{
+ //no 8 bit hadd in IA32, need to go to 16 bit
__m128i r32_1, r32_2;
r32_1 = _MM_CVTEPI16_EPI32(a);
//swap hi and low part of r to process the remaining data
@@ -3333,10 +6382,9 @@ _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
return _mm_hadd_epi32 (r32_1, r32_2);
}
-#endif
-int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
{
_NEON2SSE_ALIGN_16 int32_t atmp[4];
_NEON2SSE_ALIGN_16 int64_t res[2];
@@ -3346,10 +6394,10 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a
return _mm_load_si128((__m128i*)res);
}
-#if defined(USE_SSSE3)
-uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
-{ //no 8 bit hadd in IA32, need to go to 16 bit
+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
+{
+ //no 8 bit hadd in IA32, need to go to 16 bit
__m128i r16_1, r16_2;
r16_1 = _MM_CVTEPU8_EPI16(a);
//swap hi and low part of r to process the remaining data
@@ -3357,11 +6405,11 @@ _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
return _mm_hadd_epi16 (r16_1, r16_2);
}
-#endif
-uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{ //serial solution looks faster than a SIMD one
+{
+ //serial solution looks faster than a SIMD one
_NEON2SSE_ALIGN_16 uint16_t atmp[8];
_NEON2SSE_ALIGN_16 uint32_t res[4];
_mm_store_si128((__m128i*)atmp, a);
@@ -3372,7 +6420,7 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t
return _mm_load_si128((__m128i*)res);
}
-uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
_NEON2SSE_ALIGN_16 uint32_t atmp[4];
@@ -3387,28 +6435,69 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t
//****************************************************************************************
//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
// and accumulates the values of the results into the elements of the destination (wide) vector
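//Illustrative usage sketch (not part of the original header): vpadal_u8 below is equivalent to
//adding vpaddl_u8(b) into the accumulator, e.g. acc = {100,100,100,100}, b = {1,2,3,4,5,6,7,8}
//gives {103,107,111,115}:
//   uint16x4_t acc;  uint8x8_t b;  /* ...initialize acc and b... */
//   acc = vpadal_u8(acc, b);  //widen and add adjacent pairs of b into acc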
+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
+{
+ int16x4_t res64;
+ return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
+}
+
+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
+{
+ int32x2_t res64;
+ return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
+ return res;
+}
-#if defined(USE_SSSE3)
-int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
+{
+ uint16x4_t res64;
+ return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
+{
+ uint32x2_t res64;
+ return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
+{
+ uint64x1_t res;
+ res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
+ return res;
+}
+
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
{
int16x8_t pad;
pad = vpaddlq_s8(b);
return _mm_add_epi16 (a, pad);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
{
int32x4_t pad;
pad = vpaddlq_s16(b);
return _mm_add_epi32(a, pad);
}
-#endif
-int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
int64x2_t pad;
@@ -3416,82 +6505,376 @@ _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
return _mm_add_epi64 (a, pad);
}
-#if defined(USE_SSSE3)
-uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
{
uint16x8_t pad;
pad = vpaddlq_u8(b);
return _mm_add_epi16 (a, pad);
}
-#endif
-uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
uint32x4_t pad;
pad = vpaddlq_u16(b);
return _mm_add_epi32(a, pad);
-} //no optimal SIMD solution, serial is faster
+} //no optimal SIMD solution, serial is faster
-uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{ //no optimal SIMD solution, serial is faster
+{
+ //no optimal SIMD solution, serial is faster
uint64x2_t pad;
pad = vpaddlq_u32(b);
return _mm_add_epi64(a, pad);
-} //no optimal SIMD solution, serial is faster
+} //no optimal SIMD solution, serial is faster
//********** Folding maximum *************************************
//*******************************************************************
//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
//and copies the larger of each pair into the corresponding element in the destination
// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
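//Illustrative usage sketch (not part of the original header): for the d-register variants below,
//the low half of the result holds the pairwise maxima of a and the high half those of b, e.g.
//vpmax_s16({1,4,2,3}, {9,5,7,8}) == {max(1,4), max(2,3), max(9,5), max(7,8)} == {4,3,9,8}:
//   int16x4_t folded = vpmax_s16(a, b);  //folds two 4-lane vectors into one 4-lane result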
+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
+{
+ int8x8_t res64;
+ __m128i ab, ab1, max;
+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
+ max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+ return64(max); //we need 64 bits only
+}
+
+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
+{
+ //this solution may not be optimal compared with the serial one
+ int16x4_t res64;
+ __m128i ab, ab1, max;
+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+ max = _mm_max_epi16 (ab, ab1);
+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+ return64(max);
+}
+
+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ int32x2_t res;
+ res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+ res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+ return res;
+}
+
+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
+{
+ uint8x8_t res64;
+ __m128i ab, ab1, max;
+ _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
+ max = _mm_max_epu8 (ab, ab1); // SSE4.1
+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+ return64(max);
+}
+
+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
+{
+ //this solution may not be optimal compared with the serial one
+ uint16x4_t res64;
+ __m128i ab, ab1, max;
+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+ max = _MM_MAX_EPU16 (ab, ab1);
+ max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+ return64(max);
+}
+
+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ uint32x2_t res;
+ res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+ res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+ return res;
+}
+
+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ float32x2_t res;
+ res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+ res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+ return res;
+}
// ***************** Folding minimum ****************************
// **************************************************************
//vpmin -> takes minimum of adjacent pairs
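//Illustrative usage sketch (not part of the original header): vpmin mirrors vpmax above but keeps
//the smaller element of each adjacent pair, e.g. vpmin_s16({1,4,2,3}, {9,5,7,8}) == {1,2,5,7}:
//   int16x4_t folded = vpmin_s16(a, b);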
+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
+{
+ int8x8_t res64;
+ __m128i ab, ab1, min;
+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
+ min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+ return64(min);
+}
+
+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
+{
+ //this solution may not be optimal compared with the serial one
+ int16x4_t res64;
+ __m128i ab, ab1, min;
+ _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
+ min = _mm_min_epi16 (ab, ab1);
+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+ return64(min);
+}
+
+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ int32x2_t res;
+ res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+ res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+ return res;
+}
+
+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
+{
+ uint8x8_t res64;
+ __m128i ab, ab1, min;
+ _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
+ min = _mm_min_epu8 (ab, ab1); // SSE4.1
+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+ return64(min);
+}
+
+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
+{
+ //this solution may not be optimal compared with the serial one
+ uint16x4_t res64;
+ __m128i ab, ab1, min;
+ _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
+ _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+ ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
+ min = _MM_MIN_EPU16 (ab, ab1);
+ min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+ return64(min);
+}
+
+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ uint32x2_t res;
+ res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+ res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+ return res;
+}
+
+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution looks faster than a SIMD one
+ float32x2_t res;
+ res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+ res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+ return res;
+}
//***************************************************************
//*********** Reciprocal/Sqrt ************************************
//***************************************************************
//****************** Reciprocal estimate *******************************
-
//the ARM NEON and x86 SIMD results may be slightly different
+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
+{
+ float32x4_t res;
+ __m64_128 res64;
+ res = _mm_rcp_ps(_pM128(a));
+ _M64f(res64, res);
+ return res64;
+}
+
+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //Input is fixed point number!!! No reciprocal for ints in IA32 available
+ uint32x2_t res;
+ float resf, r;
+ int i, q, s;
+ for (i =0; i<2; i++){
+ if((a.m64_u32[i] & 0x80000000) == 0) {
+ res.m64_u32[i] = 0xffffffff;
+ }else{
+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+ r = (float)s / 256.0;
+ res.m64_u32[i] = r * (uint32_t)(1 << 31);
+ }
+ }
+ return res;
+}
-float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
-//the ARM NEON and x86 SIMD results may be slightly different
+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
#define vrecpeq_f32 _mm_rcp_ps
-uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+
+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{ //no reciprocal for ints in IA32 available, neither for unsigned int to float 4 lanes conversion, so serial solution looks faster
- _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+{
+ //Input is fixed point number!!!
+ //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
+ _NEON2SSE_ALIGN_16 uint32_t atmp[4];
+ _NEON2SSE_ALIGN_16 uint32_t res[4];
+ _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
+ float resf, r;
+ int i, q, s;
+ __m128i res128, mask, zero;
_mm_store_si128((__m128i*)atmp, a);
- res[0] = (atmp[0]) ? 1 / atmp[0] : 0xffffffff;
- res[1] = (atmp[1]) ? 1 / atmp[1] : 0xffffffff;
- return _mm_load_si128((__m128i*)res);
+ zero = _mm_setzero_si128();
+ for (i =0; i<4; i++){
+ resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
+ q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
+ r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+ r = (float)s / 256.0;
+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+ }
+ res128 = _mm_load_si128((__m128i*)res);
+ mask = _mm_and_si128(a, *(__m128i*)c80000000);
+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
+ return _mm_or_si128(res128, mask);
}
//**********Reciprocal square root estimate ****************
//**********************************************************
//no reciprocal square root for ints is available in IA32, nor an unsigned int to float 4-lane conversion, so a serial solution looks faster
+//but the particular implementation of vrsqrte_u32 may vary across ARM compilers
+//the ARM NEON and x86 SIMD results may be slightly different
+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
+{
+ float32x4_t res;
+ __m64_128 res64;
+ res = _mm_rsqrt_ps(_pM128(a));
+ _M64f(res64, res);
+ return res64;
+}
+
+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //Input is fixed point number!!!
+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
+ uint32x2_t res;
+ __m128 tmp;
+ float r, resf, coeff;
+ int i, q0, s;
+ for (i =0; i<2; i++){
+ if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
+ res.m64_u32[i] = 0xffffffff;
+ }else{
+ resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
+ r = ((float)q0 + 0.5) / coeff;
+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
+ _mm_store_ss(&r, tmp);
+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+ r = (float)s / 256.0;
+ res.m64_u32[i] = r * (((uint32_t)1) << 31);
+ }
+ }
+ return res;
+}
-float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
-//the ARM NEON and x86 SIMD results may be slightly different
+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
#define vrsqrteq_f32 _mm_rsqrt_ps
-uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
-#define vrsqrteq_u32(a) _mm_castps_si128(_mm_rsqrt_ps(_M128(a)) )
-
+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //Input is fixed point number!!!
+ //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
+ _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+ _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
+ _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
+ __m128 tmp;
+ __m128i res128, mask, zero;
+ float r, resf, coeff;
+ int i, q0, s;
+ _mm_store_si128((__m128i*)atmp, a);
+ zero = _mm_setzero_si128();
+ for (i =0; i<4; i++){
+ resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
+ coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
+ q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
+ r = ((float)q0 + 0.5) / coeff;
+ tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
+ _mm_store_ss(&r, tmp);
+ s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+ r = (float)s / 256.0;
+ res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+ }
+ res128 = _mm_load_si128((__m128i*)res);
+ mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
+ mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
+ return _mm_or_si128(res128, mask);
+}
//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
//******************************************************************************************
//******VRECPS (Vector Reciprocal Step) ***************************************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
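//Illustrative usage sketch (not part of the original header): vrecps/vrecpsq are typically used as
//the Newton-Raphson step refining a vrecpe estimate of 1/d; vmulq_f32 is assumed to be defined
//elsewhere in this header:
//   float32x4_t est = vrecpeq_f32(d);           //rough estimate of 1/d
//   est = vmulq_f32(est, vrecpsq_f32(d, est));  //est *= (2 - d*est), one refinement step
//   est = vmulq_f32(est, vrecpsq_f32(d, est));  //a second step for near full float precision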
-float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
+{
+ float32x4_t res;
+ __m64_128 res64;
+ res = vrecpsq_f32(_pM128(a), _pM128(b));
+ _M64f(res64, res);
+ return res64;
+}
+
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
{
__m128 f2, mul;
f2 = _mm_set1_ps(2.);
@@ -3503,8 +6886,17 @@ _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) /
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
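//Illustrative usage sketch (not part of the original header): the analogous Newton-Raphson step for
//refining a vrsqrte estimate of 1/sqrt(x), again assuming vmulq_f32 is available:
//   float32x4_t est = vrsqrteq_f32(x);                          //rough estimate of 1/sqrt(x)
//   est = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(x, est), est)); //est *= (3 - x*est*est)/2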
-float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
+{
+ float32x2_t res;
+ res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
+ res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
+ return res;
+}
+
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
{
__m128 f3, f05, mul;
f3 = _mm_set1_ps(3.);
@@ -3528,54 +6920,110 @@ _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b)
else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
return _mm_load_si128((__m128i*)res);
-int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
+ int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
+ for (i = 0; i<LEN; i++) { \
+ if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
+ else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
+ return res;
+
+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(8, i, 8)
+}
+
+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(16, i, 4)
+}
+
+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(32, i, 2)
+}
+
+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(64, i, 1)
+}
+
+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(8, u, 8)
+}
+
+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(16, u, 4)
+}
+
+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SHIFT_64(32, u, 2)
+}
+
+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we used the generic SERIAL_SHIFT macro, special processing for large shift values would be needed
+{
+ SERIAL_SHIFT_64(64, u, 1)
+}
+
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int8_t, int8_t, 16, 16)
}
-int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int16_t, int16_t, 8, 8)
}
-int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int32_t, int32_t, 4, 4)
}
-int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(int64_t, int64_t, 2, 2)
}
-uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
}
-uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
}
-uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
}
-uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
}
+
//*********** Vector saturating shift left: (negative values shift right) **********************
//********************************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
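//Illustrative usage sketch (not part of the original header): the saturating shifts clamp to the
//type limits instead of discarding overflowing bits, e.g. for vqshl_s8 a lane value of 100 shifted
//left by 2 (400 > 127) yields 127 and a lane value of -100 yields -128; negative counts shift right:
//   int8x8_t r = vqshl_s8(a, b);  //b holds per-lane signed shift counts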
@@ -3612,54 +7060,134 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t
res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
-int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
+ int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
+ int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
+ for (i = 0; i<LEN; i++) { \
+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+ else{ \
+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+ else{ \
+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+ }else{ \
+ limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+ return res;
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+ int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+ int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+ for (i = 0; i<LEN; i++) { \
+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+ }else{ \
+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+ else{ \
+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+ else{ \
+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
+ return res;
+
+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
+}
+
+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
+}
+
+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
+}
+
+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
+}
+
+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
+}
+
+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
+}
+
+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
+}
+
+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
+}
+
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
}
-int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
}
-int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
}
-int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
}
-uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
}
-uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
}
-uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
}
-uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
}
+
//******** Vector rounding shift left: (negative values shift right) **********
//****************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
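//Illustrative usage sketch (not part of the original header): for negative shift counts the rounding
//shifts add back the last bit shifted out, e.g. vrshl_s8 of a lane value 3 with count -1 gives
//(3 >> 1) + 1 = 2, i.e. 3/2 rounded to nearest rather than truncated to 1:
//   int8x8_t r = vrshl_s8(a, b);  //b holds per-lane signed shift counts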
@@ -3677,54 +7205,117 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t
(atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
return _mm_load_si128((__m128i*)res);
-int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+
+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
+ int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
+ for (i = 0; i<LEN; i++) { \
+ if( b.m64_i ## TYPE[i] >= 0) { \
+ if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
+ else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
+ }else{ \
+ res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
+ (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
+ (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
+ return res;
+
+
+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(8,i,8)
+}
+
+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(16,i,4)
+}
+
+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(32,i,2)
+}
+
+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(64,i,1)
+}
+
+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(8,u,8)
+}
+
+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(16,u,4)
+}
+
+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(32,u,2)
+}
+
+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_ROUNDING_SHIFT_64(64,u,1)
+}
+
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
}
-int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
}
-int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
}
-int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
}
-uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
}
-uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
}
-uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
}
-uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
}
+
//********** Vector saturating rounding shift left: (negative values shift right) ****************
//*************************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
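//Illustrative usage sketch (not part of the original header): these variants combine both behaviours
//above, rounding on negative shift counts and saturating on overflowing left shifts, e.g. for
//vqrshl_s8 a lane of 100 with count 2 still saturates to 127 and a lane of 3 with count -1 rounds to 2:
//   int8x8_t r = vqrshl_s8(a, b);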
@@ -3762,49 +7353,128 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t
res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
return _mm_load_si128((__m128i*)res);
-int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
+ __m64_128 res; int ## TYPE ## _t limit; int i; \
+ int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
+ for (i = 0; i<LEN; i++) { \
+ if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+ else{ \
+ if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+ else{ \
+ if (b.m64_i ## TYPE[i]>lanesize_1) { \
+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+ }else{ \
+ limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+ if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+ res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+ else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+ return res;
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+ __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+ int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+ for (i = 0; i<LEN; i++) { \
+ if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+ }else{ \
+ if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+ else{ \
+ if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+ else{ \
+ limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+ res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+ return res;
+
+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
+}
+
+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
+}
+
+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
+}
+
+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
+}
+
+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
+}
+
+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
+}
+
+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
+}
+
+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
+}
+
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
}
-int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
}
-int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
}
-int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
}
-uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
}
-uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
}
-uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
}
-uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
@@ -3815,249 +7485,535 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t
// *********************************************************************************
//**************** Vector shift right by constant*************************************
//************************************************************************************
+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
+{
+ //no 8 bit shift available, go to 16 bit
+ int8x8_t res64;
+ __m128i r;
+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ r = _mm_srai_epi16 (r, b); //SSE2
+ r = _mm_packs_epi16 (r,r); //we need 64 bits only
+ return64(r);
+}
+
+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
+{
+ int16x4_t res64;
+ return64(_mm_srai_epi16(_pM128i(a), b));
+}
+
+
+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
+{
+ int32x2_t res64;
+ return64(_mm_srai_epi32(_pM128i(a), b));
+}
+
+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //no arithmetic shift for 64bit values, serial solution used
+ int64x1_t res;
+ if(b>=64) res.m64_i64[0] = (a.m64_i64[0] < 0) ? -1 : 0; //all bits shifted out, only the sign remains
+ else res.m64_i64[0] = a.m64_i64[0] >> b;
+ return res;
+}
+
+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
+{
+ //no 8 bit shift available, go to 16 bit
+ uint8x8_t res64;
+ __m128i r;
+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+ r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
+ return64(r);
+}
-int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
-{ //no 8 bit shift available, go to 16 bit trick
+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
+{
+ uint16x4_t res64;
+ return64(_mm_srli_epi16(_pM128i(a), b));
+}
+
+
+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
+{
+ uint32x2_t res64;
+ return64(_mm_srli_epi32(_pM128i(a), b));
+}
+
+
+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
+{
+ uint64x1_t res64;
+ return64(_mm_srli_epi64(_pM128i(a), b));
+}
+
+
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
+{
+ //no 8 bit shift available, go to 16 bit trick
__m128i zero, mask0, a_sign, r, a_sign_mask;
_NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
zero = _mm_setzero_si128();
- mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
- a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
+ mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
+ a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
r = _mm_srai_epi16 (a, b);
a_sign_mask = _mm_and_si128 (mask0, a_sign);
r = _mm_andnot_si128 (mask0, r);
return _mm_or_si128 (r, a_sign_mask);
}
-int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
#define vshrq_n_s16 _mm_srai_epi16
-int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
#define vshrq_n_s32 _mm_srai_epi32
-int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{ //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
+{
+ //the SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
__m128i c1, signmask,a0, res64;
_NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
- c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
+ c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
signmask = _mm_slli_epi64 (c1, (64 - b));
- a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
- #ifdef USE_SSE4
- a0 = _mm_cmpeq_epi64 (a, a0); //SSE4.1
- #else
- a0 = _mm_cmpeq_epi32 (a, a0);
- a0 = _mm_shuffle_epi32 (a0, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
- #endif
+ a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
+ a0 = _MM_CMPEQ_EPI64 (a, a0);
signmask = _mm_and_si128(a0, signmask);
res64 = _mm_srli_epi64 (a, b);
return _mm_or_si128(res64, signmask);
}
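+//Illustrative scalar sketch (added, not part of the original header) of the identity
+//the vector code above reproduces for b in [1,63]; the helper name is hypothetical.
+_NEON2SSE_INLINE int64_t _neon2sse_sketch_sra64(int64_t x, int b)
+{
+ uint64_t logical = (uint64_t)x >> b; //SSE2 provides only the logical 64-bit shift
+ uint64_t signfill = (x < 0) ? (~(uint64_t)0) << (64 - b) : (uint64_t)0; //ones for the vacated high bits of a negative x
+ return (int64_t)(logical | signfill);
+}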
-uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
-{ //no 8 bit shift available, need the special trick
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
+{
+ //no 8 bit shift available, need the special trick
__m128i mask0, r;
_NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
- mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
r = _mm_srli_epi16 ( a, b);
return _mm_and_si128 (r, mask0);
}
-uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
#define vshrq_n_u16 _mm_srli_epi16
-uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
#define vshrq_n_u32 _mm_srli_epi32
-uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
#define vshrq_n_u64 _mm_srli_epi64
//*************************** Vector shift left by constant *************************
//*********************************************************************************
+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
+{
+ //no 8 bit shift available, go to 16 bit
+ int8x8_t res64;
+ __m128i r;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ r = _mm_slli_epi16 (r, b); //SSE2
+ r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
+ return64(r);
+}
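+//Explanatory note (added for clarity, not from the original source): the shuffle with
+//mask8_16_even_odd gathers the even-numbered bytes, i.e. the low byte of every 16-bit
+//lane, into the low 8 bytes of the register; that is the 16->8 bit narrowing step after
+//the widened shift, and return64() then keeps only those low 64 bits.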
-int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
+{
+ int16x4_t res64;
+ return64(_mm_slli_epi16(_pM128i(a), b));
+}
+
+
+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
+{
+ int32x2_t res64;
+ return64(_mm_slli_epi32(_pM128i(a), b));
+}
+
+
+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
+{
+ int64x1_t res64;
+ return64(_mm_slli_epi64(_pM128i(a), b));
+}
+
+
+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
+{
+ //no 8 bit shift available, go to 16 bit
+ uint8x8_t res64;
+ __m128i mask8;
+ __m128i r;
+ mask8 = _mm_set1_epi16(0xff);
+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+ r = _mm_slli_epi16 (r, b); //SSE2
+ r = _mm_and_si128(r, mask8); //to avoid saturation
+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
+ return64(r);
+}
+
+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+#define vshl_n_u16 vshl_n_s16
+
+
+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+#define vshl_n_u32 vshl_n_s32
+
+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+#define vshl_n_u64 vshl_n_s64
+
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
#define vshlq_n_s8 vshlq_n_u8
-int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_s16 _mm_slli_epi16
-int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_s32 _mm_slli_epi32
-int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_s64 _mm_slli_epi64
-uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
-{ //no 8 bit shift available, need the special trick
+{
+ //no 8 bit shift available, need the special trick
__m128i mask0, r;
_NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
- mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
+ mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
r = _mm_slli_epi16 ( a, b);
return _mm_and_si128 (r, mask0);
}
-uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_u16 vshlq_n_s16
-uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_u32 vshlq_n_s32
-uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_u64 vshlq_n_s64
//************* Vector rounding shift right by constant ******************
//*************************************************************************
//No corresponding x86 intrinsics exist, need to do some tricks
+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
+{
+ //no 8 bit shift available, go to 16 bit
+ int8x8_t res64;
+ __m128i r, maskb;
+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+ r = _mm_srai_epi16 (r, b);
+ r = _mm_add_epi16 (r, maskb); //actual rounding
+ r = _mm_packs_epi16 (r,r); //we need 64 bits only
+ return64(r);
+}
+
+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
+{
+ int16x4_t res64;
+ return64(vrshrq_n_s16(_pM128i(a), b));
+}
+
+
+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
+{
+ int32x2_t res64;
+ return64(vrshrq_n_s32(_pM128i(a), b));
+}
+
+
+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ //serial solution is faster
+ int64x1_t res;
+ int64_t a_i64 = *( int64_t*)&a;
+ if(b==64) {
+ res.m64_i64[0] = 0; //for some compilers rounding happens here and (a_i64 & _SIGNBIT64) >> 63 would be needed instead
+ } else {
+ int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
+ res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
+ }
+ return res;
+}
+
+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
+{
+ //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
+ uint8x8_t res64;
+ __m128i r, maskb;
+ r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+ maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+ r = _mm_srli_epi16 (r, b);
+ r = _mm_add_epi16 (r, maskb); //actual rounding
+ r = _mm_packus_epi16 (r,r); //we need 64 bits only
+ return64(r);
+}
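+//Illustrative scalar sketch (added, not part of the original header) of the rounding
+//rule the maskb computations above implement for b in [1,8]; the name is hypothetical.
+_NEON2SSE_INLINE uint8_t _neon2sse_sketch_urshr8(uint8_t x, int b)
+{
+ unsigned wide = x; //same widening idea as _MM_CVTEPU8_EPI16 above
+ return (uint8_t)((wide >> b) + ((wide >> (b - 1)) & 1)); //shift, then add the dropped (b-1)-th bit to round to nearest
+}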
+
+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
+{
+ uint16x4_t res64;
+ return64(vrshrq_n_u16(_pM128i(a), b));
+}
+
+
+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
+{
+ uint32x2_t res64;
+ return64(vrshrq_n_u32(_pM128i(a), b));
+}
+
+
+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
+{
+ uint64x1_t res64;
+ return64(vrshrq_n_u64(_pM128i(a), b));
+}
-int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
-{ //no 8 bit shift available, go to 16 bit trick
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
+{
+ //no 8 bit shift available, go to 16 bit trick
__m128i r, mask1, maskb;
- _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
r = vshrq_n_s8 (a, b);
- mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
- maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
- maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
- return _mm_add_epi8(r, maskb); //actual rounding
+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
+ return _mm_add_epi8(r, maskb); //actual rounding
}
-int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
__m128i maskb, r;
- maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
r = _mm_srai_epi16 (a, b);
- return _mm_add_epi16 (r, maskb); //actual rounding
+ return _mm_add_epi16 (r, maskb); //actual rounding
}
-int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
__m128i maskb, r;
- maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
r = _mm_srai_epi32(a, b);
- return _mm_add_epi32 (r, maskb); //actual rounding
+ return _mm_add_epi32 (r, maskb); //actual rounding
}
-int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{ //solution may be not optimal compared with a serial one
+{
+ //solution may not be optimal compared with a serial one
__m128i maskb;
int64x2_t r;
- maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
r = vshrq_n_s64(a, b);
- return _mm_add_epi64 (r, maskb); //actual rounding
+ return _mm_add_epi64 (r, maskb); //actual rounding
}
-uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
-{ //no 8 bit shift available, go to 16 bit trick
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
+{
+ //no 8 bit shift available, go to 16 bit trick
__m128i r, mask1, maskb;
- _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+ _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
r = vshrq_n_u8 (a, b);
- mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
- maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
- maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
- return _mm_add_epi8(r, maskb); //actual rounding
+ mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+ maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+ maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
+ return _mm_add_epi8(r, maskb); //actual rounding
}
-uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
__m128i maskb, r;
- maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+ maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi16(maskb, 15); //1 or 0
r = _mm_srli_epi16 (a, b);
- return _mm_add_epi16 (r, maskb); //actual rounding
+ return _mm_add_epi16 (r, maskb); //actual rounding
}
-uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
__m128i maskb, r;
- maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+ maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi32 (maskb,31); //1 or 0
r = _mm_srli_epi32(a, b);
- return _mm_add_epi32 (r, maskb); //actual rounding
+ return _mm_add_epi32 (r, maskb); //actual rounding
}
-uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
-{ //solution may be not optimal compared with a serial one
+{
+ //solution may not be optimal compared with a serial one
__m128i maskb, r;
- maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
- maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+ maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+ maskb = _mm_srli_epi64 (maskb,63); //1 or 0
r = _mm_srli_epi64(a, b);
- return _mm_add_epi64 (r, maskb); //actual rounding
+ return _mm_add_epi64 (r, maskb); //actual rounding
}
//************* Vector shift right by constant and accumulate *********
//*********************************************************************
+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
+{
+ int8x8_t shift;
+ shift = vshr_n_s8(b, c);
+ return vadd_s8( a, shift);
+}
+
+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
+{
+ int16x4_t shift;
+ shift = vshr_n_s16( b, c);
+ return vadd_s16(a, shift);
+}
+
+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
+{
+ //may not be optimal compared with the serial execution
+ int32x2_t shift;
+ shift = vshr_n_s32(b, c);
+ return vadd_s32( a, shift);
+}
+
+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+ //may not be optimal compared with a serial solution
+ int64x1_t shift;
+ shift = vshr_n_s64(b, c);
+ return vadd_s64( a, shift);
+}
+
+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
+{
+ uint8x8_t shift;
+ shift = vshr_n_u8(b, c);
+ return vadd_u8(a, shift);
+}
+
+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
+{
+ uint16x4_t shift;
+ shift = vshr_n_u16(b, c);
+ return vadd_u16(a,shift);
+}
-int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
+{
+ //may not be optimal compared with the serial execution
+ uint32x2_t shift;
+ shift = vshr_n_u32(b, c);
+ return vadd_u32( a, shift);
+}
+
+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
+{
+ //may not be optimal compared with the serial execution
+ uint64x1_t shift;
+ shift = vshr_n_u64(b, c);
+ return vadd_u64(a, shift);
+}
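+//Illustrative usage sketch (added, helper name is hypothetical): the typical use of
+//shift-right-and-accumulate is adding a scaled-down sample to a running total,
+//here acc + sample/16 per lane via the vsra_n_u16 defined above.
+_NEON2SSE_INLINE uint16x4_t _neon2sse_sketch_accum_div16(uint16x4_t acc, uint16x4_t sample)
+{
+ return vsra_n_u16(acc, sample, 4); //lane-wise acc + (sample >> 4)
+}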
+
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
{
int8x16_t shift;
shift = vshrq_n_s8(b, c);
return vaddq_s8(a, shift);
}
-int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
{
int16x8_t shift;
shift = vshrq_n_s16(b, c);
return vaddq_s16(a, shift);
}
-int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
{
int32x4_t shift;
shift = vshrq_n_s32(b, c);
return vaddq_s32(a, shift);
}
-int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
{
int64x2_t shift;
shift = vshrq_n_s64(b, c);
return vaddq_s64( a, shift);
}
-uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
{
uint8x16_t shift;
shift = vshrq_n_u8(b, c);
return vaddq_u8(a, shift);
}
-uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
{
uint16x8_t shift;
shift = vshrq_n_u16(b, c);
return vaddq_u16(a, shift);
}
-uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
{
uint32x4_t shift;
shift = vshrq_n_u32(b, c);
return vaddq_u32(a, shift);
}
-uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
-_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
{
uint64x2_t shift;
shift = vshrq_n_u64(b, c);
@@ -4066,32 +8022,98 @@ _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange
//************* Vector rounding shift right by constant and accumulate ****************************
//************************************************************************************************
+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
+{
+ int8x8_t shift;
+ shift = vrshr_n_s8(b, c);
+ return vadd_s8( a, shift);
+}
+
+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
+{
+ int16x4_t shift;
+ shift = vrshr_n_s16( b, c);
+ return vadd_s16(a, shift);
+}
+
+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
+{
+ //may not be optimal compared with the serial execution
+ int32x2_t shift;
+ shift = vrshr_n_s32(b, c);
+ return vadd_s32( a, shift);
+}
-int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+ int64x1_t shift;
+ shift = vrshr_n_s64(b, c);
+ return vadd_s64( a, shift);
+}
+
+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
+{
+ uint8x8_t shift;
+ shift = vrshr_n_u8(b, c);
+ return vadd_u8(a, shift);
+}
+
+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
+{
+ uint16x4_t shift;
+ shift = vrshr_n_u16(b, c);
+ return vadd_u16(a,shift);
+}
+
+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
+{
+ //may not be optimal compared with the serial execution
+ uint32x2_t shift;
+ shift = vrshr_n_u32(b, c);
+ return vadd_u32( a, shift);
+}
+
+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+ //may not be optimal compared with the serial execution
+ uint64x1_t shift;
+ shift = vrshr_n_u64(b, c);
+ return vadd_u64( a, shift);
+}
+
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
{
int8x16_t shift;
shift = vrshrq_n_s8(b, c);
return vaddq_s8(a, shift);
}
-int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
{
int16x8_t shift;
shift = vrshrq_n_s16(b, c);
return vaddq_s16(a, shift);
}
-int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
{
int32x4_t shift;
shift = vrshrq_n_s32(b, c);
return vaddq_s32(a, shift);
}
-int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
{
int64x2_t shift;
@@ -4099,31 +8121,31 @@ _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1
return vaddq_s64(a, shift);
}
-uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
{
uint8x16_t shift;
shift = vrshrq_n_u8(b, c);
return vaddq_u8(a, shift);
}
-uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
{
uint16x8_t shift;
shift = vrshrq_n_u16(b, c);
return vaddq_u16(a, shift);
}
-uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
{
uint32x4_t shift;
shift = vrshrq_n_u32(b, c);
return vaddq_u32(a, shift);
}
-uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
{
uint64x2_t shift;
@@ -4134,61 +8156,157 @@ _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrang
//**********************Vector saturating shift left by constant *****************************
//********************************************************************************************
//we don't check const ranges assuming they are met
+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
+{
+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+ int8x8_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi16 (a128, b);
+ r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
+ return64(r128);
+}
+
+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
+{
+ // go to 32 bit to get the auto saturation (in packs function)
+ int16x4_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi32 (a128, b); //shift_res
+ r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
+ return64(r128);
+}
+
+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
+{
+ //serial execution may be faster
+ int32x2_t res64;
+ return64(vqshlq_n_s32 (_pM128i(a), b));
+}
+
+
+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ // no effective SIMD solution here
+ int64x1_t res;
+ int64_t bmask;
+ int64_t a_i64 = *( int64_t*)&a;
+ bmask = ( int64_t)1 << (63 - b); //positive
+ if (a_i64 >= bmask) {
+ res.m64_i64[0] = ~(_SIGNBIT64);
+ } else {
+ res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
+ }
+ return res;
+}
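+//Explanatory note (added for clarity, not from the original source): a << b overflows a
+//signed 64-bit lane exactly when |a| >= 2^(63-b), so bmask = 1 << (63 - b) is the
+//saturation threshold: lanes at or above it clamp to ~_SIGNBIT64 (INT64_MAX) and lanes
+//at or below -bmask clamp to _SIGNBIT64 (INT64_MIN).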
+
+
+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
+{
+ //no 8 bit shift available in IA32 SIMD, go to 16 bit
+ uint8x8_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi16 (a128, b); //shift_res
+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+ return64(r128);
+}
-int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
-{ // go to 16 bit to get the auto saturation (in packs function)
+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
+{
+ // go to 32 bit to get the auto saturation (in packus function)
+ uint16x4_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi32 (a128, b); //shift_res
+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16
+ return64(r128);
+}
+
+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
+{
+ uint32x2_t res64;
+ return64(vqshlq_n_u32(_pM128i(a), b));
+}
+
+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ // no effective SIMD solution here
+ uint64x1_t res;
+ uint64_t bmask;
+ uint64_t a_i64 = *(uint64_t*)&a;
+ bmask = ( uint64_t)1 << (64 - b);
+ res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
+ return res;
+}
+
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
+{
+ // go to 16 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
- a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
- return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
+ return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
}
-int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
-{ // manual saturation solution looks LESS optimal than 32 bits conversion one
- // go to 32 bit to get the auto saturation (in packs function)
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
+{
+ // manual saturation solution looks LESS optimal than 32 bits conversion one
+ // go to 32 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
- a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
- r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI16_EPI32 (a128);
r128_2 = _mm_slli_epi32 (a128, b);
- return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
+ return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
}
-int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
-{ // no 64 bit saturation option available, special tricks necessary
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
+{
+ // no 64 bit saturation option available, special tricks necessary
__m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
- c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
- maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
- saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
- c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
+ c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
+ maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
+ saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
+ c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
shift_res = _mm_slli_epi32 (a, b);
shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
//result with positive numbers saturated
shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
//treat negative numbers
- maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros
- saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
- c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
+ maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros
+ saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
+ c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
}
-int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{ // no effective SIMD solution here
+{
+ // no effective SIMD solution here
_NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
int64_t bmask;
int i;
- bmask = ( int64_t)1 << (63 - b); //positive
+ bmask = ( int64_t)1 << (63 - b); //positive
_mm_store_si128((__m128i*)atmp, a);
for (i = 0; i<2; i++) {
if (atmp[i] >= bmask) {
@@ -4200,110 +8318,158 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t
return _mm_load_si128((__m128i*)res);
}
-uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
-{ // go to 16 bit to get the auto saturation (in packs function)
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
+{
+ // go to 16 bit to get the auto saturation (in packs function)
__m128i a128, r128_1, r128_2;
- a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
+ a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPU8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
- return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}
-uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
-{ // manual saturation solution looks more optimal than 32 bits conversion one
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
+{
+ // manual saturation solution looks more optimal than 32 bits conversion one
__m128i cb, c8000, a_signed, saturation_mask, shift_res;
cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
c8000 = _mm_set1_epi16 (0x8000);
//no unsigned shorts comparison in SSE, only signed available, so need the trick
- a_signed = _mm_sub_epi16(a, c8000); //go to signed
+ a_signed = _mm_sub_epi16(a, c8000); //go to signed
saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
shift_res = _mm_slli_epi16 (a, b);
return _mm_or_si128 (shift_res, saturation_mask);
}
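+//Explanatory note (added for clarity, not from the original source): SSE has no unsigned
+//16-bit compare, so a is biased by 0x8000 into the signed range and compared against
+//cb = (2^(16-b) - 1) - 0x8000; lanes above that bound would lose bits in a << b, and
+//OR-ing the all-ones compare mask into the shifted result saturates them to 0xffff.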
-uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
-{ // manual saturation solution, no 64 bit saturation option, the serial version may be faster
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
+{
+ // manual saturation solution, no 64 bit saturation option, the serial version may be faster
__m128i cb, c80000000, a_signed, saturation_mask, shift_res;
cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
c80000000 = _mm_set1_epi32 (0x80000000);
//no unsigned ints comparison in SSE, only signed available, so need the trick
- a_signed = _mm_sub_epi32(a, c80000000); //go to signed
+ a_signed = _mm_sub_epi32(a, c80000000); //go to signed
saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
shift_res = _mm_slli_epi32 (a, b);
return _mm_or_si128 (shift_res, saturation_mask);
}
-uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{ // no effective SIMD solution here
+{
+ // no effective SIMD solution here
_NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
uint64_t bmask;
int i;
bmask = ( uint64_t)1 << (64 - b);
_mm_store_si128((__m128i*)atmp, a);
for (i = 0; i<2; i++) {
- res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
+ res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
}
return _mm_load_si128((__m128i*)res);
}
//**************Vector signed->unsigned saturating shift left by constant *************
//*************************************************************************************
+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
+{
+ //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+ uint8x8_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi16 (a128, b);
+ r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+ return64(r128);
+}
+
+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
+{
+ uint16x4_t res64;
+ __m128i a128, r128;
+ a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+ r128 = _mm_slli_epi32 (a128, b); //shift_res
+ r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
+ return64(r128);
+}
+
+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
+{
+ uint32x2_t res64;
+ return64( vqshluq_n_s32(_pM128i(a), b));
+}
+
+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
+{
+ uint64x1_t res;
+ uint64_t limit;
+ if (a.m64_i64[0]<=0) {
+ res.m64_u64[0] = 0;
+ } else {
+ limit = (uint64_t) 1 << (64 - b);
+ res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
+ }
+ return res;
+}
-uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
{
__m128i a128, r128_1, r128_2;
- a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+ a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
r128_1 = _mm_slli_epi16 (a128, b);
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI8_EPI16 (a128);
r128_2 = _mm_slli_epi16 (a128, b);
- return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+ return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}
-#if defined(USE_SSSE3)
-uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
-{ // manual saturation solution looks LESS optimal than 32 bits conversion one
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
+{
+ // manual saturation solution looks LESS optimal than 32 bits conversion one
__m128i a128, r128_1, r128_2;
- a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
- r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+ a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+ r128_1 = _mm_slli_epi32 (a128, b); //shift_res
//swap hi and low part of a128 to process the remaining data
a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
a128 = _MM_CVTEPI16_EPI32 (a128);
r128_2 = _mm_slli_epi32 (a128, b);
- return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
+ return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
}
-#endif
-uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
-{ //solution may be not optimal compared with the serial one
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
+{
+ //solution may not be optimal compared with the serial one
__m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
zero = _mm_setzero_si128();
maskA = _mm_cmpeq_epi32(a, a);
- maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
+ maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
//saturate negative numbers to zero
- maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
- a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
+ maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
+ a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
//saturate positive to 0xffffffff
a_masked = _mm_and_si128 (a0, maskA);
- a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
+ a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
a_shift = _mm_slli_epi32 (a0, b);
- return _mm_or_si128 (a_shift, a_masked); //actual saturation
+ return _mm_or_si128 (a_shift, a_masked); //actual saturation
}
-uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{ // no effective SIMD solution here, serial execution looks faster
+{
+ // no effective SIMD solution here, serial execution looks faster
_NEON2SSE_ALIGN_16 int64_t atmp[2];
_NEON2SSE_ALIGN_16 uint64_t res[2];
uint64_t limit;
@@ -4322,23 +8488,437 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_
//************** Vector narrowing shift right by constant **************
//**********************************************************************
+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+ int8x8_t res64;
+ __m128i r16;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ r16 = vshrq_n_s16(a,b);
+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+ return64(r16);
+}
+
+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+ int16x4_t res64;
+ __m128i r32;
+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //gather the low 16 bits of each 32-bit lane
+ r32 = vshrq_n_s32(a,b);
+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+ return64(r32);
+}
+
+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+ int32x2_t res64;
+ __m128i r64;
+ r64 = vshrq_n_s64(a,b);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
+
+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+ uint8x8_t res64;
+ __m128i mask, r16;
+ mask = _mm_set1_epi16(0xff);
+ r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
+ r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+ uint16x4_t res64;
+ __m128i mask, r32;
+ mask = _mm_set1_epi32(0xffff);
+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+ uint32x2_t res64;
+ __m128i r64;
+ r64 = vshrq_n_u64(a,b);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
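+//Illustrative usage sketch (added, helper name is hypothetical): a narrowing right shift
+//by 8 is the usual way to turn Q8.8 fixed-point lanes back into plain 8-bit integers,
+//using the vshrn_n_u16 defined above.
+_NEON2SSE_INLINE uint8x8_t _neon2sse_sketch_q88_to_u8(uint16x8_t q88)
+{
+ return vshrn_n_u16(q88, 8); //drop the 8 fractional bits and narrow each lane to 8 bits
+}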
//************** Vector signed->unsigned narrowing saturating shift right by constant ********
//*********************************************************************************************
+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
+{
+ uint8x8_t res64;
+ __m128i r16;
+ r16 = vshrq_n_s16(a,b);
+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
+{
+ uint16x4_t res64;
+ __m128i r32;
+ r32 = vshrq_n_s32(a,b);
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
+ uint32x2_t res;
+ int64_t res64;
+ _mm_store_si128((__m128i*)atmp, a);
+ if (atmp[0] < 0) {
+ res.m64_u32[0] = 0;
+ } else {
+ res64 = (atmp[0] >> b);
+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
+ }
+ if (atmp[1] < 0) {
+ res.m64_u32[1] = 0;
+ } else {
+ res64 = (atmp[1] >> b);
+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
+ }
+ return res;
+}
//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
+{
+ //solution may not be optimal compared with the serial one
+ __m128i r16;
+ uint8x8_t res64;
+ r16 = vrshrq_n_s16(a,b);
+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
+{
+ //solution may not be optimal compared with the serial one
+ __m128i r32;
+ uint16x4_t res64;
+ r32 = vrshrq_n_s32(a,b);
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
+ uint32x2_t res;
+ int64_t res64;
+ _mm_store_si128((__m128i*)atmp, a);
+ if (atmp[0] < 0) {
+ res.m64_u32[0] = 0;
+ } else {
+ res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
+ res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
+ }
+ if (atmp[1] < 0) {
+ res.m64_u32[1] = 0;
+ } else {
+ res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
+ res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
+ }
+ return res;
+}
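+//Explanatory note (added for clarity, not from the original source): for non-negative
+//lanes the expression (x >> b) + ((x & (1 << (b-1))) >> (b-1)) is x / 2^b rounded to
+//nearest with halves rounded up, i.e. the VQRSHRUN rounding rule, applied before the
+//clamp to 0xffffffff.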
//***** Vector narrowing saturating shift right by constant ******
//*****************************************************************
+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
+{
+ int8x8_t res64;
+ __m128i r16;
+ r16 = vshrq_n_s16(a,b);
+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
+ return64(r16);
+}
+
+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
+{
+ int16x4_t res64;
+ __m128i r32;
+ r32 = vshrq_n_s32(a,b);
+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ //no optimal SIMD solution found
+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
+ int32x2_t res;
+ _mm_store_si128((__m128i*)atmp, a);
+ res64[0] = (atmp[0] >> b);
+ res64[1] = (atmp[1] >> b);
+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+ res.m64_i32[0] = (int32_t)res64[0];
+ res.m64_i32[1] = (int32_t)res64[1];
+ return res;
+}
+
+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
+{
+ uint8x8_t res64;
+ __m128i r16;
+ r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
+{
+ uint16x4_t res64;
+ __m128i r32;
+ r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+ //serial solution may be faster
+ uint32x2_t res64;
+ __m128i r64, res_hi, zero;
+ zero = _mm_setzero_si128();
+ r64 = vshrq_n_u64(a,b);
+ res_hi = _mm_srli_epi64(r64, 32);
+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
+ r64 = _mm_or_si128(r64, res_hi);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
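+//Note: in vqshrn_n_u64 above a nonzero high 32-bit half after the shift means the value exceeds 0xffffffff;
+//the signed compare is safe because b>=1 keeps the top bit clear, so it yields an all-ones mask and the OR saturates the low half.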
+
//********* Vector rounding narrowing shift right by constant *************************
//****************************************************************************************
+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+ int8x8_t res64;
+ __m128i r16;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ r16 = vrshrq_n_s16(a,b);
+ r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+ return64(r16);
+}
+
+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+ int16x4_t res64;
+ __m128i r32;
+ _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //low 16 bits of each 32-bit element go to the low 64 bits
+ r32 = vrshrq_n_s32(a,b);
+ r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+ return64(r32);
+}
+
+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+ int32x2_t res64;
+ __m128i r64;
+ r64 = vrshrq_n_s64(a,b);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
+
+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+ uint8x8_t res64;
+ __m128i mask, r16;
+ mask = _mm_set1_epi16(0xff);
+ r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+ r16 = _mm_and_si128(r16, mask); //to avoid saturation
+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+ uint16x4_t res64;
+ __m128i mask, r32;
+ mask = _mm_set1_epi32(0xffff);
+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+ r32 = _mm_and_si128(r32, mask); //to avoid saturation
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
+{
+ uint32x2_t res64;
+ __m128i r64;
+ r64 = vrshrq_n_u64(a,b);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
//************* Vector rounding narrowing saturating shift right by constant ************
//****************************************************************************************
+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
+{
+ int8x8_t res64;
+ __m128i r16;
+ r16 = vrshrq_n_s16(a,b);
+ r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
+ return64(r16);
+}
+
+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
+{
+ int16x4_t res64;
+ __m128i r32;
+ r32 = vrshrq_n_s32(a,b);
+ r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ //no optimal SIMD solution found
+ _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
+ int32x2_t res;
+ _mm_store_si128((__m128i*)atmp, a);
+ maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
+ res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
+ maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
+ res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
+ if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+ if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+ if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+ if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+ res.m64_i32[0] = (int32_t)res64[0];
+ res.m64_i32[1] = (int32_t)res64[1];
+ return res;
+}
+
+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
+{
+ uint8x8_t res64;
+ __m128i r16;
+ r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+ r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
+ return64(r16);
+}
+
+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
+{
+ uint16x4_t res64;
+ __m128i r32;
+ r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+ r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
+ return64(r32);
+}
+
+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+ //serial solution may be faster
+ uint32x2_t res64;
+ __m128i r64, res_hi, zero;
+ zero = _mm_setzero_si128();
+ r64 = vrshrq_n_u64(a,b);
+ res_hi = _mm_srli_epi64(r64, 32);
+ res_hi = _mm_cmpgt_epi32(res_hi, zero);
+ r64 = _mm_or_si128(r64, res_hi);
+ r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+ return64(r64);
+}
//************** Vector widening shift left by constant ****************
//************************************************************************
+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
+{
+ __m128i r;
+ r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+ return _mm_slli_epi16 (r, b);
+}
+
+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
+{
+ __m128i r;
+ r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
+ return _mm_slli_epi32 (r, b);
+}
+
+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
+{
+ __m128i r;
+ r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
+ return _mm_slli_epi64 (r, b);
+}
+
+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
+{
+ //no uint8 to uint16 conversion available, manual conversion used
+ __m128i zero, r;
+ zero = _mm_setzero_si128 ();
+ r = _mm_unpacklo_epi8(_pM128i(a), zero);
+ return _mm_slli_epi16 (r, b);
+}
+
+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
+{
+ //no uint16 to uint32 conversion available, manual conversion used
+ __m128i zero, r;
+ zero = _mm_setzero_si128 ();
+ r = _mm_unpacklo_epi16(_pM128i(a), zero);
+ return _mm_slli_epi32 (r, b);
+}
+
+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
+{
+ //no uint32 to uint64 conversion available, manual conversion used
+ __m128i zero, r;
+ zero = _mm_setzero_si128 ();
+ r = _mm_unpacklo_epi32(_pM128i(a), zero);
+ return _mm_slli_epi64 (r, b);
+}
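+//Usage sketch (illustrative, d is an arbitrary uint8x8_t): uint16x8_t w = vshll_n_u8(d, 3);
+//each 8-bit lane of d is zero-extended to 16 bits and shifted left by 3, i.e. multiplied by 8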
//************************************************************************************
//**************************** Shifts with insert ************************************
@@ -4349,138 +8929,247 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_
//**************** Vector shift right and insert ************************************
//Actually the "c" left bits from "a" are the only bits remained from "a" after the shift.
//All other bits are taken from b shifted.
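+//For example (8-bit, c = 3, illustrative values): a = 0b10110001, b = 0b11111111 gives
+//(a & 0b11100000) | (b >> 3) = 0b10100000 | 0b00011111 = 0b10111111.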
+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
+{
+ int8x8_t res64;
+ return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
-int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
+{
+ int16x4_t res64;
+ return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
+{
+ int32x2_t res64;
+ return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+
+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+ int64x1_t res;
+ if (c == 64)
+ res = a;
+ else {
+ res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
+ }
+ return res;
+}
+
+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_u8 vsri_n_s8
+
+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_u16 vsri_n_s16
+
+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+#define vsri_n_u32 vsri_n_s32
+
+
+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+#define vsri_n_u64 vsri_n_s64
+
+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_p8 vsri_n_u8
+
+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_p16 vsri_n_u16
+
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
{
__m128i maskA, a_masked;
uint8x16_t b_shift;
- _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
- maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
+ _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
+ maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
a_masked = _mm_and_si128 (a, maskA);
- b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
- return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
+ b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
+ return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
}
-int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
-{ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
+{
+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
uint16x8_t b_shift;
uint16x8_t a_c;
- b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
+ b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
a_c = vshrq_n_u16( a, (16 - c));
- a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
- return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+ a_c = _mm_slli_epi16(a_c, (16 - c)); //the logical shifts leave the low (16 - c) bits of a zeroed
+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
-int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
-{ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
+{
+ //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a
uint32x4_t b_shift;
uint32x4_t a_c;
- b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
+ b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
a_c = vshrq_n_u32( a, (32 - c));
- a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
- return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+ a_c = _mm_slli_epi32(a_c, (32 - c)); //the logical shifts leave the low (32 - c) bits of a zeroed
+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
-int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-{ //serial solution may be faster
+{
+ //serial solution may be faster
uint64x2_t b_shift;
uint64x2_t a_c;
- b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
+ b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
a_c = _mm_srli_epi64(a, (64 - c));
- a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
- return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+ a_c = _mm_slli_epi64(a_c, (64 - c)); //the logical shifts leave the low (64 - c) bits of a zeroed
+ return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}
-uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_u8 vsriq_n_s8
-uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_u16 vsriq_n_s16
-uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
#define vsriq_n_u32 vsriq_n_s32
-uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
#define vsriq_n_u64 vsriq_n_s64
-poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_p8 vsriq_n_u8
-poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_p16 vsriq_n_u16
//***** Vector shift left and insert *********************************************
//*********************************************************************************
//Actually the "c" right bits from "a" are the only bits remained from "a" after the shift.
//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
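+//For example (8-bit, c = 3, illustrative values): a = 0b10110001, b = 0b00001111 gives
+//(b << 3) | (a & 0b00000111) = 0b01111000 | 0b00000001 = 0b01111001.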
+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
+{
+ int8x8_t res64;
+ return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
+
+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
+{
+ int16x4_t res64;
+ return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
+}
-int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
+
+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
+{
+ int32x2_t res64;
+ return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
+{
+ int64x1_t res;
+ if (c == 0)
+ res = b; //no bits of a are kept for a zero shift, and the 64-bit shift below would be undefined for c == 0
+ else
+ res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
+ return res;
+}
+
+
+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_u8 vsli_n_s8
+
+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_u16 vsli_n_s16
+
+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+#define vsli_n_u32 vsli_n_s32
+
+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+#define vsli_n_u64 vsli_n_s64
+
+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_p8 vsli_n_u8
+
+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_p16 vsli_n_u16
+
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
{
__m128i maskA, a_masked;
int8x16_t b_shift;
- _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
- maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
+ _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
+ maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
b_shift = vshlq_n_s8( b, c);
a_masked = _mm_and_si128 (a, maskA);
- return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
+ return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
}
-int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
-{ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
+{
+ //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
int16x8_t b_shift;
int16x8_t a_c;
b_shift = vshlq_n_s16( b, c);
a_c = vshlq_n_s16( a, (16 - c));
a_c = _mm_srli_epi16(a_c, (16 - c));
- return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
-int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
-{ //solution may be not optimal compared with the serial one
- //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
+{
+ //solution may not be optimal compared with the serial one
+ //to keep only the "c" right bits of a we shift left and then logically back right, leaving (32 - c) zero bits at the left of a
int32x4_t b_shift;
int32x4_t a_c;
b_shift = vshlq_n_s32( b, c);
a_c = vshlq_n_s32( a, (32 - c));
a_c = _mm_srli_epi32(a_c, (32 - c));
- return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
-int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
-_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
-{ //solution may be not optimal compared with the serial one
- //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
+{
+ //solution may not be optimal compared with the serial one
+ //to keep only the "c" right bits of a we shift left and then logically back right, leaving (64 - c) zero bits at the left of a
int64x2_t b_shift;
int64x2_t a_c;
b_shift = vshlq_n_s64( b, c);
a_c = vshlq_n_s64( a, (64 - c));
a_c = _mm_srli_epi64(a_c, (64 - c));
- return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+ return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}
-uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_u8 vsliq_n_s8
-uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_u16 vsliq_n_s16
-uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
#define vsliq_n_u32 vsliq_n_s32
-uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
#define vsliq_n_u64 vsliq_n_s64
-poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_p8 vsliq_n_u8
-poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_p16 vsliq_n_u16
// ***********************************************************************************************
@@ -4496,31 +9185,31 @@ poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);
#define LOAD_SI128(ptr) \
( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
-uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_u8 LOAD_SI128
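+//Usage sketch (illustrative, buf is an arbitrary uint8_t array of 16 elements): uint8x16_t v = vld1q_u8(buf);
+//this expands to _mm_load_si128 when buf happens to be 16-byte aligned and to _mm_loadu_si128 otherwise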
-uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_u16 LOAD_SI128
-uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_u32 LOAD_SI128
-uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_u64 LOAD_SI128
-int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_s8 LOAD_SI128
-int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_s16 LOAD_SI128
-int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_s32 LOAD_SI128
-int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_s64 LOAD_SI128
-float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
@@ -4528,114 +9217,292 @@ __m128 f2;
f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
}*/
-float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
{
- if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned
+ if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
return _mm_load_ps(ptr);
else
return _mm_loadu_ps(ptr);
}
-poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_p8 LOAD_SI128
-poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_p16 LOAD_SI128
+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
+
+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_u16 vld1_u8
+
+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_u32 vld1_u8
+
+
+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_u64 vld1_u8
+
+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_s8 vld1_u8
+
+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_s16 vld1_u16
+
+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_s32 vld1_u32
+
+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_s64 vld1_u64
+
+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
+{
+ float32x2_t res;
+ res.m64_f32[0] = *(ptr);
+ res.m64_f32[1] = *(ptr + 1);
+ return res;
+}
+
+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_p8 vld1_u8
+
+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_p16 vld1_u16
+
//***********************************************************************************************************
//******* Lane load functions - insert the data at vector's given position (lane) *************************
//***********************************************************************************************************
-uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
-uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
-#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
+
-int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
-float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
-{ //we need to deal with ptr 16bit NOT aligned case
+{
+ //we need to handle the case when ptr is not 16-byte aligned
__m128 p;
p = _mm_set1_ps(*(ptr));
return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
}
-int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
-poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
-poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
-//serial solution may be faster
+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
+{
+ uint8x8_t res;
+ res = vec;
+ res.m64_u8[lane] = *(ptr);
+ return res;
+}
+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
+{
+ uint16x4_t res;
+ res = vec;
+ res.m64_u16[lane] = *(ptr);
+ return res;
+}
+
+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
+{
+ uint32x2_t res;
+ res = vec;
+ res.m64_u32[lane] = *(ptr);
+ return res;
+}
+
+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
+{
+ uint64x1_t res;
+ res.m64_u64[0] = *(ptr);
+ return res;
+}
+
+
+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
+
+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
+
+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
+
+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
+{
+ float32x2_t res;
+ res = vec;
+ res.m64_f32[lane] = *(ptr);
+ return res;
+}
+
+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
+
+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_p8 vld1_lane_u8
+
+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_p16 vld1_lane_s16
+
// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
// ******************************************************************************************************************
-uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
-uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
-uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
-uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
{
_NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
return LOAD_SI128(val);
}
-int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
-int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
-int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
-int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
-float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16, need to go to 32 bits
-float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
-poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
-poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint8x8_t res;
+ int i;
+ for(i = 0; i<8; i++) {
+ res.m64_u8[i] = *(ptr);
+ }
+ return res;
+}
+
+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint16x4_t res;
+ int i;
+ for(i = 0; i<4; i++) {
+ res.m64_u16[i] = *(ptr);
+ }
+ return res;
+}
+
+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint32x2_t res;
+ res.m64_u32[0] = *(ptr);
+ res.m64_u32[1] = *(ptr);
+ return res;
+}
+
+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+ uint64x1_t res;
+ res.m64_u64[0] = *(ptr);
+ return res;
+}
+
+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
+
+
+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
+
+
+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
+
+
+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
+
+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16
+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
+{
+ float32x2_t res;
+ res.m64_f32[0] = *(ptr);
+ res.m64_f32[1] = res.m64_f32[0];
+ return res;
+}
+
+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_p8 vld1_dup_u8
+
+
+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_p16 vld1_dup_u16
+
+
//*************************************************************************************
//********************************* Store **********************************************
//*************************************************************************************
@@ -4644,80 +9511,148 @@ poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD
#define STORE_SI128(ptr, val) \
(((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
-void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_u8 STORE_SI128
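+//Usage sketch (illustrative, out is an arbitrary uint8_t array of 16 elements): vst1q_u8(out, v);
+//this expands to _mm_store_si128 when out happens to be 16-byte aligned and to _mm_storeu_si128 otherwise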
-void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_u16 STORE_SI128
-void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_u32 STORE_SI128
-void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_u64 STORE_SI128
-void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_s8 STORE_SI128
-void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_s16 STORE_SI128
-void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_s32 STORE_SI128
-void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_s64 STORE_SI128
-void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently
-void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
{
- if( ((unsigned long)(ptr) & 15) == 0 ) //16 bits aligned
+ if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
_mm_store_ps (ptr, val);
else
_mm_storeu_ps (ptr, val);
}
-void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_p8 vst1q_u8
-void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_p16 vst1q_u16
+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
+{
+ int i;
+ for (i = 0; i<8; i++) {
+ *(ptr + i) = ((uint8_t*)&val)[i];
+ }
+ //_mm_storel_epi64((__m128i*)ptr, val);
+ return;
+}
+
+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
+{
+ int i;
+ for (i = 0; i<4; i++) {
+ *(ptr + i) = ((uint16_t*)&val)[i];
+ }
+ //_mm_storel_epi64((__m128i*)ptr, val);
+ return;
+}
+
+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
+{
+ int i;
+ for (i = 0; i<2; i++) {
+ *(ptr + i) = ((uint32_t*)&val)[i];
+ }
+ //_mm_storel_epi64((__m128i*)ptr, val);
+ return;
+}
+
+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
+{
+ *(ptr) = *((uint64_t*)&val);
+ //_mm_storel_epi64((__m128i*)ptr, val);
+ return;
+}
+
+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
+
+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
+
+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
+
+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
+
+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
//current IA SIMD doesn't support float16
+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
+{
+ *(ptr) = val.m64_f32[0];
+ *(ptr + 1) = val.m64_f32[1];
+ return;
+}
+
+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_p8 vst1_u8
+
+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_p16 vst1_u16
+
//***********Store a lane of a vector into memory (extract given lane) *********************
//******************************************************************************************
-void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
-void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
-void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
-void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
-void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
-void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
{
int32_t ilane;
@@ -4725,22 +9660,73 @@ _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x
*(ptr) = *((float*)&ilane);
}
-void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_p8 vst1q_lane_u8
-void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_p16 vst1q_lane_s16
+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
+{
+ *(ptr) = val.m64_u8[lane];
+}
+
+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
+{
+ *(ptr) = val.m64_u16[lane];
+}
+
+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
+{
+ *(ptr) = val.m64_u32[lane];
+}
+
+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
+{
+ *(ptr) = val.m64_u64[0];
+}
+
+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
+
+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
+
+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
+
+
+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
+
+
+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16
+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
+{
+ *(ptr) = val.m64_f32[lane];
+}
+
+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1_lane_p8 vst1_lane_u8
+
+void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_p16 vst1_lane_s16
+
//***********************************************************************************************
//**************** Loads and stores of an N-element structure **********************************
//***********************************************************************************************
//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
//We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions"
//****************** 2 elements load *********************************************
-uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
{
uint8x16x2_t v;
v.val[0] = vld1q_u8(ptr);
@@ -4749,9 +9735,8 @@ _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr)
return v;
}
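vld2q_u8 above loads two full 128-bit vectors and returns the even and odd bytes in separate registers (the rest of its body is elided by this hunk). For reference, a standalone even/odd byte split done directly with SSSE3 (compile with -mssse3); same effect, not necessarily the header's exact instruction sequence:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* Deinterleave 32 bytes of a0,b0,a1,b1,... into 16 a's and 16 b's. */
static void deinterleave2_u8(const uint8_t *ptr, uint8_t *a, uint8_t *b)
{
    const __m128i mask = _mm_setr_epi8(0,2,4,6,8,10,12,14, 1,3,5,7,9,11,13,15);
    __m128i lo = _mm_loadu_si128((const __m128i *)ptr);        /* a0,b0..a7,b7   */
    __m128i hi = _mm_loadu_si128((const __m128i *)(ptr + 16)); /* a8,b8..a15,b15 */
    lo = _mm_shuffle_epi8(lo, mask);   /* a0..a7 | b0..b7   */
    hi = _mm_shuffle_epi8(hi, mask);   /* a8..a15 | b8..b15 */
    _mm_storeu_si128((__m128i *)a, _mm_unpacklo_epi64(lo, hi));
    _mm_storeu_si128((__m128i *)b, _mm_unpackhi_epi64(lo, hi));
}

int main(void)
{
    uint8_t in[32], a[16], b[16];
    for (int i = 0; i < 32; i++) in[i] = (uint8_t)i;
    deinterleave2_u8(in, a, b);
    printf("%d %d %d %d\n", a[0], a[1], b[0], b[1]); /* 0 2 1 3 */
    return 0;
}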
-#if defined(USE_SSSE3)
-uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
{
uint16x8x2_t v;
v.val[0] = vld1q_u16( ptr);
@@ -4759,10 +9744,9 @@ _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr)
v = vuzpq_s16(v.val[0], v.val[1]);
return v;
}
-#endif
-uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
uint32x4x2_t v;
v.val[0] = vld1q_u32 ( ptr);
@@ -4774,19 +9758,18 @@ _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr)
int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
-#if defined(USE_SSSE3)
-int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
-#endif
-int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
-float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
float32x4x2_t v;
v.val[0] = vld1q_f32 (ptr);
@@ -4795,114 +9778,106 @@ _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr
return v;
}
-poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
#define vld2q_p8 vld2q_u8
-#if defined(USE_SSSE3)
-poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_p16 vld2q_u16
-#endif
-#if defined(USE_SSSE3)
-uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
{
uint8x8x2_t v;
_NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
__m128i ld128;
- ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
- v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
+ vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
return v;
}
-#endif
-#if defined(USE_SSSE3)
-uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
{
- uint16x4x2_t v;
+ _NEON2SSE_ALIGN_16 uint16x4x2_t v;
_NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
__m128i ld128;
- ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
- v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
+ ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
+ vst1q_u16((v.val), ld128);
return v;
}
-#endif
-uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
{
- uint32x2x2_t v;
+ _NEON2SSE_ALIGN_16 uint32x2x2_t v;
__m128i ld128;
- ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
- v.val[0] = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
+ ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+ vst1q_u32((v.val), ld128);
return v;
}
-uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
{
uint64x1x2_t v;
- v.val[0] = vld1q_u64(ptr);
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ v.val[0].m64_u64[0] = *(ptr);
+ v.val[1].m64_u64[0] = *(ptr + 1);
return v;
}
-#if defined(USE_SSSE3)
-int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
-int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
-#endif
-int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
-int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
-float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
-float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
{
float32x2x2_t v;
- v.val[0] = vld1q_f32(ptr);
- v.val[0] = _mm_shuffle_ps(v.val[0], v.val[0], _MM_SHUFFLE(3,1, 2, 0));
- v.val[1] = _mm_movehl_ps(v.val[0],v.val[0]);
+ v.val[0].m64_f32[0] = *(ptr);
+ v.val[0].m64_f32[1] = *(ptr + 2);
+ v.val[1].m64_f32[0] = *(ptr + 1);
+ v.val[1].m64_f32[1] = *(ptr + 3);
return v;
}
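With the 64-bit vector types now plain unions, the d-register variant vld2_f32 above degenerates into a strided scalar gather. A tiny standalone equivalent of that gather:

#include <stdio.h>

/* Stride-2 scalar gather: even elements go to a[], odd elements to b[]. */
static void gather2_f32(const float *ptr, float a[2], float b[2])
{
    a[0] = ptr[0];  a[1] = ptr[2];   /* even elements */
    b[0] = ptr[1];  b[1] = ptr[3];   /* odd elements  */
}

int main(void)
{
    float in[4] = { 1.0f, 2.0f, 3.0f, 4.0f }, a[2], b[2];
    gather2_f32(in, a, b);
    printf("%g %g | %g %g\n", a[0], a[1], b[0], b[1]); /* 1 3 | 2 4 */
    return 0;
}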
-#if defined(USE_SSSE3)
-poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_p8 vld2_u8
-poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_p16 vld2_u16
-#endif
//******************** Triplets ***************************************
//*********************************************************************
-#if defined(USE_SSSE3)
-uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
-{ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
- //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
- //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
- //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
+{
+ //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
+ //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
+ //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
+ //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
uint8x16x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
_NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
_NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
_NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
- v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
- v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
- v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
+ v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
+ v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
+ v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
@@ -4913,43 +9888,42 @@ _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr)
tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
- v.val[0] = _mm_or_si128(v.val[0],tmp3) ;//a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
- tmp3 = _mm_slli_si128(tmp0, 5);//0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
+ tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
- v.val[1] = _mm_slli_si128(v.val[1], 5);//0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
- v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
- v.val[1] = _mm_slli_si128(v.val[1],5);//0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
- v.val[1] = _mm_srli_si128(v.val[1], 5);//a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
+ v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
+ v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
+ v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
- tmp3 = _mm_slli_si128(tmp3,11);//0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
- v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
+ tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
- v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
- v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
+ v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
- v.val[2] = _mm_or_si128(v.val[2],tmp0);//a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
return v;
}
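vld3q_u8 first groups each of the three input registers with a byte shuffle and then splices the groups together, using _mm_slli_si128/_mm_srli_si128 to isolate a byte range and _mm_or_si128 to merge it. The splice step in isolation (byte counts chosen purely for illustration):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* Keep the low 11 bytes of x (shift pair clears the top 5),
   then OR in y's low 5 bytes shifted up to positions 11..15. */
static __m128i splice_11_5(__m128i x, __m128i y)
{
    x = _mm_slli_si128(x, 5);   /* drop the top 5 bytes ...        */
    x = _mm_srli_si128(x, 5);   /* ... leaving bytes 0..10, rest 0 */
    y = _mm_slli_si128(y, 11);  /* move y's low 5 bytes to 11..15  */
    return _mm_or_si128(x, y);
}

int main(void)
{
    uint8_t xa[16], ya[16], out[16];
    for (int i = 0; i < 16; i++) { xa[i] = (uint8_t)i; ya[i] = (uint8_t)(100 + i); }
    __m128i r = splice_11_5(_mm_loadu_si128((__m128i *)xa),
                            _mm_loadu_si128((__m128i *)ya));
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d\n", out[10], out[11], out[12]); /* 10 100 101 */
    return 0;
}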
-#endif
-#if defined(USE_SSSE3)
-uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
-{ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
+{
+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
uint16x8x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
_NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
_NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
_NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
- v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
- v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
- v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
+ v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
+ v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
+ v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
@@ -4960,38 +9934,38 @@ _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr)
tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
- v.val[0] = _mm_or_si128(v.val[0],tmp3);//a0,a3,a6,b1,b4,b7,c2,c5
+ v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
- tmp3 = _mm_slli_si128(tmp0, 4);//0,0,a0,a3,a6,a1,a4,a7
+ tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
- v.val[1] = _mm_or_si128(v.val[1],tmp3);//a1,a4,a7,b2,b5,b0,b3,b6,
- v.val[1] = _mm_slli_si128(v.val[1],6);//0,0,0,a1,a4,a7,b2,b5,
- v.val[1] = _mm_srli_si128(v.val[1], 6);//a1,a4,a7,b2,b5,0,0,0,
- tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
- tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
+ v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
+ v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
+ v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
+ tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
+ tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
- v.val[2] = _mm_slli_si128(v.val[2],4);//0,0, b0,b3,b6,0,0,0
- v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0, b0,b3,b6,c1,c4,c7,
+ v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
+ v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
- v.val[2] = _mm_or_si128(v.val[2],tmp0);//a2,a5,b0,b3,b6,c1,c4,c7,
+ v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
return v;
}
-#endif
-uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-{//a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
uint32x4x3_t v;
__m128i tmp0, tmp1,tmp2, tmp3;
- v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
- v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
- v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
+ v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
+ v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
+ v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
@@ -5006,28 +9980,27 @@ _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr)
return v;
}
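As the comment at the top of vld3q_u32 states, the net effect is that each output register gathers every third 32-bit element. A scalar reference of that mapping, handy when checking the shuffle sequence above:

#include <stdint.h>
#include <stdio.h>

/* Scalar reference for the 3-way deinterleave: out[k][i] = ptr[3*i + k]. */
static void deinterleave3_u32(const uint32_t *ptr, uint32_t out[3][4])
{
    for (int i = 0; i < 4; i++)
        for (int k = 0; k < 3; k++)
            out[k][i] = ptr[3 * i + k];
}

int main(void)
{
    uint32_t in[12], out[3][4];
    for (int i = 0; i < 12; i++) in[i] = (uint32_t)i;
    deinterleave3_u32(in, out);
    printf("%u %u %u %u\n", (unsigned)out[0][0], (unsigned)out[0][1],
           (unsigned)out[0][2], (unsigned)out[0][3]); /* 0 3 6 9 */
    return 0;
}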
-#if defined(USE_SSSE3)
-int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
-int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
-#endif
-int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
-float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
-{ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
float32x4x3_t v;
__m128 tmp0, tmp1,tmp2, tmp3;
- v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
- v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
- v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
+ v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
+ v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
+ v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
@@ -5042,162 +10015,171 @@ _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * pt
return v;
}
-#if defined(USE_SSSE3)
-poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
#define vld3q_p8 vld3q_u8
-poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_p16 vld3q_u16
-#endif
-#if defined(USE_SSSE3)
-uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
-{ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
+{
+ //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
uint8x8x3_t v;
- __m128i tmp0, tmp1;
+ __m128i val0, val1, val2, tmp0, tmp1;
_NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
_NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
- v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
-
- tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
- tmp1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
- v.val[0] = _mm_slli_si128(tmp0,10);
- v.val[0] = _mm_srli_si128(v.val[0],10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
- v.val[2] = _mm_slli_si128(tmp1,6);//0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
- v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
-
- v.val[1] = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
- v.val[1] = _mm_srli_si128(v.val[1],11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
- v.val[2] = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
- v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
- v.val[1] = _mm_or_si128(v.val[1],v.val[2]) ;//a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
-
- tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
- v.val[2] = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
- v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c1,c4,c7,
- v.val[2] = _mm_or_si128(tmp0, v.val[2]) ;//a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+ val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
+
+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
+ tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
+ val0 = _mm_slli_si128(tmp0,10);
+ val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
+ val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
+ val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
+ _M64(v.val[0], val0);
+ val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
+ val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
+ val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
+ val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
+ _M64(v.val[1], val1);
+
+ tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
+ val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
+ val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
+ val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+ _M64(v.val[2], val2);
return v;
}
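The rewritten vld3_u8 now fetches the trailing 8 bytes explicitly with _mm_loadl_epi64, staying within the 24 bytes promised by __transfersize(24); the old body appears to have shuffled v.val[2] before ever loading it. A small demonstration of that 8-byte partial load:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* _mm_loadl_epi64 reads only 8 bytes and zero-fills the upper half,
   so bytes 16..23 can be fetched without touching byte 24 and beyond. */
int main(void)
{
    uint8_t buf[24], out[16];
    for (int i = 0; i < 24; i++) buf[i] = (uint8_t)i;
    __m128i tail = _mm_loadl_epi64((const __m128i *)(buf + 16));
    _mm_storeu_si128((__m128i *)out, tail);
    printf("%d %d %d\n", out[0], out[7], out[8]); /* 16 23 0 */
    return 0;
}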
-#endif
-#if defined(USE_SSSE3)
-uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
-{ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
+{
+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
uint16x4x3_t v;
- __m128i tmp0, tmp1;
+ __m128i val0, val1, val2, tmp0, tmp1;
_NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
- v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
-
- tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
- tmp1 = _mm_shufflelo_epi16(v.val[2], 201); //11 00 10 01 : c1, c2, c0, c3,
- v.val[0] = _mm_slli_si128(tmp0,10);
- v.val[0] = _mm_srli_si128(v.val[0],10); //a0, a3, b2, 0,0, 0,0,
- v.val[2] = _mm_slli_si128(tmp1,14);//0,0,0,0,0,0,0,c1
- v.val[2] = _mm_srli_si128(v.val[2],8);//0,0,0,c1,0,0,0,0
- v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0, a3, b2, c1, x,x,x,x
-
- v.val[1] = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
- v.val[1] = _mm_srli_si128(v.val[1],10); //a1, b0, b3, 0,0, 0,0,
- v.val[2] = _mm_srli_si128(tmp1,2);//c2, 0,0,0,0,0,0,0,
- v.val[2] = _mm_slli_si128(v.val[2],6);//0,0,0,c2,0,0,0,0
- v.val[1] = _mm_or_si128(v.val[1],v.val[2]); //a1, b0, b3, c2, x,x,x,x
+ val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
+ val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
+
+ tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
+ tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
+ val0 = _mm_slli_si128(tmp0,10);
+ val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
+ val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
+ val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
+ val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
+ _M64(v.val[0], val0);
+
+ val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
+ val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
+ val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
+ val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
+ val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
+ _M64(v.val[1], val1);
tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
tmp1 = _mm_srli_si128(tmp1,4);
- tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
- v.val[2] = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+ tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
+ val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+ _M64(v.val[2], val2);
return v;
}
-#endif
-uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
-{ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
+{
+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
uint32x2x3_t v;
- v.val[0] = vld1q_u32 (ptr); //a0,a1, b0,b1,
-
- v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
- v.val[2] = _mm_slli_si128(v.val[2], 8); //x, x,c0,c1,
- v.val[1] = _mm_unpackhi_epi32(v.val[0],v.val[2]); //a1,c0, b0, c1
- v.val[2] = _mm_srli_si128(v.val[1], 8); //b0, c1, x, x,
+ __m128i val0, val1, val2;
+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
+ val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
+
+ val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
+ _M64(v.val[0], val0);
+ val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
+ val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
+ _M64(v.val[1], val1);
+ val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
+ _M64(v.val[2], val2);
return v;
}
-uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
uint64x1x3_t v;
- v.val[0] = vld1q_u64 (ptr);
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ v.val[0].m64_u64[0] = *(ptr);
+ v.val[1].m64_u64[0] = *(ptr + 1);
+ v.val[2].m64_u64[0] = *(ptr + 2);
return v;
}
-#if defined(USE_SSSE3)
-int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
-int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
-#endif
-int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
-int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
-float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
-{ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
+{
+ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
float32x2x3_t v;
- v.val[0] = vld1q_f32 (ptr); //a0,a1, b0,b1,
+ v.val[0].m64_f32[0] = *(ptr);
+ v.val[0].m64_f32[1] = *(ptr + 3);
+
+ v.val[1].m64_f32[0] = *(ptr + 1);
+ v.val[1].m64_f32[1] = *(ptr + 4);
- v.val[0] = _mm_shuffle_ps(v.val[0],v.val[0], _MM_SHUFFLE(2,1, 3, 0)); //a0,b1, a1, b0
- v.val[2] = _mm_movelh_ps(v.val[2], v.val[2]); //x, x,c0,c1,
- v.val[1] = _mm_unpackhi_ps(v.val[0],v.val[2]); //a1,c0, b0, c1
- v.val[2] = _mm_movehl_ps(v.val[1], v.val[1]); //b0, c1, x, x,
+ v.val[2].m64_f32[0] = *(ptr + 2);
+ v.val[2].m64_f32[1] = *(ptr + 5);
return v;
}
-#if defined(USE_SSSE3)
-poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_p8 vld3_u8
-poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_p16 vld3_u16
-#endif
//*************** Quadruples load ********************************
//*****************************************************************
-uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
{
uint8x16x4_t v;
- __m128i tmp3, tmp2, tmp1, tmp0;
+ __m128i tmp3, tmp2, tmp1, tmp0;
v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
- v.val[1] = vld1q_u8 ( (ptr + 16));//b0, b1,b2,...b7.... b15
- v.val[2] = vld1q_u8 ( (ptr + 32));//c0, c1,c2,...c7....c15
+ v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
+ v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
- tmp0= _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
- tmp1= _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
- tmp2= _mm_unpackhi_epi8(v.val[0],v.val[1]);//a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
- tmp3= _mm_unpackhi_epi8(v.val[2],v.val[3]);//c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
+ tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
+ tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
+ tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
+ tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
- tmp0 = _mm_unpacklo_epi32(v.val[0] , v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
- tmp1 = _mm_unpackhi_epi32(v.val[0] , v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
- tmp2 = _mm_unpacklo_epi32(v.val[1] , v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
- tmp3 = _mm_unpackhi_epi32(v.val[1] , v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
+ tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
+ tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
+ tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
+ tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
@@ -5206,23 +10188,23 @@ _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr)
return v;
}
-uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
{
uint16x8x4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
- tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
+ tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
- v.val[0]= _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
- v.val[1]= _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
- v.val[2]= _mm_unpackhi_epi16(tmp0,tmp1);//a4,b4, a5,b5, a6,b6, a7,b7
- v.val[3]= _mm_unpackhi_epi16(tmp2,tmp3);//c4,d4, c5,d5, c6,d6, c7,d7
- tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]);//a0,a4, b0,b4, a1,a5, b1,b5
+ v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
+ v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
+ v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
+ v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
+ tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
- tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]);//c2,c6, d2,d6, c3,c7, d3,d7
+ tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
@@ -5230,8 +10212,8 @@ _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr)
return v;
}
-uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
uint32x4x4_t v;
__m128i tmp3, tmp2, tmp1, tmp0;
@@ -5250,20 +10232,20 @@ _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr)
return v;
}
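For 32-bit elements the 4-way deinterleave of vld4q_u32 is simply a 4x4 transpose (its body is largely elided by this hunk). A standalone SSE2 sketch of that transpose, built from the same kind of unpack cascade used for vld4q_u8/u16 above; it is not claimed to be line-for-line what the header does:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* 4x4 transpose of 16 interleaved uint32s: out[k][i] = in[4*i + k]. */
static void deinterleave4_u32(const uint32_t *in, uint32_t out[4][4])
{
    __m128i r0 = _mm_loadu_si128((const __m128i *)(in));      /* a0,b0,c0,d0 */
    __m128i r1 = _mm_loadu_si128((const __m128i *)(in + 4));  /* a1,b1,c1,d1 */
    __m128i r2 = _mm_loadu_si128((const __m128i *)(in + 8));  /* a2,b2,c2,d2 */
    __m128i r3 = _mm_loadu_si128((const __m128i *)(in + 12)); /* a3,b3,c3,d3 */
    __m128i t0 = _mm_unpacklo_epi32(r0, r1);                  /* a0,a1,b0,b1 */
    __m128i t1 = _mm_unpacklo_epi32(r2, r3);                  /* a2,a3,b2,b3 */
    __m128i t2 = _mm_unpackhi_epi32(r0, r1);                  /* c0,c1,d0,d1 */
    __m128i t3 = _mm_unpackhi_epi32(r2, r3);                  /* c2,c3,d2,d3 */
    _mm_storeu_si128((__m128i *)out[0], _mm_unpacklo_epi64(t0, t1)); /* a0..a3 */
    _mm_storeu_si128((__m128i *)out[1], _mm_unpackhi_epi64(t0, t1)); /* b0..b3 */
    _mm_storeu_si128((__m128i *)out[2], _mm_unpacklo_epi64(t2, t3)); /* c0..c3 */
    _mm_storeu_si128((__m128i *)out[3], _mm_unpackhi_epi64(t2, t3)); /* d0..d3 */
}

int main(void)
{
    uint32_t in[16], out[4][4];
    for (int i = 0; i < 16; i++) in[i] = (uint32_t)i;
    deinterleave4_u32(in, out);
    printf("%u %u %u %u\n", (unsigned)out[0][0], (unsigned)out[0][1],
           (unsigned)out[1][0], (unsigned)out[3][3]); /* 0 4 1 15 */
    return 0;
}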
-int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
-int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
-int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
-float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
float32x4x4_t v;
__m128 tmp3, tmp2, tmp1, tmp0;
@@ -5283,346 +10265,377 @@ _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * pt
return v;
}
-poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_p8 vld4q_u8
-poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_p16 vld4q_s16
-#if defined(USE_SSSE3)
-uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
{
uint8x8x4_t v;
__m128i sh0, sh1;
+ __m128i val0, val2;
_NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
- v.val[0] = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
- v.val[1] = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3]
-
- sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_8);
- sh1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask4_8);
- v.val[0] = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
- v.val[2] = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
- v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
- v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
+ val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
+ val2 = vld1q_u8(( ptr + 16)); //load third and fourth 64-bits in val[2], val[3]
+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
+ vst1q_u8(&v.val[0], val0 );
+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
+ vst1q_u8(&v.val[2], val2 );
return v;
}
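vld4_u8 groups every fourth byte of each 16-byte load with a single shuffle mask, interleaves 32-bit blocks of the two results, and writes the two 128-bit registers across the four 64-bit val[] members. A standalone sketch of the same scheme (SSSE3, compile with -mssse3):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3 */

/* Stride-4 byte deinterleave of 32 bytes (e.g. RGBA) into four 8-byte channels. */
static void deinterleave4_u8(const uint8_t *ptr, uint8_t out[4][8])
{
    const __m128i mask = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    __m128i lo = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)ptr), mask);
    __m128i hi = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(ptr + 16)), mask);
    _mm_storeu_si128((__m128i *)out[0], _mm_unpacklo_epi32(lo, hi)); /* ch0 | ch1 */
    _mm_storeu_si128((__m128i *)out[2], _mm_unpackhi_epi32(lo, hi)); /* ch2 | ch3 */
}

int main(void)
{
    uint8_t in[32], out[4][8];
    for (int i = 0; i < 32; i++) in[i] = (uint8_t)i;
    deinterleave4_u8(in, out);
    printf("%d %d | %d %d\n", out[0][0], out[0][1], out[1][0], out[3][7]); /* 0 4 | 1 31 */
    return 0;
}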
-#endif
-#if defined(USE_SSSE3)
-uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
{
uint16x4x4_t v;
__m128i sh0, sh1;
- _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
- v.val[0] = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
- v.val[2] = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3]
- sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_16);
- sh1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask4_16);
- v.val[0] = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
- v.val[2] = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
- v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
- v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
+ __m128i val0, val2;
+ _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
+ val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
+ val2 = vld1q_u16 ( (ptr + 8)); //load third and fourth 64-bits in val[2], val[3]
+ sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
+ val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
+ vst1q_u16(&v.val[0], val0 );
+ val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
+ vst1q_u16(&v.val[2], val2 );
return v;
}
-#endif
-uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
-{ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
- uint32x4x4_t v, res;
- v.val[0] = vld1q_u32 (ptr); //a0,a1, b0,b1,
- v.val[2] = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
- res.val[0] = _mm_unpacklo_epi32(v.val[0],v.val[2]); //a0, c0, a1,c1,
- res.val[2] = _mm_unpackhi_epi32(v.val[0],v.val[2]); //b0,d0, b1, d1
- res.val[1] = _mm_shuffle_epi32(res.val[0],_SWAP_HI_LOW32); //a1,c1, a0, c0,
- res.val[3] = _mm_shuffle_epi32(res.val[2],_SWAP_HI_LOW32);//b1, d1,b0,d0,
- return res;
+{
+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+ uint32x2x4_t v;
+ __m128i val0, val01, val2;
+ val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
+ val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
+ val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
+ val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
+ vst1q_u32(&v.val[0], val01);
+ vst1q_u32(&v.val[2], val2 );
+ return v;
}
-uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
uint64x1x4_t v;
- v.val[0] = vld1q_u64( (ptr)); //load first 64-bits in val[0] and val[1]
- v.val[2] = vld1q_u64( (ptr + 2)); //load third and forth 64-bits in val[2], val[3]
+ v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit value into val[0]
+ v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit value into val[1]
+ v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit value into val[2]
+ v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit value into val[3]
return v;
}
-#if defined(USE_SSSE3)
-int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
-int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
-#endif
-int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
-int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
-float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
-{ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
- float32x2x4_t v, res;
- v.val[0] = vld1q_f32 ((float*) ptr); //a0,a1, b0,b1,
- v.val[2] = vld1q_f32 ((float*) (ptr + 4)); //c0,c1, d0,d1
- res.val[0] = _mm_unpacklo_ps(v.val[0],v.val[2]); //a0, c0, a1,c1,
- res.val[2] = _mm_unpackhi_ps(v.val[0],v.val[2]); //b0,d0, b1, d1
- res.val[1] = _mm_movehl_ps(res.val[0],res.val[0]); // a1,c1, a0, c0,
- res.val[3] = _mm_movehl_ps(res.val[2],res.val[2]); // b1, d1, b0,d0,
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
+{
+ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+ float32x2x4_t res;
+ res.val[0].m64_f32[0] = *(ptr);
+ res.val[0].m64_f32[1] = *(ptr + 4);
+ res.val[1].m64_f32[0] = *(ptr + 1);
+ res.val[1].m64_f32[1] = *(ptr + 5);
+ res.val[2].m64_f32[0] = *(ptr + 2);
+ res.val[2].m64_f32[1] = *(ptr + 6);
+ res.val[3].m64_f32[0] = *(ptr + 3);
+ res.val[3].m64_f32[1] = *(ptr + 7);
return res;
}
-#if defined(USE_SSSE3)
-poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_p8 vld4_u8
-poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_p16 vld4_u16
-#endif
//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
//*******************************************************************************************************************
-uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
{
uint8x8x2_t v;
- v.val[0] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
- v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
- v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
- v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ __m128i val0, val1;
+ val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+ vst1q_u8(v.val, val0);
return v;
}
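vld2_dup_u8 broadcasts ptr[0] and ptr[1] across the two result vectors with a chain of widening unpacks. The same chain in isolation (using an 8-byte load here so the sketch only needs 8 readable bytes, unlike the header's full 16-byte LOAD_SI128):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* Replicate ptr[0] into the low eight lanes and ptr[1] into the high eight. */
static __m128i dup2_u8(const uint8_t *ptr)
{
    __m128i v = _mm_loadl_epi64((const __m128i *)ptr); /* 0,1,x,x,...          */
    v = _mm_unpacklo_epi8(v, v);     /* 0,0,1,1,...                            */
    v = _mm_unpacklo_epi16(v, v);    /* 0,0,0,0, 1,1,1,1,...                   */
    return _mm_unpacklo_epi32(v, v); /* 0 x8, then 1 x8                        */
}

int main(void)
{
    uint8_t in[8] = { 7, 9 };
    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, dup2_u8(in));
    printf("%d %d %d %d\n", out[0], out[7], out[8], out[15]); /* 7 7 9 9 */
    return 0;
}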
-uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
{
uint16x4x2_t v;
- v.val[1] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
- v.val[0] = _mm_shufflelo_epi16(v.val[1], 0); //00 00 00 00 (all 0)
- v.val[1] = _mm_shufflelo_epi16(v.val[1], 85);//01 01 01 01 (all 1)
+ __m128i val0, val1;
+ val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
+ val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
+ _M64(v.val[0], val0);
+ val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
+ _M64(v.val[1], val1);
return v;
}
-uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
uint32x2x2_t v;
- v.val[0] = LOAD_SI128(ptr); //0,1,x,x
- v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
- v.val[1] = _mm_srli_si128(v.val[0], 8); //1,1,0x0,0x0
+ __m128i val0;
+ val0 = LOAD_SI128(ptr); //0,1,x,x
+ val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
+ vst1q_u32(v.val, val0);
return v;
}
-uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_u64 vld2_u64
-int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
-int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
-int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
-int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
-float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
float32x2x2_t v;
- v.val[0] = vld1q_f32(ptr); //0,1,x,x
- v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
- v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,x,x
+ v.val[0].m64_f32[0] = *(ptr); //0,0
+ v.val[0].m64_f32[1] = *(ptr); //0,0
+ v.val[1].m64_f32[0] = *(ptr + 1); //1,1
+ v.val[1].m64_f32[1] = *(ptr + 1); //1,1
return v;
}
-poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_p8 vld2_dup_u8
-poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_p16 vld2_dup_s16
//************* Duplicate (or propagate) triplets: *******************
//********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
-uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
{
uint8x8x3_t v;
- v.val[0] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
- v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
- v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
- v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
- v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+ __m128i val0, val1, val2;
+ val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
+ vst1q_u8(v.val, val0);
+ _M64(v.val[2], val2);
return v;
}
-uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
{
uint16x4x3_t v;
- v.val[2] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
- v.val[0] = _mm_shufflelo_epi16(v.val[2], 0); //00 00 00 00 (all 0)
- v.val[1] = _mm_shufflelo_epi16(v.val[2], 85);//01 01 01 01 (all 1)
- v.val[2] = _mm_shufflelo_epi16(v.val[2], 170);//10 10 10 10 (all 2)
+ __m128i val0, val1, val2;
+ val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
+ val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
+ val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
+ val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
+ _M64(v.val[0], val0);
+ _M64(v.val[1], val1);
+ _M64(v.val[2], val2);
return v;
}
-uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
uint32x2x3_t v;
- v.val[2] = LOAD_SI128(ptr); //0,1,2,x
- v.val[0] = _mm_shuffle_epi32(v.val[2], 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
- v.val[1] = _mm_shuffle_epi32(v.val[2], 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
- v.val[2] = _mm_srli_si128(v.val[0], 8); //2,2,0x0,0x0
+ __m128i val0, val1, val2;
+ val2 = LOAD_SI128(ptr); //0,1,2,x
+ val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
+ val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
+ val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
+ _M64(v.val[0], val0);
+ _M64(v.val[1], val1);
+ _M64(v.val[2], val2);
return v;
}
-uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
uint64x1x3_t v;
- v.val[0] = LOAD_SI128(ptr);//0,1,
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
- v.val[2] = LOAD_SI128((ptr + 2)); //2,x
+ v.val[0].m64_u64[0] = *(ptr);
+ v.val[1].m64_u64[0] = *(ptr + 1);
+ v.val[2].m64_u64[0] = *(ptr + 2);
return v;
}
-int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
-int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
-int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
-int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
-float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
float32x2x3_t v;
- v.val[0] = vld1q_f32(ptr); //0,1,2,x
- v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
- v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
- v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
+ int i;
+ for (i = 0; i<3; i++) {
+ v.val[i].m64_f32[0] = *(ptr + i);
+ v.val[i].m64_f32[1] = *(ptr + i);
+ }
return v;
}
-poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p8 vld3_dup_u8
-poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p16 vld3_dup_s16
+
//************* Duplicate (or propagate) quadruples: *******************
//***********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
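// Hedged sketch for the quadruple-duplicate loads (illustration only; invented
// names, shim assumed reachable as <arm_neon.h>).  As with vld3_dup, the
// emulation reads a full 128-bit vector, hence the padded buffer.
#include <arm_neon.h>
#include <stdint.h>
static void example_vld4_dup_u16(void)
{
    uint16_t q[8] = {1, 2, 3, 4};     /* only q[0..3] are meaningful */
    uint16x4x4_t v = vld4_dup_u16(q); /* val[0]={1,1,1,1} ... val[3]={4,4,4,4} */
    (void)v;
}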
-uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
{
uint8x8x4_t v;
- v.val[0] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
- v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
- v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
- v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
- v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
- v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32);
+ __m128i val0, val1, val2;
+ val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
+ val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
+ val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
+ val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+ val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
+ vst1q_u8(&v.val[0], val0);
+ vst1q_u8(&v.val[2], val2);
return v;
}
-uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
{
uint16x4x4_t v;
- v.val[3] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
- v.val[0] = _mm_shufflelo_epi16(v.val[3], 0); //00 00 00 00 (all 0)
- v.val[1] = _mm_shufflelo_epi16(v.val[3], 85);//01 01 01 01 (all 1)
- v.val[2] = _mm_shufflelo_epi16(v.val[3], 170);//10 10 10 10 (all 2)
- v.val[3] = _mm_shufflelo_epi16(v.val[3], 255);//11 11 11 11 (all 3)
+ __m128i val0, val1, val2, val3;
+ val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
+ val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
+ val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
+ val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
+ val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
+ _M64(v.val[0], val0);
+ _M64(v.val[1], val1);
+ _M64(v.val[2], val2);
+ _M64(v.val[3], val3);
return v;
}
-uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
uint32x2x4_t v;
- v.val[3] = LOAD_SI128(ptr) ; //0,1,2,3
- v.val[0] = _mm_shuffle_epi32(v.val[3], 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
- v.val[1] = _mm_shuffle_epi32(v.val[3], 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
- v.val[2] = _mm_shuffle_epi32(v.val[3], 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
- v.val[3] = _mm_shuffle_epi32(v.val[3], 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+ __m128i val0, val1, val2, val3;
+ val3 = LOAD_SI128(ptr); //0,1,2,3
+ val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
+ val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
+ val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
+ val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+ _M64(v.val[0], val0);
+ _M64(v.val[1], val1);
+ _M64(v.val[2], val2);
+ _M64(v.val[3], val3);
return v;
}
-uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
uint64x1x4_t v;
- v.val[0] = LOAD_SI128(ptr); //0,1,
- v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
- v.val[2] = LOAD_SI128((ptr + 2)); //2,3
- v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); //3,2
+ v.val[0].m64_u64[0] = *(ptr);
+ v.val[1].m64_u64[0] = *(ptr + 1);
+ v.val[2].m64_u64[0] = *(ptr + 2);
+ v.val[3].m64_u64[0] = *(ptr + 3);
return v;
}
-int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
-int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
-int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
-int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
-float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
-float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
float32x2x4_t v;
- v.val[0] = vld1q_f32(ptr); //0,1,2,3
- v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,3,3
- v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
- v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
- v.val[3] = _mm_movehl_ps(v.val[1], v.val[1]); //3,3,1,1,
+ int i;
+ for (i = 0; i<4; i++) {
+ v.val[i].m64_f32[0] = *(ptr + i);
+ v.val[i].m64_f32[1] = *(ptr + i);
+ }
return v;
}
-poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p8 vld4_dup_u8
-poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p16 vld4_dup_u16
+
//**********************************************************************************
//******************* Lane loads for N-element structures ***********************
//**********************************************************************************
@@ -5634,7 +10647,7 @@ poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VL
//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined
//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
{
uint16x8x2_t v;
v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
@@ -5644,7 +10657,7 @@ _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t cons
#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
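// Sketch of how the *_ptr worker plus macro redefinition above is meant to be
// used (illustration only; invented names, shim assumed to be <arm_neon.h>).
// The caller keeps the original NEON by-value interface, and the macro passes
// the address of the struct to the worker.
#include <arm_neon.h>
#include <stdint.h>
static uint16x8x2_t example_vld2q_lane_u16(const uint16_t *buf) /* >= 2 elements */
{
    uint16x8x2_t acc;
    acc.val[0] = vdupq_n_u16(0);
    acc.val[1] = vdupq_n_u16(0);
    /* expands to vld2q_lane_u16_ptr(buf, &acc, 3):
       buf[0] -> lane 3 of val[0], buf[1] -> lane 3 of val[1] */
    return vld2q_lane_u16(buf, acc, 3);
}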
//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
uint32x4x2_t v;
v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
@@ -5676,59 +10689,80 @@ _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const
//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16
-//float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
float32x4x2_t v;
v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
return v;
}
-#define vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32_ptr(ptr, &src, lane)
+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
#define vld2q_lane_p16 vld2q_lane_u16
//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
{
- uint8x8x2_t val;
- val.val[0] = _MM_INSERT_EPI8 (src->val[0], (int)ptr[0], lane);
- val.val[1] = _MM_INSERT_EPI8 (src->val[1], (int)ptr[1], lane);
- return val;
+ uint8x8x2_t v;
+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+ return v;
}
#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-#define vld2_lane_u16 vld2q_lane_u16
+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
+{
+ uint16x4x2_t v;
+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
+ return v;
+}
+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-#define vld2_lane_u32 vld2q_lane_u32
+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
+{
+ uint32x2x2_t v;
+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
+ return v;
+}
+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16
-float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
-#define vld2_lane_f32 vld2q_lane_f32
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
+{
+ float32x2x2_t v;
+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+ return v;
+}
+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_p8 vld2_lane_u8
//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_p16 vld2_lane_u16
//*********** Lane triplets **********************
@@ -5737,7 +10771,7 @@ poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x
//we assume src is 16 bit aligned
//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
uint16x8x3_t v;
v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
@@ -5748,7 +10782,7 @@ _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t cons
#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
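// Brief lane-triplet sketch (illustration only, invented names): under the
// emulation the three source elements are read as scalars and inserted into
// the requested lane of each of the three vectors.
#include <arm_neon.h>
#include <stdint.h>
static uint16x8x3_t example_vld3q_lane_u16(void)
{
    uint16_t three[3] = {7, 8, 9};
    uint16x8x3_t acc;
    acc.val[0] = vdupq_n_u16(0);
    acc.val[1] = vdupq_n_u16(0);
    acc.val[2] = vdupq_n_u16(0);
    return vld3q_lane_u16(three, acc, 5); /* lane 5 of val[0..2] becomes 7, 8, 9 */
}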
//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
uint32x4x3_t v;
v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
@@ -5759,7 +10793,7 @@ _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t cons
#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
int16x8x3_t v;
v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
@@ -5770,7 +10804,7 @@ _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const
#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
int32x4x3_t v;
v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
@@ -5780,12 +10814,13 @@ _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const
}
#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
-float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
//current IA SIMD doesn't support float16
#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
+
//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
float32x4x3_t v;
v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
@@ -5793,64 +10828,67 @@ _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t co
v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
return v;
}
-#define vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32_ptr(ptr, &src, lane)
+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
-poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
#define vld3q_lane_p16 vld3q_lane_u16
//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
{
uint8x8x3_t v;
- v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane);
- v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane);
+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
return v;
}
#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
{
uint16x4x3_t v;
- v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane);
- v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane);
+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
return v;
}
#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-{ //need to merge into 128 bit anyway
+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{
+ //need to merge into 128 bit anyway
uint32x2x3_t v;
- v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
- v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane);
+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
return v;
}
#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
-int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
-int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
-int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
-float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16
//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
float32x2x3_t v;
- v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
return v;
}
-#define vld3_lane_f32(ptr, src, lane) vld3_lane_f32_ptr(ptr, &src, lane)
+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p8 vld3_lane_u8
@@ -5888,15 +10926,15 @@ _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t cons
#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
//current IA SIMD doesn't support float16
//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
@@ -5909,20 +10947,20 @@ _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t co
v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
return v;
}
-#define vld4q_lane_f32(ptr, src, lane) vld4q_lane_f32_ptr(ptr, &src, lane)
+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_p16 vld4q_lane_u16
//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
{
uint8x8x4_t v;
- v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane );
- v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane );
- v.val[3] = _MM_INSERT_EPI8 (src->val[3], ptr[3], lane );
+ v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
+ v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
return v;
}
#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
@@ -5931,10 +10969,10 @@ _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const *
_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
{
uint16x4x4_t v;
- v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane );
- v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane );
- v.val[3] = _MM_INSERT_EPI16 (src->val[3], ptr[3], lane );
+ v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
+ v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
return v;
}
#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
@@ -5943,10 +10981,10 @@ _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const
_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
{
uint32x2x4_t v;
- v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
- v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane );
- v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane );
- v.val[3] = _MM_INSERT_EPI32 (src->val[3], ptr[3], lane );
+ v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
+ v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
return v;
}
#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
@@ -5969,11 +11007,16 @@ float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x
//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
-{ //serial solution may be faster
+{
+ //serial solution may be faster
float32x2x4_t v;
+ v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+ v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+ v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
+ v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
return v;
}
-#define vld4_lane_f32(ptr, src, lane) vld4_lane_f32_ptr(ptr, &src, lane)
+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
@@ -6058,37 +11101,37 @@ void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
{
- uint8x8x2_t v;
- v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
- vst1q_u8 (ptr, v.val[0]);
+ __m128i v0;
+ v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
+ vst1q_u8 (ptr, v0);
}
#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
{
- uint16x4x2_t v;
- v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
- vst1q_u16 (ptr, v.val[0]);
+ __m128i v0;
+ v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
+ vst1q_u16 (ptr, v0);
}
#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
{
- uint32x2x2_t v;
- v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
- vst1q_u32 (ptr, v.val[0]);
+ __m128i v0;
+ v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
+ vst1q_u32 (ptr, v0);
}
#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
+
//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
{
- uint64x1x2_t v;
- v.val[0] = _mm_unpacklo_epi64(val->val[0], val->val[1]);
- vst1q_u64(ptr, v.val[0]);
+ *(ptr) = val->val[0].m64_u64[0];
+ *(ptr + 1) = val->val[1].m64_u64[0];
}
#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
@@ -6107,12 +11150,13 @@ _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_
//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
//current IA SIMD doesn't support float16
-void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
{
- float32x4x2_t v;
- v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
- vst1q_f32 (ptr, v.val[0]);
+ *(ptr) = val->val[0].m64_f32[0];
+ *(ptr + 1) = val->val[1].m64_f32[0];
+ *(ptr + 2) = val->val[0].m64_f32[1];
+ *(ptr + 3) = val->val[1].m64_f32[1];
}
#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
@@ -6125,7 +11169,6 @@ _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2
//******************** Triplets store *****************************************
//******************************************************************************
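// Hedged sketch of an interleaving store from this family (illustration only;
// invented names, shim assumed reachable as <arm_neon.h>).  Three 8-lane
// planes are written back as 24 interleaved bytes R,G,B,R,G,B,...
#include <arm_neon.h>
#include <stdint.h>
static void example_vst3_u8(uint8_t *out) /* out must hold at least 24 bytes */
{
    uint8x8x3_t rgb;
    rgb.val[0] = vdup_n_u8(0x11); /* R plane */
    rgb.val[1] = vdup_n_u8(0x22); /* G plane */
    rgb.val[2] = vdup_n_u8(0x33); /* B plane */
    vst3_u8(out, rgb);            /* out = 11,22,33, 11,22,33, ... (8 pixels) */
}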
//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
-#if defined(USE_SSSE3)
_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
{
uint8x16x3_t v;
@@ -6137,30 +11180,28 @@ _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_
_NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
_NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
- v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
- v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
- v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
- v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
- cff = _mm_cmpeq_epi8(v0, v0); //all ff
+ v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
+ v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
+ v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+ cff = _mm_cmpeq_epi8(v0, v0); //all ff
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
vst1q_u8(ptr, v.val[0]);
- v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
vst1q_u8((ptr + 16), v.val[1]);
- v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
vst1q_u8((ptr + 32), v.val[2]);
}
#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
-#endif
-#if defined(USE_SSSE3)
//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
{
@@ -6173,32 +11214,32 @@ _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x
_NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
_NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
- v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
- v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
- v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
- v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
- cff = _mm_cmpeq_epi16(v0, v0); //all ff
+ v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
+ v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
+ v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
+ v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+ cff = _mm_cmpeq_epi16(v0, v0); //all ff
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
vst1q_u16(ptr, v.val[0]);
- v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
+ v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
vst1q_u16((ptr + 8), v.val[1]);
- v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
- v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
+ v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+ v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
vst1q_u16((ptr + 16), v.val[2]);
}
#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
-#endif
//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
-{ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
+{
+ //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
uint32x4x3_t v;
__m128i tmp0, tmp1,tmp2;
tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
@@ -6216,7 +11257,6 @@ _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x
}
#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
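// Worked illustration of the a0,b0,c0,a1,... interleaving described above
// (sketch only; invented names, shim assumed to be <arm_neon.h>).
#include <arm_neon.h>
#include <stdint.h>
static void example_vst3q_u32(uint32_t *out) /* out must hold 12 elements */
{
    _NEON2SSE_ALIGN_16 uint32_t a[4] = {0, 3, 6, 9};
    _NEON2SSE_ALIGN_16 uint32_t b[4] = {1, 4, 7, 10};
    _NEON2SSE_ALIGN_16 uint32_t c[4] = {2, 5, 8, 11};
    uint32x4x3_t v;
    v.val[0] = vld1q_u32(a);
    v.val[1] = vld1q_u32(b);
    v.val[2] = vld1q_u32(c);
    vst3q_u32(out, v); /* out = 0,1,2,3,4,5,6,7,8,9,10,11 */
}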
-#if defined(USE_SSSE3)
//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
@@ -6224,7 +11264,6 @@ void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
-#endif
//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
@@ -6237,7 +11276,7 @@ void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
{
- float32x4x3_t v;
+ float32x4x3_t v;
__m128 tmp0, tmp1,tmp2;
tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
@@ -6254,7 +11293,6 @@ _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x
}
#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
-#if defined(USE_SSSE3)
//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
#define vst3q_p8 vst3q_u8
@@ -6262,81 +11300,78 @@ void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
#define vst3q_p16 vst3q_u16
-#endif
//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
-#if defined(USE_SSSE3)
_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
{
- uint8x8x3_t v;
- __m128i tmp, sh0, sh1;
+ __m128i tmp, sh0, sh1, val0, val2;
_NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
_NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
_NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
_NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
- tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
- sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
- sh1 = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
- v.val[0] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
- vst1q_u8(ptr, v.val[0]); //store as 128 bit structure
- sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
- sh1 = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
- v.val[1] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
+ val2 = _pM128i(val->val[2]);
+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+ val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
+ vst1q_u8(ptr, val0); //store as 128 bit structure
+ sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
+ sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+ val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+ _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
}
#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
-#endif
//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
-#if defined(USE_SSSE3)
_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
{
- uint16x4x3_t v;
- __m128i tmp;
+ __m128i tmp, val0, val1, val2;
_NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
_NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
- _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
- _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
- tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
- v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
- v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
- v.val[0] = _MM_BLENDV_EPI8(v.val[1], v.val[0], *(__m128i*)mask0f);
- vst1q_u16(ptr, v.val[0]); //store as 128 bit structure
- v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
- v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
- v.val[1] = _MM_BLENDV_EPI8(v.val[0], v.val[1], *(__m128i*)mask1f); //change the operands order
+ _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from val0, otherwise from val1
+ _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from val1, otherwise from val0
+ tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
+ val2 = _pM128i(val->val[2]);
+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+ val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
+ vst1q_u16(ptr, val0); //store as 128 bit structure
+ val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
+ val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+ val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
+ _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
}
#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
-#endif
//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
-{ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
- uint32x2x3_t res;
- res.val[0] = _mm_unpacklo_epi64(val->val[1], val->val[2]); //val[0]: 1,4,2,5
- res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
- res.val[1] = _mm_srli_si128(res.val[0], 8); //4,5, x,x
- res.val[0] = _mm_unpacklo_epi32(val->val[0], res.val[0]); //0,1,3,2
- res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
- vst1q_u32(ptr, res.val[0]); //store as 128 bit structure
+{
+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
+ __m128i val0, val1;
+ val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
+ val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
+ val1 = _mm_srli_si128(val0, 8); //4,5, x,x
+ _M64((*(__m64_128*)(ptr + 4)), val1);
+ val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
+ val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
+ vst1q_u32(ptr, val0); //store as 128 bit structure
}
#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
{
- __m128i tmp;
- tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
- vst1q_u64(ptr, tmp); //store as 128 bit structure
+ *(ptr) = val->val[0].m64_u64[0];
+ *(ptr + 1) = val->val[1].m64_u64[0];
+ *(ptr + 2) = val->val[2].m64_u64[0];
}
#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
-#if defined(USE_SSSE3)
//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0]
#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0]
#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
-#endif
//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
@@ -6345,23 +11380,22 @@ _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_
#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
-{ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
- float32x2x3_t res;
- res.val[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(val->val[1]), _mm_castps_si128(val->val[2])) );
- res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(3,1,2,0)); //1,2,4,5
- res.val[1] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(1,0,3,2)); //4,5, 1,2
- res.val[0] = _mm_unpacklo_ps(val->val[0], res.val[0]); //0,1,3, 2
- res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(2,3,1,0)); //0,1,2, 3
- vst1q_f32(ptr, res.val[0]); //store as 128 bit structure
+{
+ //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5
+ *(ptr) = val->val[0].m64_f32[0];
+ *(ptr + 1) = val->val[1].m64_f32[0];
+ *(ptr + 2) = val->val[2].m64_f32[0];
+ *(ptr + 3) = val->val[0].m64_f32[1];
+ *(ptr + 4) = val->val[1].m64_f32[1];
+ *(ptr + 5) = val->val[2].m64_f32[1];
}
#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
-#if defined(USE_SSSE3)
//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
#define vst3_p8 vst3_u8
@@ -6369,7 +11403,6 @@ void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
#define vst3_p16 vst3_s16
-#endif
//*************** Quadruples store ********************************
//*********************************************************************
@@ -6377,17 +11410,17 @@ void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
{
__m128i tmp1, tmp2, res;
- tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
- tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
- res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
+ tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
+ tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
+ res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
vst1q_u8(ptr, res);
- res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
+ res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
vst1q_u8((ptr + 16), res);
- tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
- tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
- res = _mm_unpacklo_epi16(tmp1, tmp2); //
+ tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
+ tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
+ res = _mm_unpacklo_epi16(tmp1, tmp2); //
vst1q_u8((ptr + 32), res);
- res = _mm_unpackhi_epi16(tmp1, tmp2); //
+ res = _mm_unpackhi_epi16(tmp1, tmp2); //
vst1q_u8((ptr + 48), res);
}
#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
@@ -6397,12 +11430,12 @@ _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x
{
uint16x8x4_t v;
__m128i tmp1, tmp2;
- tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
- tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+ tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+ tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
- tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
- tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+ tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+ tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
vst1q_u16(ptr, v.val[0]);
@@ -6417,12 +11450,12 @@ _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x
{
uint16x8x4_t v;
__m128i tmp1, tmp2;
- tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
- tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+ tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+ tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
- tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
- tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+ tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+ tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
vst1q_u32(ptr, v.val[0]);
@@ -6479,50 +11512,50 @@ void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
{
- uint8x8x4_t v;
- __m128i sh0, sh1;
- sh0 = _mm_unpacklo_epi8(val->val[0],val->val[1]); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
- sh1 = _mm_unpacklo_epi8(val->val[2],val->val[3]); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
- v.val[0] = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
- v.val[2] = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
- vst1q_u8(ptr, v.val[0]);
- vst1q_u8((ptr + 16), v.val[2]);
+ __m128i sh0, sh1, val0, val2;
+ sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
+ sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
+ val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
+ val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
+ vst1q_u8(ptr, val0);
+ vst1q_u8((ptr + 16), val2);
}
#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
{
- uint16x4x4_t v;
- __m128i sh0, sh1;
- sh0 = _mm_unpacklo_epi16(val->val[0],val->val[1]); //a0,a1,b0,b1,c0,c1,d0,d1,
- sh1 = _mm_unpacklo_epi16(val->val[2],val->val[3]); //a2,a3,b2,b3,c2,c3,d2,d3
- v.val[0] = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
- v.val[2] = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
- vst1q_u16(ptr, v.val[0]); //store as 128 bit structure
- vst1q_u16((ptr + 8), v.val[2]);
+ __m128i sh0, sh1, val0, val2;
+ sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
+ sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
+ val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
+ val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
+ vst1q_u16(ptr, val0); //store as 128 bit structure
+ vst1q_u16((ptr + 8), val2);
}
#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
-{ //0,4, 1,5, 2,6, 3,7
- uint32x2x4_t v;
- __m128i sh0, sh1;
- sh0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1,4,5
- sh1 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3,6,7
- v.val[0] = _mm_unpacklo_epi64(sh0,sh1); //
- v.val[1] = _mm_unpackhi_epi64(sh0,sh1); //
- vst1q_u32(ptr, v.val[0]); //store as 128 bit structure
- vst1q_u32((ptr + 4), v.val[1]);
+{
+ //0,4, 1,5, 2,6, 3,7
+ __m128i sh0, sh1, val0, val1;
+ sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
+ sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
+ val0 = _mm_unpacklo_epi64(sh0,sh1); //
+ val1 = _mm_unpackhi_epi64(sh0,sh1); //
+ vst1q_u32(ptr, val0); //store as 128 bit structure
+ vst1q_u32((ptr + 4), val1);
}
#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
{
- vst1q_u64(ptr, val->val[0]);
- vst1q_u64((ptr + 2), val->val[2]);
+ *(ptr) = val->val[0].m64_u64[0];
+ *(ptr + 1) = val->val[1].m64_u64[0];
+ *(ptr + 2) = val->val[2].m64_u64[0];
+ *(ptr + 3) = val->val[3].m64_u64[0];
}
#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
@@ -6545,14 +11578,16 @@ void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
-{ //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
- float32x2x4_t v;
- v.val[0] = _mm_unpacklo_ps(val->val[0],val->val[1]);
- v.val[2] = _mm_unpacklo_ps(val->val[2],val->val[3]);
- v.val[1] = _mm_movelh_ps (v.val[0], v.val[2]); //a0, c0, a1,c1,
- v.val[3] = _mm_movehl_ps (v.val[2],v.val[0]); //b0,d0, b1, d1
- vst1q_f32(ptr, v.val[1]); //store as 128 bit structure
- vst1q_f32((ptr + 4), v.val[3]);
+{
+ //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
+ *(ptr) = val->val[0].m64_f32[0];
+ *(ptr + 1) = val->val[1].m64_f32[0];
+ *(ptr + 2) = val->val[2].m64_f32[0];
+ *(ptr + 3) = val->val[3].m64_f32[0];
+ *(ptr + 4) = val->val[0].m64_f32[1];
+ *(ptr + 5) = val->val[1].m64_f32[1];
+ *(ptr + 6) = val->val[2].m64_f32[1];
+ *(ptr + 7) = val->val[3].m64_f32[1];
}
#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
@@ -6600,19 +11635,38 @@ _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float
vst1q_lane_f32(ptr, val->val[0], lane);
vst1q_lane_f32((ptr + 1), val->val[1], lane);
}
-#define vst2q_lane_f32(ptr, val, lane) vst2q_lane_f32_ptr(ptr, &val, lane)
+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
#define vst2q_lane_p16 vst2q_lane_s16
+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
+{
+ *(ptr) = val->val[0].m64_u8[lane];
+ *(ptr + 1) = val->val[1].m64_u8[lane];
+}
+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
+
//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
-#define vst2_lane_u16 vst2q_lane_u16
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
+{
+ *(ptr) = val->val[0].m64_u16[lane];
+ *(ptr + 1) = val->val[1].m64_u16[lane];
+}
+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-#define vst2_lane_u32 vst2q_lane_u32
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
+{
+ *(ptr) = val->val[0].m64_u32[lane];
+ *(ptr + 1) = val->val[1].m64_u32[lane];
+}
+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
@@ -6620,17 +11674,22 @@ void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constr
//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
-#define vst2_lane_s16 vst2q_lane_s16
+#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
-#define vst2_lane_s32 vst2q_lane_s32
+#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16
-void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
-#define vst2_lane_f32 vst2q_lane_f32
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
+{
+ *(ptr) = val->val[0].m64_f32[lane];
+ *(ptr + 1) = val->val[1].m64_f32[lane];
+}
+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
#define vst2_lane_p8 vst2_lane_u8
@@ -6675,7 +11734,38 @@ _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, floa
vst1q_lane_f32((ptr + 1), val->val[1], lane);
vst1q_lane_f32((ptr + 2), val->val[2], lane);
}
-#define vst3q_lane_f32(ptr, val, lane) vst3q_lane_f32_ptr(ptr, &val, lane)
+#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
+
+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
+#define vst3q_lane_p16 vst3q_lane_s16
+
+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
+{
+ *(ptr) = val->val[0].m64_u8[lane];
+ *(ptr + 1) = val->val[1].m64_u8[lane];
+ *(ptr + 2) = val->val[2].m64_u8[lane];
+}
+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
+
+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
+{
+ *(ptr) = val->val[0].m64_u16[lane];
+ *(ptr + 1) = val->val[1].m64_u16[lane];
+ *(ptr + 2) = val->val[2].m64_u16[lane];
+}
+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
+
+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
+{
+ *(ptr) = val->val[0].m64_u32[lane];
+ *(ptr + 1) = val->val[1].m64_u32[lane];
+ *(ptr + 2) = val->val[2].m64_u32[lane];
+}
+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
@@ -6695,7 +11785,13 @@ void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __co
//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
-#define vst3_lane_f32 vst3q_lane_f32
+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
+{
+ *(ptr) = val->val[0].m64_f32[lane];
+ *(ptr + 1) = val->val[1].m64_f32[lane];
+ *(ptr + 2) = val->val[2].m64_f32[lane];
+}
+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
@@ -6743,7 +11839,7 @@ _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, floa
vst1q_lane_f32((ptr + 2), val->val[2], lane);
vst1q_lane_f32((ptr + 3), val->val[3], lane);
}
-#define vst4q_lane_f32(ptr, val, lane) vst4q_lane_f32_ptr(ptr, &val, lane)
+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
@@ -6752,31 +11848,30 @@ void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __
//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
{
- vst1q_lane_u8(ptr, val->val[0], lane);
- vst1q_lane_u8((ptr + 1), val->val[1], lane);
- vst1q_lane_u8((ptr + 2), val->val[2], lane);
- vst1q_lane_u8((ptr + 3), val->val[3], lane);
+ *(ptr) = val->val[0].m64_u8[lane];
+ *(ptr + 1) = val->val[1].m64_u8[lane];
+ *(ptr + 2) = val->val[2].m64_u8[lane];
+ *(ptr + 3) = val->val[3].m64_u8[lane];
}
#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
{
- vst1q_lane_u16(ptr, val->val[0], lane);
- vst1q_lane_u16((ptr + 1),val->val[1], lane);
- vst1q_lane_u16((ptr + 2), val->val[2], lane);
- vst1q_lane_u16((ptr + 3), val->val[3], lane);
+ *(ptr) = val->val[0].m64_u16[lane];
+ *(ptr + 1) = val->val[1].m64_u16[lane];
+ *(ptr + 2) = val->val[2].m64_u16[lane];
+ *(ptr + 3) = val->val[3].m64_u16[lane];
}
#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
{
- vst1q_lane_u32(ptr, val->val[0], lane);
- vst1q_lane_u32((ptr + 1), val->val[1], lane);
- vst1q_lane_u32((ptr + 2), val->val[2], lane);
- vst1q_lane_u32((ptr + 3), val->val[3], lane);
-
+ *(ptr) = val->val[0].m64_u32[lane];
+ *(ptr + 1) = val->val[1].m64_u32[lane];
+ *(ptr + 2) = val->val[2].m64_u32[lane];
+ *(ptr + 3) = val->val[3].m64_u32[lane];
}
#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
@@ -6793,8 +11888,15 @@ _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32
void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
//current IA SIMD doesn't support float16
-//void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-#define vst4_lane_f32 vst4q_lane_f32
+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
+{
+ *(ptr) = val->val[0].m64_f32[lane];
+ *(ptr + 1) = val->val[1].m64_f32[lane];
+ *(ptr + 2) = val->val[2].m64_f32[lane];
+ *(ptr + 3) = val->val[3].m64_f32[lane];
+}
+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
@@ -6808,32 +11910,59 @@ void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __c
//************************ Extract lanes from a vector ********************************************
//**************************************************************************************************
//These intrinsics extract a single lane (element) from a vector.
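// Usage sketch (illustrative only, not part of this patch); the helper name is ours and it
// assumes this header is included. It reads one lane from a 64-bit and a 128-bit vector
// with the intrinsics declared just below.
static void neon2sse_example_get_lane(void)
{
    uint8x8_t d = vdup_n_u8(7);         // 64-bit vector, every lane == 7
    uint8_t b = vget_lane_u8(d, 3);     // b == 7
    float32x4_t q = vdupq_n_f32(1.5f);  // 128-bit vector, every lane == 1.5f
    float32_t f = vgetq_lane_f32(q, 3); // f == 1.5f
    (void)b; (void)f;
}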
+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
-uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
+
+
+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
+
+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
+
+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
+
+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
+
+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_p8 vget_lane_u8
+
+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_p16 vget_lane_u16
+
+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
+
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
#define vgetq_lane_u8 _MM_EXTRACT_EPI8
-uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
#define vgetq_lane_u16 _MM_EXTRACT_EPI16
-uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
#define vgetq_lane_u32 _MM_EXTRACT_EPI32
-int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
#define vgetq_lane_s8 vgetq_lane_u8
-int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
#define vgetq_lane_s16 vgetq_lane_u16
-int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
#define vgetq_lane_s32 vgetq_lane_u32
-poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
#define vgetq_lane_p8 vgetq_lane_u8
-poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
#define vgetq_lane_p16 vgetq_lane_u16
-float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
{
int32_t ilane;
@@ -6841,10 +11970,17 @@ _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int
return *(float*)&ilane;
}
-int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_s64(vec, lane) vec.m64_i64[0]
+
+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_u64(vec, lane) vec.m64_u64[0]
+
+
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
#define vgetq_lane_s64 (int64_t) vgetq_lane_u64
-uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
#define vgetq_lane_u64 _MM_EXTRACT_EPI64
// ***************** Set lanes within a vector ********************************************
@@ -6852,7 +11988,69 @@ uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); //
//These intrinsics set a single lane (element) within a vector.
//same functions as vld1_lane_xx ones, but take the value to be set directly.
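// Usage sketch (illustrative only, not part of this patch); helper name and values are ours.
// It overwrites a single lane while leaving the remaining lanes untouched.
static void neon2sse_example_set_lane(void)
{
    uint16x4_t d = vdup_n_u16(1);   // {1,1,1,1}
    d = vset_lane_u16(42, d, 2);    // {1,1,42,1}
    float32x4_t q = vdupq_n_f32(0.0f);
    q = vsetq_lane_f32(3.5f, q, 0); // {3.5f,0,0,0}
    (void)d; (void)q;
}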
-uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
+{
+ uint8_t val;
+ val = value;
+ return vld1_lane_u8(&val, vec, lane);
+}
+
+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
+{
+ uint16_t val;
+ val = value;
+ return vld1_lane_u16(&val, vec, lane);
+}
+
+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
+{
+ uint32_t val;
+ val = value;
+ return vld1_lane_u32(&val, vec, lane);
+}
+
+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
+{
+ int8_t val;
+ val = value;
+ return vld1_lane_s8(&val, vec, lane);
+}
+
+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
+{
+ int16_t val;
+ val = value;
+ return vld1_lane_s16(&val, vec, lane);
+}
+
+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
+{
+ int32_t val;
+ val = value;
+ return vld1_lane_s32(&val, vec, lane);
+}
+
+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+#define vset_lane_p8 vset_lane_u8
+
+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+#define vset_lane_p16 vset_lane_u16
+
+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
+{
+ float32_t val;
+ val = value;
+ return vld1_lane_f32(&val, vec, lane);
+}
+
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
{
uint8_t val;
@@ -6860,7 +12058,7 @@ _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __const
return vld1q_lane_u8(&val, vec, lane);
}
-uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
{
uint16_t val;
@@ -6868,7 +12066,7 @@ _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __con
return vld1q_lane_u16(&val, vec, lane);
}
-uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
{
uint32_t val;
@@ -6876,7 +12074,7 @@ _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __con
return vld1q_lane_u32(&val, vec, lane);
}
-int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
{
int8_t val;
@@ -6884,7 +12082,7 @@ _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constran
return vld1q_lane_s8(&val, vec, lane);
}
-int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
{
int16_t val;
@@ -6892,7 +12090,7 @@ _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constr
return vld1q_lane_s16(&val, vec, lane);
}
-int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
{
int32_t val;
@@ -6900,20 +12098,37 @@ _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constr
return vld1q_lane_s32(&val, vec, lane);
}
-poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
#define vsetq_lane_p8 vsetq_lane_u8
-poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
#define vsetq_lane_p16 vsetq_lane_u16
-float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
{
float32_t val;
val = value;
+ return vld1q_lane_f32(&val, vec, lane);
+}
+
+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
+{
+ int64_t val;
+ val = value;
+ return vld1_lane_s64(&val, vec, lane);
}
-int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
+{
+ uint64_t val;
+ val = value;
+ return vld1_lane_u64(&val, vec, lane);
+}
+
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
{
uint64_t val;
@@ -6921,134 +12136,580 @@ _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constr
return vld1q_lane_s64(&val, vec, lane);
}
-uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
#define vsetq_lane_u64 vsetq_lane_s64
// *******************************************************************************
// **************** Initialize a vector from bit pattern ***************************
// *******************************************************************************
//These intrinsics create a vector from a literal bit pattern.
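// Usage sketch (illustrative only, not part of this patch); helper name and constant are ours.
// The literal has to be stored in an lvalue because vcreate_* reinterprets its address; lane 0
// comes from the least significant byte on this little-endian target.
static void neon2sse_example_create(void)
{
    uint64_t bits = 0x0807060504030201ULL;
    uint8x8_t v = vcreate_u8(bits); // lanes {1,2,3,4,5,6,7,8}
    (void)v;
}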
+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s8(a) (*(__m64_128*)&(a))
+
+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s16 vcreate_s8
+
+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s32 vcreate_s8
+
+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
//no IA32 SIMD available
+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_f32(a) (*(__m64_128*)&(a))
+
+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u8 vcreate_s8
+
+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u16 vcreate_s16
+
+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u32 vcreate_s32
+
+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u64 vcreate_s8
+
+
+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p8 vcreate_u8
+
+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p16 vcreate_u16
+
+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s64 vcreate_u64
+
//********************* Set all lanes to same value ********************************
//*********************************************************************************
//These intrinsics set all lanes to the same value.
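// Usage sketch (illustrative only, not part of this patch); helper name is ours. The 64-bit
// vdup_n_* forms below fill their lanes serially (hence the performance warning), while the
// 128-bit vdupq_n_* forms map to a single _mm_set1_* call.
static void neon2sse_example_dup_n(void)
{
    int16x4_t d = vdup_n_s16(-3);  // {-3,-3,-3,-3}, element by element
    int16x8_t q = vdupq_n_s16(-3); // one _mm_set1_epi16
    (void)d; (void)q;
}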
+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint8x8_t res;
+ int i;
+ for (i = 0; i<8; i++) {
+ res.m64_u8[i] = value;
+ }
+ return res;
+}
+
+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint16x4_t res;
+ int i;
+ for (i = 0; i<4; i++) {
+ res.m64_u16[i] = value;
+ }
+ return res;
+}
+
+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ uint32x2_t res;
+ res.m64_u32[0] = value;
+ res.m64_u32[1] = value;
+ return res;
+}
+
+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int8x8_t res;
+ int i;
+ for (i = 0; i<8; i++) {
+ res.m64_i8[i] = value;
+ }
+ return res;
+}
+
+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int16x4_t res;
+ int i;
+ for (i = 0; i<4; i++) {
+ res.m64_i16[i] = value;
+ }
+ return res;
+}
+
+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32x2_t res;
+ res.m64_i32[0] = value;
+ res.m64_i32[1] = value;
+ return res;
+}
+
+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vdup_n_p8 vdup_n_u8
-uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vdup_n_p16 vdup_n_s16
+
+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
+{
+ float32x2_t res;
+ res.m64_f32[0] = value;
+ res.m64_f32[1] = value;
+ return res;
+}
+
+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
-uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
-uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
-int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
#define vdupq_n_s8 _mm_set1_epi8
-int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
#define vdupq_n_s16 _mm_set1_epi16
-int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
#define vdupq_n_s32 _mm_set1_epi32
-poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
#define vdupq_n_p8 vdupq_n_u8
-poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
#define vdupq_n_p16 vdupq_n_u16
-float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
#define vdupq_n_f32 _mm_set1_ps
-int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
+{
+ int64x1_t res;
+ res.m64_i64[0] = value;
+ return res;
+}
+
+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
+{
+ uint64x1_t res;
+ res.m64_u64[0] = value;
+ return res;
+}
+
+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
{
- _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
+ _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
return LOAD_SI128(value2);
}
-uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
{
- _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
+ _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
return LOAD_SI128(val);
}
//**** Set all lanes to same value ************************
//Same functions as above - just aliases.********************
//Probably they reflect the fact that the 128-bit function versions use the VMOV instruction **********
+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+#define vmov_n_u8 vdup_n_s8
+
+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+#define vmov_n_u16 vdup_n_s16
-uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+#define vmov_n_u32 vdup_n_u32
+
+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+#define vmov_n_s8 vdup_n_s8
+
+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+#define vmov_n_s16 vdup_n_s16
+
+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+#define vmov_n_s32 vdup_n_s32
+
+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vmov_n_p8 vdup_n_u8
+
+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vmov_n_p16 vdup_n_s16
+
+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+#define vmov_n_f32 vdup_n_f32
+
+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
#define vmovq_n_u8 vdupq_n_u8
-uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
#define vmovq_n_u16 vdupq_n_s16
-uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
#define vmovq_n_u32 vdupq_n_u32
-int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
#define vmovq_n_s8 vdupq_n_s8
-int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
#define vmovq_n_s16 vdupq_n_s16
-int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
#define vmovq_n_s32 vdupq_n_s32
-poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
#define vmovq_n_p8 vdupq_n_u8
-poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
#define vmovq_n_p16 vdupq_n_s16
-float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
#define vmovq_n_f32 vdupq_n_f32
-int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+#define vmov_n_s64 vdup_n_s64
+
+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+#define vmov_n_u64 vdup_n_u64
+
+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
#define vmovq_n_s64 vdupq_n_s64
-uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
#define vmovq_n_u64 vdupq_n_u64
//**************Set all lanes to the value of one lane of a vector *************
//****************************************************************************
//here a shuffle is a better solution than lane extraction followed by the set1 function
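// Illustrative contrast only, not part of this patch; helper name is ours. The vdupq_lane_u32
// macro below broadcasts the requested lane with a single _mm_shuffle_epi32, instead of the
// scalar extract followed by _mm_set1_epi32 shown here for comparison.
static void neon2sse_example_dup_lane(uint32x2_t d)
{
    uint32x4_t via_shuffle = vdupq_lane_u32(d, 1);                     // one shuffle
    uint32x4_t via_extract = _mm_set1_epi32((int)vget_lane_u32(d, 1)); // extract + broadcast
    (void)via_shuffle; (void)via_extract;
}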
+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
+{
+ uint8x8_t res;
+ uint8_t valane;
+ int i = 0;
+ valane = vec.m64_u8[lane];
+ for (i = 0; i<8; i++) {
+ res.m64_u8[i] = valane;
+ }
+ return res;
+}
+
+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
+{
+ uint16x4_t res;
+ uint16_t valane;
+ valane = vec.m64_u16[lane];
+ res.m64_u16[0] = valane;
+ res.m64_u16[1] = valane;
+ res.m64_u16[2] = valane;
+ res.m64_u16[3] = valane;
+ return res;
+}
+
+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
+{
+ uint32x2_t res;
+ res.m64_u32[0] = vec.m64_u32[lane];
+ res.m64_u32[1] = res.m64_u32[0];
+ return res;
+}
+
+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_s8 vdup_lane_u8
+
+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_s16 vdup_lane_u16
+
+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+#define vdup_lane_s32 vdup_lane_u32
+
+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_p8 vdup_lane_u8
+
+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_p16 vdup_lane_s16
+
+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
+{
+ float32x2_t res;
+ res.m64_f32[0] = vec.m64_f32[lane];
+ res.m64_f32[1] = res.m64_f32[0];
+ return res;
+}
+
+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
+{
+ _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
+}
+
+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
+{
+ //we could use the 8-bit shuffle for 16-bit elements as well
+ const int8_t lane16 = ((int8_t) lane) << 1;
+ _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
+ lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
+ return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
+}
+
+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6))
+
+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_s8 vdupq_lane_u8
+
+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_s16 vdupq_lane_u16
+
+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_s32 vdupq_lane_u32
+
+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_p8 vdupq_lane_u8
+
+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_p16 vdupq_lane_s16
+
+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
+
+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_s64(vec,lane) vec
+
+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_u64(vec,lane) vec
+
+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
+{
+ __m128i vec128;
+ vec128 = _pM128i(vec);
+ return _mm_unpacklo_epi64(vec128,vec128);
+}
+
+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+#define vdupq_lane_u64 vdupq_lane_s64
// ********************************************************************
// ******************** Combining vectors *****************************
// ********************************************************************
//These intrinsics join two 64 bit vectors into a single 128bit vector.
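// Usage sketch (illustrative only, not part of this patch); helper name is ours.
static uint8x16_t neon2sse_example_combine(uint8x8_t low, uint8x8_t high)
{
    return vcombine_u8(low, high); // low -> lanes 0..7, high -> lanes 8..15
}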
+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
//current IA SIMD doesn't support float16
+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
+{
+ __m128i res;
+ res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
+ return _M128(res);
+}
+
+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+#define vcombine_u8 vcombine_s8
+
+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+#define vcombine_u16 vcombine_s16
+
+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+#define vcombine_u32 vcombine_s32
+
+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+#define vcombine_u64 vcombine_s64
+
+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+#define vcombine_p8 vcombine_u8
+
+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
+#define vcombine_p16 vcombine_u16
+
//**********************************************************************
//************************* Splitting vectors **************************
//**********************************************************************
//**************** Get high part ******************************************
//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
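// Usage sketch (illustrative only, not part of this patch); helper name is ours. Splitting a
// 128-bit vector and recombining the halves gives back the original value.
static uint8x16_t neon2sse_example_split(uint8x16_t q)
{
    uint8x8_t lo = vget_low_u8(q);  // lanes 0..7
    uint8x8_t hi = vget_high_u8(q); // lanes 8..15
    return vcombine_u8(lo, hi);     // == q
}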
+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = _mm_unpackhi_epi64(a,a); //SSE2
+ return64(res);
+}
+
+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = _mm_unpackhi_epi64(a,a); //SSE2
+ return64(res);
+}
+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _mm_unpackhi_epi64(a,a); //SSE2
+ return64(res);
+}
+
+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
+{
+ int64x1_t res64;
+ __m128i res;
+ res = _mm_unpackhi_epi64(a,a); //SSE2
+ return64(res);
+}
+
+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
// IA32 SIMD doesn't work with 16bit floats currently
+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
+{
+ __m128i res;
+ __m64_128 res64;
+ res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
+ return64(res);
+}
+
+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_high_u8 vget_high_s8
+
+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_high_u16 vget_high_s16
+
+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_high_u32 vget_high_s32
+
+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_high_u64 vget_high_s64
+
+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_high_p8 vget_high_u8
+
+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_high_p16 vget_high_u16
+
//********************** Get low part **********************
//**********************************************************
+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
+{
+ int16x4_t res64;
+ return64(a);
+}
+
+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
+{
+ int16x4_t res64;
+ return64(a);
+}
+
+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
+{
+ int32x2_t res64;
+ return64(a);
+}
+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
+{
+ int64x1_t res64;
+ return64 (a);
+}
+
+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
// IA32 SIMD doesn't work with 16bit floats currently
+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
+{
+ float32x2_t res64;
+ _M64f(res64, a);
+ return res64;
+}
+
+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_low_u8 vget_low_s8
+
+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_low_u16 vget_low_s16
+
+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_low_u32 vget_low_s32
+
+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_low_u64 vget_low_s64
+
+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_low_p8 vget_low_u8
+
+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_low_p16 vget_low_s16
+
//**************************************************************************
//************************ Converting vectors **********************************
//**************************************************************************
//************* Convert from float ***************************************
// the caller needs to set _MM_SET_ROUNDING_MODE(x) accordingly
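// Illustrative note, not part of this patch: _MM_SET_ROUNDING_MODE affects _mm_cvtps_epi32-style
// conversions through MXCSR, while the truncating _mm_cvttps_epi32 calls used below always round
// toward zero (NEON's behaviour for these conversions) regardless of that setting.
//   unsigned int saved = _MM_GET_ROUNDING_MODE();
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//   /* ... conversions relying on MXCSR rounding ... */
//   _MM_SET_ROUNDING_MODE(saved);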
+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
+ return64(res);
+}
+
+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
+{
+ //may be less efficient than a serial (per-element) solution
+ uint32x2_t res64;
+ __m128i res;
+ res = vcvtq_u32_f32(_pM128(a));
+ return64(res);
+}
-int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
-#define vcvtq_s32_f32 _mm_cvtps_epi32
+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+#define vcvtq_s32_f32 _mm_cvttps_epi32
-uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
-_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
-{ //No single instruction SSE solution but we could implement it as following:
+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
+{
+ //No single instruction SSE solution but we could implement it as following:
__m128i resi;
__m128 zero, mask, a_pos, mask_f_max_si, res;
_NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
@@ -7056,30 +12717,85 @@ _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32
mask = _mm_cmpgt_ps(a, zero);
a_pos = _mm_and_ps(a, mask);
mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
- res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything
- resi = _mm_cvtps_epi32(res);
+ res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything
+ resi = _mm_cvttps_epi32(res);
return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
}
// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
//*************************************************************************************************
-//Intel SIMD doesn't support fixed point
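// Worked example (illustrative, not part of this patch): converting to fixed point with b
// fraction bits multiplies by 2^b and truncates, so with b == 8:
//   vcvt_n_s32_f32({1.5f, -0.25f}, 8) -> {384, -64}   (1.5*256 and -0.25*256)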
+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+ int32x2_t res64;
+ return64(vcvtq_n_s32_f32(_pM128(a),b));
+}
-int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
-uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+ uint32x2_t res;
+ float convconst;
+ convconst = (float)((uint32_t)1 << b);
+ res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
+ res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
+ return res;
+}
+
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+ float convconst;
+ _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ __m128 cconst128;
+ __m128i mask, res;
+ convconst = (float)(1 << b);
+ cconst128 = vdupq_n_f32(convconst);
+ res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
+ mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
+ return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
+}
+
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+ float convconst;
+ __m128 cconst128;
+ convconst = (float)(1 << b);
+ cconst128 = vdupq_n_f32(convconst);
+ return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
+}
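+//Illustrative usage sketch (a minimal example assuming only the definitions
+//above; the neon2sse_example_* name is hypothetical, not part of the NEON API):
+//packing floats into Q16.16 fixed point with the _n_ conversion.
+_NEON2SSE_INLINE int32x4_t neon2sse_example_float_to_q16_16(float32x4_t v)
+{
+    //multiplies by 2^16 and truncates, so 1.5f becomes 0x00018000 in every lane
+    return vcvtq_n_s32_f32(v, 16);
+}
+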
//***************** Convert to float *************************
//*************************************************************
+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
+{
+ float32x2_t res;
+ res.m64_f32[0] = (float) a.m64_i32[0];
+ res.m64_f32[1] = (float) a.m64_i32[1];
+ return res;
+}
-float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
+{
+ float32x2_t res;
+ res.m64_f32[0] = (float) a.m64_u32[0];
+ res.m64_f32[1] = (float) a.m64_u32[1];
+ return res;
+}
+
+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
-float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
-_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
-{ //solution may be not optimal
+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
+{
+ //solution may not be optimal
__m128 two16, fHi, fLo;
__m128i hi, lo;
- two16 = _mm_set1_ps((float)0x10000); //2^16
+ two16 = _mm_set1_ps((float)0x10000); //2^16
// Avoid double rounding by doing two exact conversions
// of high and low 16-bit segments
hi = _mm_srli_epi32(a, 16);
@@ -7090,24 +12806,228 @@ _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32
return _mm_add_ps(fHi, fLo);
}
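+//Illustrative sketch of why the 16-bit split above is needed (assumes vdupq_n_u32
+//from the duplicate section earlier in this header; the neon2sse_example_* name
+//is hypothetical): _mm_cvtepi32_ps treats its input as signed, so converting a
+//large unsigned value directly would produce a negative float.
+_NEON2SSE_INLINE float32x4_t neon2sse_example_u32_to_f32(void)
+{
+    //every lane becomes about 4.295e9f (2^32 after rounding), not a negative value
+    return vcvtq_f32_u32(vdupq_n_u32(0xffffffff));
+}
+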
+// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
+{
+ float32x2_t res;
+ float convconst;
+ convconst = (float)(1. / ((uint32_t)1 << b));
+ res.m64_f32[0] = a.m64_i32[0] * convconst;
+ res.m64_f32[1] = a.m64_i32[1] * convconst;
+ return res;
+}
+
+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
+{
+ float32x2_t res;
+ float convconst;
+ convconst = (float)(1. / ((uint32_t)1 << b));
+ res.m64_f32[0] = a.m64_u32[0] * convconst;
+ res.m64_f32[1] = a.m64_u32[1] * convconst;
+ return res;
+}
+
+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
+{
+ float convconst;
+ __m128 cconst128, af;
+ convconst = (float)(1. / ((uint32_t)1 << b));
+ af = _mm_cvtepi32_ps(a);
+ cconst128 = vdupq_n_f32(convconst);
+ return _mm_mul_ps(af,cconst128);
+}
+
+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
+{
+ float convconst;
+ __m128 cconst128, af;
+ convconst = (float)(1. / (1 << b));
+ af = vcvtq_f32_u32(a);
+ cconst128 = vdupq_n_f32(convconst);
+ return _mm_mul_ps(af,cconst128);
+}
+
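+//Illustrative round-trip sketch for the fixed-point conversions above (the
+//neon2sse_example_* name is hypothetical): a Q16.16 value produced by
+//vcvtq_n_s32_f32 is recovered by scaling with 2^-16.
+_NEON2SSE_INLINE float32x4_t neon2sse_example_q16_16_to_float(int32x4_t q)
+{
+    //0x00018000 in Q16.16 maps back to 1.5f in every lane
+    return vcvtq_n_f32_s32(q, 16);
+}
+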
//**************Convert between floats ***********************
//************************************************************
-
+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
//Intel SIMD doesn't support 16-bit floats currently
+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Intel SIMD doesn't support 16-bit floats currently; the only solution is to store 16-bit floats and load them as 32-bit floats
//************Vector narrow integer conversion (truncation) ******************
//****************************************************************************
+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
+{
+ int8x8_t res64;
+ __m128i res;
+ _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
+ return64(res);
+}
+
+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
+{
+ int16x4_t res64;
+ __m128i res;
+ _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
+ res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
+ return64(res);
+}
+
+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
+{
+ //may be less efficient than a serial implementation
+ int32x2_t res64;
+ __m128i res;
+ res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
+ return64(res);
+}
+
+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+#define vmovn_u16 vmovn_s16
+
+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+#define vmovn_u32 vmovn_s32
+
+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
+#define vmovn_u64 vmovn_s64
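+//Illustrative sketch (hypothetical neon2sse_example_* name, assumes vdupq_n_s32
+//from the duplicate section above): vmovn_* keeps only the low half of each
+//lane, it does not saturate.
+_NEON2SSE_INLINE int16x4_t neon2sse_example_vmovn_truncates(void)
+{
+    //0x00018000 is truncated to 0x8000, i.e. -32768 in every 16-bit lane
+    return vmovn_s32(vdupq_n_s32(0x00018000));
+}
+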
//**************** Vector long move ***********************
//***********************************************************
+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
+
+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
+
+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
+
+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
+
+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
+
+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
+#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
//*************Vector saturating narrow integer*****************
//**************************************************************
+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = _mm_packs_epi16(a, a);
+ return64(res);
+}
+
+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = _mm_packs_epi32(a, a);
+ return64(res);
+}
+
+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
+{
+ int32x2_t res;
+ _NEON2SSE_ALIGN_16 int64_t atmp[2];
+ _mm_store_si128((__m128i*)atmp, a);
+ if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
+ if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
+ if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
+ if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
+ res.m64_i32[0] = (int32_t)atmp[0];
+ res.m64_i32[1] = (int32_t)atmp[1];
+ return res;
+}
+
+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
+{
+ //no uint16 to uint8 conversion in SSE, need to truncate to the signed maximum first
+ uint8x8_t res64;
+ __m128i c7fff, a_trunc;
+ c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
+ a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
+ a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
+ return64(a_trunc);
+}
+
+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
+{
+ //no uint32 to uint16 conversion in SSE, need to truncate to the signed maximum first
+ uint16x4_t res64;
+ __m128i c7fffffff, a_trunc;
+ c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
+ a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
+ a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
+ return64(a_trunc);
+}
+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
+{
+ //serial solution may be faster
+ uint32x2_t res64;
+ __m128i res_hi, mask;
+ mask = _mm_setzero_si128();
+ res_hi = _mm_srli_epi64(a, 32);
+ res_hi = _mm_cmpeq_epi32(res_hi, mask);
+ mask = _mm_cmpeq_epi32(mask,mask); //all fff
+ mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to flag the values that need more than 32 bits
+ res_hi = _mm_or_si128(a, mask);
+ res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle to pack the two 32-bit results into the low half
+ return64(res_hi);
+}
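+//Illustrative sketch contrasting the saturating narrow with plain vmovn_*
+//(hypothetical neon2sse_example_* name, assumes vdupq_n_u16 from the duplicate
+//section above): out-of-range values are clamped rather than truncated.
+_NEON2SSE_INLINE uint8x8_t neon2sse_example_vqmovn_saturates(void)
+{
+    //384 does not fit in a uint8, so every lane saturates to 255
+    return vqmovn_u16(vdupq_n_u16(384));
+}
+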
//************* Vector saturating narrow integer signed->unsigned **************
//*****************************************************************************
+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
+{
+ uint8x8_t res64;
+ __m128i res;
+ res = _mm_packus_epi16(a, a); //use low 64bits only
+ return64(res);
+}
+
+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
+{
+ uint16x4_t res64;
+ __m128i res;
+ res = _MM_PACKUS1_EPI32(a); //use low 64bits only
+ return64(res);
+}
+
+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
+{
+ uint32x2_t res64;
+ __m128i res_hi,res_lo, zero, cmp;
+ zero = _mm_setzero_si128();
+ res_hi = _mm_srli_epi64(a, 32);
+ cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
+ res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
+ cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
+ res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of the 32-bit range and need to saturate to 0xffffffff
+ res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle to pack the two 32-bit results into the low half
+ return64(res_lo);
+}
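+//Illustrative sketch for the signed-to-unsigned saturating narrow (hypothetical
+//neon2sse_example_* name): negative inputs clamp to 0, large positive inputs
+//clamp to the unsigned maximum.
+_NEON2SSE_INLINE uint8x8_t neon2sse_example_vqmovun(void)
+{
+    //-5 clamps to 0 in every lane; 300 would clamp to 255
+    return vqmovun_s16(vdupq_n_s16(-5));
+}
+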
// ********************************************************
// **************** Table look up **************************
@@ -7115,17 +13035,218 @@ _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32
//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
//in a table and generate a new vector. Indexes out of range return 0.
//for Intel SIMD we need to set the MSB to 1 for zero return
+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ __m128i c7, maskgt, bmask, b128;
+ c7 = _mm_set1_epi8 (7);
+ b128 = _pM128i(b);
+ maskgt = _mm_cmpgt_epi8(b128,c7);
+ bmask = _mm_or_si128(b128,maskgt);
+ bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
+ return64(bmask);
+}
+
+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_s8 vtbl1_u8
+
+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_p8 vtbl1_u8
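+//Illustrative sketch of the MSB trick above (hypothetical neon2sse_example_*
+//name, assumes vdup_n_u8 from the duplicate section earlier in this header):
+//any index outside 0..7 gets its MSB set, so _mm_shuffle_epi8 returns 0 for it,
+//matching the NEON VTBL behaviour.
+_NEON2SSE_INLINE uint8x8_t neon2sse_example_vtbl1_out_of_range(void)
+{
+    uint8x8_t table = vdup_n_u8(0x42);
+    uint8x8_t index = vdup_n_u8(9); //9 is outside the 8-byte table
+    return vtbl1_u8(table, index); //every lane becomes 0
+}
+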
//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
+{
+ uint8x8_t res64;
+ __m128i c15, a01, maskgt15, bmask, b128;
+ c15 = _mm_set1_epi8 (15);
+ b128 = _pM128i(b);
+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
+ bmask = _mm_or_si128(b128, maskgt15);
+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
+ a01 = _mm_shuffle_epi8(a01, bmask);
+ return64(a01);
+}
+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
+
+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_s8 vtbl2_u8
+
+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_p8 vtbl2_u8
//Special trick to avoid the "__declspec(align('16')) won't be aligned" error
+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
+{
+ //solution may not be optimal
+ uint8x8_t res64;
+ __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
+ c15 = _mm_set1_epi8 (15);
+ c23 = _mm_set1_epi8 (23);
+ b128 = _pM128i(b);
+ maskgt23 = _mm_cmpgt_epi8(b128,c23);
+ bmask = _mm_or_si128(b128, maskgt23);
+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
+ sh0 = _mm_shuffle_epi8(a01, bmask);
+ sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //indexes above 15 wrap modulo 16 in the shuffle
+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
+ return64(sh0);
+}
+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
+
+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_s8 vtbl3_u8
+
+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_p8 vtbl3_u8
+
+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
+{
+ //solution may not be optimal
+ uint8x8_t res64;
+ __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
+ c15 = _mm_set1_epi8 (15);
+ c31 = _mm_set1_epi8 (31);
+ b128 = _pM128i(b);
+ maskgt31 = _mm_cmpgt_epi8(b128,c31);
+ bmask = _mm_or_si128(b128, maskgt31);
+ maskgt15 = _mm_cmpgt_epi8(b128,c15);
+ a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
+ a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
+ sh0 = _mm_shuffle_epi8(a01, bmask);
+ sh1 = _mm_shuffle_epi8(a23, bmask); //indexes above 15 wrap modulo 16 in the shuffle
+ sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
+ return64(sh0);
+}
+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
+
+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_s8 vtbl4_u8
+
+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_p8 vtbl4_u8
//****************** Extended table look up intrinsics ***************************
//**********************************************************************************
//VTBX (Vector Table Extension) works in the same way as VTBL do,
// except that indexes out of range leave the destination element unchanged.
+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+ uint8x8_t res64;
+ __m128i c7, maskgt, sh, c128;
+ c7 = _mm_set1_epi8 (7);
+ c128 = _pM128i(c);
+ maskgt = _mm_cmpgt_epi8(c128,c7);
+ c7 = _mm_and_si128(maskgt,_pM128i(a));
+ sh = _mm_shuffle_epi8(_pM128i(b),c128);
+ sh = _mm_andnot_si128(maskgt,sh);
+ sh = _mm_or_si128(sh,c7);
+ return64(sh);
+}
+
+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_s8 vtbx1_u8
+
+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_p8 vtbx1_u8
+
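+//Illustrative sketch of the VTBX behaviour (hypothetical neon2sse_example_*
+//name): out-of-range indexes leave the corresponding destination byte unchanged.
+_NEON2SSE_INLINE uint8x8_t neon2sse_example_vtbx1_keeps_dest(void)
+{
+    uint8x8_t dest  = vdup_n_u8(0xaa);
+    uint8x8_t table = vdup_n_u8(0x42);
+    uint8x8_t index = vdup_n_u8(9); //out of range, so dest must be preserved
+    return vtbx1_u8(dest, table, index); //every lane stays 0xaa
+}
+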
//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
+{
+ uint8x8_t res64;
+ __m128i c15, b01, maskgt15, sh, c128;
+ c15 = _mm_set1_epi8 (15);
+ c128 = _pM128i(c);
+ maskgt15 = _mm_cmpgt_epi8(c128, c15);
+ c15 = _mm_and_si128(maskgt15, _pM128i(a));
+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
+ sh = _mm_shuffle_epi8(b01, c128);
+ sh = _mm_andnot_si128(maskgt15, sh);
+ sh = _mm_or_si128(sh,c15);
+ return64(sh);
+}
+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_s8 vtbx2_u8
+
+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_p8 vtbx2_u8
+
+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
+{
+ //solution may not be optimal
+ uint8x8_t res64;
+ __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
+ c15 = _mm_set1_epi8 (15);
+ c23 = _mm_set1_epi8 (23);
+ c128 = _pM128i(c);
+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
+ maskgt23 = _mm_cmpgt_epi8(c128,c23);
+ c23 = _mm_and_si128(maskgt23, _pM128i(a));
+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
+ sh0 = _mm_shuffle_epi8(b01, c128);
+ sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //indexes above 15 wrap modulo 16 in the shuffle
+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+ sh0 = _mm_andnot_si128(maskgt23,sh0);
+ sh0 = _mm_or_si128(sh0,c23);
+ return64(sh0);
+}
+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
+#define vtbx3_s8 vtbx3_u8
+
+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
+#define vtbx3_p8 vtbx3_u8
+
+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
+{
+ //solution may not be optimal
+ uint8x8_t res64;
+ __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
+ c15 = _mm_set1_epi8 (15);
+ c31 = _mm_set1_epi8 (31);
+ c128 = _pM128i(c);
+ maskgt15 = _mm_cmpgt_epi8(c128,c15);
+ maskgt31 = _mm_cmpgt_epi8(c128,c31);
+ c31 = _mm_and_si128(maskgt31, _pM128i(a));
+
+ b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
+ b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
+ sh0 = _mm_shuffle_epi8(b01, c128);
+ sh1 = _mm_shuffle_epi8(b23, c128); //indexes above 15 wrap modulo 16 in the shuffle
+ sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+ sh0 = _mm_andnot_si128(maskgt31,sh0);
+ sh0 = _mm_or_si128(sh0,c31);
+ return64(sh0);
+}
+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
+#define vtbx4_s8 vtbx4_u8
+
+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
+#define vtbx4_p8 vtbx4_u8
//*************************************************************************************************
// *************************** Operations with a scalar value *********************************
@@ -7133,83 +13254,603 @@ _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32
//******* Vector multiply accumulate by scalar *************************************************
//**********************************************************************************************
+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
+{
+ int16_t c;
+ int16x4_t scalar;
+ c = vget_lane_s16(v, l);
+ scalar = vdup_n_s16(c);
+ return vmla_s16(a, b, scalar);
+}
+
+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
+{
+ int32_t c;
+ int32x2_t scalar;
+ c = vget_lane_s32(v, l);
+ scalar = vdup_n_s32(c);
+ return vmla_s32(a, b, scalar);
+}
+
+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+#define vmla_lane_u16 vmla_lane_s16
+
+
+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+#define vmla_lane_u32 vmla_lane_s32
+
+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+ float32_t vlane;
+ float32x2_t c;
+ vlane = vget_lane_f32(v, l);
+ c = vdup_n_f32(vlane);
+ return vmla_f32(a,b,c);
+}
+
+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+ int16_t vlane;
+ int16x8_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdupq_n_s16(vlane);
+ return vmlaq_s16(a,b,c);
+}
+
+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+ int32_t vlane;
+ int32x4_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdupq_n_s32(vlane);
+ return vmlaq_s32(a,b,c);
+}
+
+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+#define vmlaq_lane_u16 vmlaq_lane_s16
+
+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+#define vmlaq_lane_u32 vmlaq_lane_s32
+
+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+ float32_t vlane;
+ float32x4_t c;
+ vlane = vget_lane_f32(v, l);
+ c = vdupq_n_f32(vlane);
+ return vmlaq_f32(a,b,c);
+}
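+//Illustrative usage sketch for the multiply-accumulate-by-lane pattern above
+//(hypothetical neon2sse_example_* name): acc[i] + x[i] * coeff[0] for every lane.
+_NEON2SSE_INLINE float32x4_t neon2sse_example_mla_by_lane(float32x4_t acc, float32x4_t x, float32x2_t coeff)
+{
+    //broadcasts lane 0 of coeff, multiplies it into x and adds the result to acc
+    return vmlaq_lane_f32(acc, x, coeff, 0);
+}
+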
//***************** Vector widening multiply accumulate by scalar **********************
//***************************************************************************************
+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+ int16_t vlane;
+ int16x4_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdup_n_s16(vlane);
+ return vmlal_s16(a, b, c);
+}
+
+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+ int32_t vlane;
+ int32x2_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdup_n_s32(vlane);
+ return vmlal_s32(a, b, c);
+}
+
+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
+{
+ uint16_t vlane;
+ uint16x4_t c;
+ vlane = vget_lane_u16(v, l);
+ c = vdup_n_u16(vlane);
+ return vmlal_u16(a, b, c);
+}
+
+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+ uint32_t vlane;
+ uint32x2_t c;
+ vlane = vget_lane_u32(v, l);
+ c = vdup_n_u32(vlane);
+ return vmlal_u32(a, b, c);
+}
// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
// ************************************************************************************************
+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+ int16_t vlane;
+ int16x4_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdup_n_s16(vlane);
+ return vqdmlal_s16(a, b, c);
+}
+
+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
+{
+ int32_t vlane;
+ int32x2_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdup_n_s32(vlane);
+ return vqdmlal_s32(a, b, c);
+}
// ****** Vector multiply subtract by scalar *****************
// *************************************************************
+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+ int16_t vlane;
+ int16x4_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdup_n_s16(vlane);
+ return vmls_s16(a, b, c);
+}
+
+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+ int32_t vlane;
+ int32x2_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdup_n_s32(vlane);
+ return vmls_s32(a, b, c);
+}
+
+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+ uint16_t vlane;
+ uint16x4_t c;
+ vlane = vget_lane_u16(v, l);
+ c = vdup_n_u16(vlane);
+ return vmls_u16(a, b, c);
+}
+
+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+ uint32_t vlane;
+ uint32x2_t c;
+ vlane = vget_lane_u32(v, l);
+ c = vdup_n_u32(vlane);
+ return vmls_u32(a, b, c);
+}
+
+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+ float32_t vlane;
+ float32x2_t c;
+ vlane = (float) vget_lane_f32(v, l);
+ c = vdup_n_f32(vlane);
+ return vmls_f32(a,b,c);
+}
+
+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
+{
+ int16_t vlane;
+ int16x8_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdupq_n_s16(vlane);
+ return vmlsq_s16(a, b,c);
+}
+
+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
+{
+ int32_t vlane;
+ int32x4_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdupq_n_s32(vlane);
+ return vmlsq_s32(a,b,c);
+}
+
+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+ uint16_t vlane;
+ uint16x8_t c;
+ vlane = vget_lane_u16(v, l);
+ c = vdupq_n_u16(vlane);
+ return vmlsq_u16(a,b,c);
+}
+
+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+ uint32_t vlane;
+ uint32x4_t c;
+ vlane = vget_lane_u32(v, l);
+ c = vdupq_n_u32(vlane);
+ return vmlsq_u32(a,b,c);
+}
+
+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+ float32_t vlane;
+ float32x4_t c;
+ vlane = (float) vget_lane_f32(v, l);
+ c = vdupq_n_f32(vlane);
+ return vmlsq_f32(a,b,c);
+}
// **** Vector widening multiply subtract by scalar ****
// ****************************************************
+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+ int16_t vlane;
+ int16x4_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdup_n_s16(vlane);
+ return vmlsl_s16(a, b, c);
+}
+
+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+ int32_t vlane;
+ int32x2_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdup_n_s32(vlane);
+ return vmlsl_s32(a, b, c);
+}
+
+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
+{
+ uint16_t vlane;
+ uint16x4_t c;
+ vlane = vget_lane_u16(v, l);
+ c = vdup_n_u16(vlane);
+ return vmlsl_u16(a, b, c);
+}
+
+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+ uint32_t vlane;
+ uint32x2_t c;
+ vlane = vget_lane_u32(v, l);
+ c = vdup_n_u32(vlane);
+ return vmlsl_u32(a, b, c);
+}
//********* Vector widening saturating doubling multiply subtract by scalar **************************
//******************************************************************************************************
+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+ int16_t vlane;
+ int16x4_t c;
+ vlane = vget_lane_s16(v, l);
+ c = vdup_n_s16(vlane);
+ return vqdmlsl_s16(a, b, c);
+}
+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32_t vlane;
+ int32x2_t c;
+ vlane = vget_lane_s32(v, l);
+ c = vdup_n_s32(vlane);
+ return vqdmlsl_s32(a, b, c);
+}
//********** Vector multiply with scalar *****************************
+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+ int16x4_t b16x4;
+ b16x4 = vdup_n_s16(b);
+ return vmul_s16(a, b16x4);
+}
+
+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+ //serial solution looks faster
+ int32x2_t b32x2;
+ b32x2 = vdup_n_s32(b);
+ return vmul_s32(a, b32x2);
+}
+
+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
+{
+ float32x2_t b32x2;
+ b32x2 = vdup_n_f32(b);
+ return vmul_f32(a, b32x2);
+}
-int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+ uint16x4_t b16x4;
+ b16x4 = vdup_n_u16(b);
+ return vmul_u16(a, b16x4);
+}
+
+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+ //serial solution looks faster
+ uint32x2_t b32x2;
+ b32x2 = vdup_n_u32(b);
+ return vmul_u32(a, b32x2);
+}
+
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
{
int16x8_t b16x8;
b16x8 = vdupq_n_s16(b);
return vmulq_s16(a, b16x8);
}
-int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
-_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
{
int32x4_t b32x4;
b32x4 = vdupq_n_s32(b);
return vmulq_s32(a, b32x4);
}
-float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
-_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
{
float32x4_t b32x4;
b32x4 = vdupq_n_f32(b);
return vmulq_f32(a, b32x4);
}
-uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
-_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
{
uint16x8_t b16x8;
b16x8 = vdupq_n_s16(b);
return vmulq_s16(a, b16x8);
}
-uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
-_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
{
uint32x4_t b32x4;
b32x4 = vdupq_n_u32(b);
return vmulq_u32(a, b32x4);
}
+//********** Vector multiply lane *****************************
+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
+{
+ int16x4_t b16x4;
+ int16_t vlane;
+ vlane = vget_lane_s16(b, c);
+ b16x4 = vdup_n_s16(vlane);
+ return vmul_s16(a, b16x4);
+}
+
+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
+{
+ int32x2_t b32x2;
+ int32_t vlane;
+ vlane = vget_lane_s32(b, c);
+ b32x2 = vdup_n_s32(vlane);
+ return vmul_s32(a, b32x2);
+}
+
+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
+{
+ float32x2_t b32x2;
+ float32_t vlane;
+ vlane = vget_lane_f32(b, c);
+ b32x2 = vdup_n_f32(vlane);
+ return vmul_f32(a, b32x2);
+}
+
+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmul_lane_u16 vmul_lane_s16
+
+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmul_lane_u32 vmul_lane_s32
+
+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
+{
+ int16x8_t b16x8;
+ int16_t vlane;
+ vlane = vget_lane_s16(b, c);
+ b16x8 = vdupq_n_s16(vlane);
+ return vmulq_s16(a, b16x8);
+}
+
+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
+{
+ int32x4_t b32x4;
+ int32_t vlane;
+ vlane = vget_lane_s32(b, c);
+ b32x4 = vdupq_n_s32(vlane);
+ return vmulq_s32(a, b32x4);
+}
+
+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
+{
+ float32x4_t b32x4;
+ float32_t vlane;
+ vlane = vget_lane_f32(b, c);
+ b32x4 = vdupq_n_f32(vlane);
+ return vmulq_f32(a, b32x4);
+}
+
+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmulq_lane_u16 vmulq_lane_s16
+
+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmulq_lane_u32 vmulq_lane_s32
+
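+//Illustrative usage sketch for the multiply-by-scalar forms above (hypothetical
+//neon2sse_example_* name): applying one scalar gain to a whole vector.
+_NEON2SSE_INLINE float32x4_t neon2sse_example_scale_by_half(float32x4_t v)
+{
+    //every lane of v is multiplied by 0.5f
+    return vmulq_n_f32(v, 0.5f);
+}
+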
//**** Vector long multiply with scalar ************
+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
+{
+ int16x4_t b16x4;
+ b16x4 = vdup_n_s16(val2);
+ return vmull_s16(vec1, b16x4);
+}
+
+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
+{
+ int32x2_t b32x2;
+ b32x2 = vdup_n_s32(val2);
+ return vmull_s32(vec1, b32x2);
+}
+
+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
+{
+ uint16x4_t b16x4;
+ b16x4 = vdup_n_u16(val2);
+ return vmull_u16(vec1, b16x4);
+}
+
+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
+{
+ uint32x2_t b32x2;
+ b32x2 = vdup_n_u32(val2);
+ return vmull_u32(vec1, b32x2);
+}
//**** Vector long multiply by scalar ****
+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
+{
+ int16_t vlane;
+ int16x4_t b;
+ vlane = vget_lane_s16(val2, val3);
+ b = vdup_n_s16(vlane);
+ return vmull_s16(vec1, b);
+}
+
+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
+{
+ int32_t vlane;
+ int32x2_t b;
+ vlane = vget_lane_s32(val2, val3);
+ b = vdup_n_s32(vlane);
+ return vmull_s32(vec1, b);
+}
+
+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
+{
+ uint16_t vlane;
+ uint16x4_t b;
+ vlane = vget_lane_u16(val2, val3);
+ b = vdup_n_u16(vlane);
+ return vmull_u16(vec1, b);
+}
+
+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
+{
+ uint32_t vlane;
+ uint32x2_t b;
+ vlane = vget_lane_u32(val2, val3);
+ b = vdup_n_u32(vlane);
+ return vmull_u32(vec1, b);
+}
//********* Vector saturating doubling long multiply with scalar *******************
+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
+{
+ //the serial solution may be faster due to saturation
+ int16x4_t b;
+ b = vdup_n_s16(val2);
+ return vqdmull_s16(vec1, b);
+}
+
+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32x2_t b;
+ b = vdup_n_s32(val2);
+ return vqdmull_s32(vec1,b); //slow serial function!!!!
+}
//************* Vector saturating doubling long multiply by scalar ***********************************************
+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
+{
+ int16_t c;
+ int16x4_t scalar;
+ c = vget_lane_s16(val2, val3);
+ scalar = vdup_n_s16(c);
+ return vqdmull_s16(vec1, scalar);
+}
+
+
+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32_t c;
+ int32x2_t scalar;
+ c = vget_lane_s32(val2, val3);
+ scalar = vdup_n_s32(c);
+ return vqdmull_s32(vec1,scalar); //slow serial function!!!!
+}
// *****Vector saturating doubling multiply high with scalar *****
+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
+{
+ int16x4_t res64;
+ return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
+}
+
+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
+{
+ int32x2_t res64;
+ return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
+}
-int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
-{ //solution may be not optimal
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
+{
+ //solution may not be optimal
int16x8_t scalar;
scalar = vdupq_n_s16(val2);
return vqdmulhq_s16(vec1, scalar);
}
-int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
int32x4_t scalar;
@@ -7218,57 +13859,185 @@ _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_
}
//***** Vector saturating doubling multiply high by scalar ****************
+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
+{
+ //solution may not be optimal
+ int16_t vlane;
+ int16x4_t scalar;
+ vlane = vget_lane_s16(val2, val3);
+ scalar = vdup_n_s16(vlane);
+ return vqdmulh_s16(vec1, scalar);
+}
+
+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ int32_t vlane;
+ int32x2_t scalar;
+ vlane = vget_lane_s32(val2, val3);
+ scalar = vdup_n_s32(vlane);
+ return vqdmulh_s32(vec1, scalar);
+}
+
+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
+{
+ //solution may not be optimal
+ int16_t vlane;
+ int16x8_t scalar;
+ vlane = vget_lane_s16(val2, val3);
+ scalar = vdupq_n_s16(vlane );
+ return vqdmulhq_s16(vec1, scalar);
+}
+
+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ //solution may not be optimal
+ int32_t vlane;
+ int32x4_t scalar;
+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
+ scalar = vdupq_n_s32(vlane );
+ return vqdmulhq_s32(vec1, scalar);
+}
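+//Illustrative arithmetic sketch for the doubling multiply high (hypothetical
+//neon2sse_example_* name): the result of vqdmulh is (2 * a * b) >> 16 with
+//saturation, and vqrdmulh below additionally rounds before the shift.
+_NEON2SSE_INLINE int16x4_t neon2sse_example_vqdmulh(void)
+{
+    //(2 * 16384 * 16384) >> 16 = 8192 in every lane;
+    //-32768 * -32768 would saturate to 32767 instead of overflowing
+    return vqdmulh_n_s16(vdup_n_s16(16384), 16384);
+}
+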
//******** Vector saturating rounding doubling multiply high with scalar ***
+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
+{
+ //solution may not be optimal
+ int16x4_t scalar;
+ scalar = vdup_n_s16(val2);
+ return vqrdmulh_s16(vec1, scalar);
+}
+
+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ int32x2_t scalar;
+ scalar = vdup_n_s32(val2);
+ return vqrdmulh_s32(vec1, scalar);
+}
-#if defined(USE_SSSE3)
-int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
-{ //solution may be not optimal
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
+{
+ //solution may not be optimal
int16x8_t scalar;
scalar = vdupq_n_s16(val2);
return vqrdmulhq_s16(vec1, scalar);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
int32x4_t scalar;
scalar = vdupq_n_s32(val2);
return vqrdmulhq_s32(vec1, scalar);
}
-#endif
//********* Vector rounding saturating doubling multiply high by scalar ****
+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
+{
+ //solution may not be optimal
+ int16_t vlane;
+ int16x4_t scalar;
+ vlane = vget_lane_s16(val2, val3);
+ scalar = vdup_n_s16(vlane);
+ return vqrdmulh_s16(vec1, scalar);
+}
+
+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ int32_t vlane;
+ int32x2_t scalar;
+ vlane = vget_lane_s32(val2, val3);
+ scalar = vdup_n_s32(vlane);
+ return vqrdmulh_s32(vec1, scalar);
+}
+
+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
+{
+ //solution may not be optimal
+ int16_t vlane;
+ int16x8_t scalar;
+ vlane = vget_lane_s16(val2, val3);
+ scalar = vdupq_n_s16(vlane);
+ return vqrdmulhq_s16(vec1, scalar);
+}
+
+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+ //solution may not be optimal
+ int32_t vlane;
+ int32x4_t scalar;
+ vlane = vgetq_lane_s32(_pM128i(val2), val3);
+ scalar = vdupq_n_s32(vlane );
+ return vqrdmulhq_s32(vec1, scalar);
+}
//**************Vector multiply accumulate with scalar *******************
+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
+{
+ int16x4_t scalar;
+ scalar = vdup_n_s16(c);
+ return vmla_s16(a, b, scalar);
+}
+
+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
+{
+ int32x2_t scalar;
+ scalar = vdup_n_s32(c);
+ return vmla_s32(a, b, scalar);
+}
+
+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+#define vmla_n_u16 vmla_n_s16
+
-int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+#define vmla_n_u32 vmla_n_s32
+
+
+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
+{
+ float32x2_t scalar;
+ scalar = vdup_n_f32(c);
+ return vmla_f32(a, b, scalar);
+}
+
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
{
int16x8_t scalar;
scalar = vdupq_n_s16(c);
return vmlaq_s16(a,b,scalar);
}
-int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
{
int32x4_t scalar;
scalar = vdupq_n_s32(c);
return vmlaq_s32(a,b,scalar);
}
-uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
#define vmlaq_n_u16 vmlaq_n_s16
-uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
#define vmlaq_n_u32 vmlaq_n_s32
-float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
-_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
{
float32x4_t scalar;
scalar = vdupq_n_f32(c);
@@ -7276,44 +14045,131 @@ _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t
}
//************Vector widening multiply accumulate with scalar****************************
+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
+{
+ int16x4_t vc;
+ vc = vdup_n_s16(c);
+ return vmlal_s16(a, b, vc);
+}
+
+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
+{
+ int32x2_t vc;
+ vc = vdup_n_s32(c);
+ return vmlal_s32(a, b, vc);
+}
+
+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
+{
+ uint16x4_t vc;
+ vc = vdup_n_u16(c);
+ return vmlal_u16(a, b, vc);
+}
+
+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
+{
+ uint32x2_t vc;
+ vc = vdup_n_u32(c);
+ return vmlal_u32(a, b, vc);
+}
//************ Vector widening saturating doubling multiply accumulate with scalar **************
+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+ //not an optimal SIMD solution, a serial one may be faster
+ int16x4_t vc;
+ vc = vdup_n_s16(c);
+ return vqdmlal_s16(a, b, vc);
+}
+
+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32x2_t vc;
+ vc = vdup_n_s32(c);
+ return vqdmlal_s32(a, b, vc);
+}
//******** Vector multiply subtract with scalar **************
+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+ int16x4_t vc;
+ vc = vdup_n_s16(c);
+ return vmls_s16(a, b, vc);
+}
-int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+ int32x2_t vc;
+ vc = vdup_n_s32(c);
+ return vmls_s32(a, b, vc);
+}
+
+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+ uint16x4_t vc;
+ vc = vdup_n_u16(c);
+ return vmls_u16(a, b, vc);
+}
+
+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+ uint32x2_t vc;
+ vc = vdup_n_u32(c);
+ return vmls_u32(a, b, vc);
+}
+
+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
+{
+ float32x2_t res;
+ res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
+ res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
+ return res;
+}
+
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
{
int16x8_t vc;
vc = vdupq_n_s16(c);
return vmlsq_s16(a, b,vc);
}
-int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
{
int32x4_t vc;
vc = vdupq_n_s32(c);
return vmlsq_s32(a,b,vc);
}
-uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
{
 uint16x8_t vc;
 vc = vdupq_n_u16(c);
 return vmlsq_u16(a,b,vc);
}
-uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
{
uint32x4_t vc;
vc = vdupq_n_u32(c);
return vmlsq_u32(a,b,vc);
}
-float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
{
float32x4_t vc;
@@ -7322,156 +14178,353 @@ _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t
}
//**** Vector widening multiply subtract with scalar ******
+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
+{
+ int16x4_t vc;
+ vc = vdup_n_s16(c);
+ return vmlsl_s16(a, b, vc);
+}
+
+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
+{
+ int32x2_t vc;
+ vc = vdup_n_s32(c);
+ return vmlsl_s32(a, b, vc);
+}
+
+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
+{
+ uint16x4_t vc;
+ vc = vdup_n_u16(c);
+ return vmlsl_u16(a, b, vc);
+}
+
+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
+{
+ uint32x2_t vc;
+ vc = vdup_n_u32(c);
+ return vmlsl_u32(a, b, vc);
+}
//***** Vector widening saturating doubling multiply subtract with scalar *********
//**********************************************************************************
+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+ int16x4_t vc;
+ vc = vdup_n_s16(c);
+ return vqdmlsl_s16(a, b, vc);
+}
+
+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32x2_t vc;
+ vc = vdup_n_s32(c);
+ return vqdmlsl_s32(a, b, vc);
+}
//******************* Vector extract ***********************************************
//*************************************************************************************
//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
//vector and the top end of the first, concatenates them, and places the result in the destination vector
//c elements from the bottom end of the second operand and (8-c) from the top end of the first
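// Illustration: a minimal plain-C sketch of the VEXT.8 semantics described above,
// assuming only <stdint.h>; a reference model, not the header's implementation.
#include <stdint.h>
static inline void vext8_model(int8_t res[8], const int8_t a[8], const int8_t b[8], int c /*0..7*/)
{
    int i;
    for (i = 0; i < 8 - c; i++) res[i] = a[i + c];      // top (8-c) elements of a
    for (i = 0; i < c; i++)     res[8 - c + i] = b[i];  // bottom c elements of b
}
// e.g. a = {0..7}, b = {8..15}, c = 3 gives {3,4,5,6,7,8,9,10}; the q-register forms
// below obtain the same result from _MM_ALIGNR_EPI8(b, a, c) with c scaled to bytes.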
+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int8x8_t res;
+ int i;
+ for (i = 0; i<8 - c; i++) {
+ res.m64_i8[i] = a.m64_i8[i + c];
+ }
+ for(i = 0; i<c; i++) {
+ res.m64_i8[8 - c + i] = b.m64_i8[i];
+ }
+ return res;
+}
-#if defined(USE_SSSE3)
+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_u8 vext_s8
//same result tested
-#endif
+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_p8 vext_u8
-#if defined(USE_SSSE3)
-int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int16x4_t res;
+ int i;
+ for (i = 0; i<4 - c; i++) {
+ res.m64_i16[i] = a.m64_i16[i + c];
+ }
+ for(i = 0; i<c; i++) {
+ res.m64_i16[4 - c + i] = b.m64_i16[i];
+ }
+ return res;
+}
+
+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_u16 vext_s16
+
+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_p16 vext_s16
+
+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ int32x2_t res;
+ if (c==0) {
+ res.m64_i32[0] = a.m64_i32[0];
+ res.m64_i32[1] = a.m64_i32[1];
+ } else {
+ res.m64_i32[0] = a.m64_i32[1];
+ res.m64_i32[1] = b.m64_i32[0];
+ }
+ return res;
+}
+
+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+ float32x2_t res;
+ if (c==0) {
+ res.m64_f32[0] = a.m64_f32[0];
+ res.m64_f32[1] = a.m64_f32[1];
+ } else {
+ res.m64_f32[0] = a.m64_f32[1];
+ res.m64_f32[1] = b.m64_f32[0];
+ }
+ return res;
+}
+
+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+#define vext_u32 vext_s32
+
+
+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_s64(a,b,c) a
+
+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_u64(a,b,c) a
+
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
-poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_p8 vextq_s8
-int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
-poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_p16 vextq_s16
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
-int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
+
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-#endif
//************ Reverse vector elements (swap endianness)*****************
//*************************************************************************
//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
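// Illustration: a plain-C sketch of VREV64.8 (reverse the 8-bit lanes inside each
// 64-bit group), assuming only <stdint.h>; the pshufb masks below encode the same mapping.
#include <stdint.h>
static inline void vrev64_8_model(int8_t res[16], const int8_t v[16])
{
    int g, i;
    for (g = 0; g < 16; g += 8)              // two 64-bit groups per q register
        for (i = 0; i < 8; i++)
            res[g + i] = v[g + 7 - i];       // reverse within the group
}
// e.g. {0,1,...,15} becomes {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8}, matching the
// mask_rev_e8 index table handed to _mm_shuffle_epi8 in vrev64q_s8 below.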
+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vrev64q_s8(_pM128i(vec));
+ return64(res);
+}
-#if defined(USE_SSSE3)
-int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vrev64q_s16(_pM128i(vec));
+ return64(res);
+}
+
+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
+{
+ int32x2_t res;
+ res.m64_i32[0] = vec.m64_i32[1];
+ res.m64_i32[1] = vec.m64_i32[0];
+ return res;
+}
+
+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_u8 vrev64_s8
+
+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_u16 vrev64_s16
+
+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+#define vrev64_u32 vrev64_s32
+
+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_p8 vrev64_u8
+
+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_p16 vrev64_u16
+
+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
+{
+ float32x2_t res;
+ res.m64_f32[0] = vec.m64_f32[1];
+ res.m64_f32[1] = vec.m64_f32[0];
+ return res;
+}
+
+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
{
_NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
-_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
-{ //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
+{
+ //no _mm_shuffle_epi16 available, so _mm_shuffle_epi8 is used with the corresponding mask
_NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
}
-#endif
-int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
-_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
{
return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
}
-#if defined(USE_SSSE3)
-uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
#define vrev64q_u8 vrev64q_s8
-uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
#define vrev64q_u16 vrev64q_s16
-#endif
-uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
#define vrev64q_u32 vrev64q_s32
-#if defined(USE_SSSE3)
-poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
#define vrev64q_p8 vrev64q_u8
-poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
-#define vrev64q_p16 vrev64q_s16
-#endif
+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+#define vrev64q_p16 vrev64q_u16
-float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
//******************** 32 bit shuffles **********************
//************************************************************
+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vrev32q_s8(_pM128i(vec));
+ return64(res);
+}
+
+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vrev32q_s16(_pM128i(vec));
+ return64(res);
+}
-#if defined(USE_SSSE3)
-int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_u8 vrev32_s8
+
+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_u16 vrev32_s16
+
+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_p8 vrev32_u8
+
+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_p16 vrev32_u16
+
+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
{
_NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
-_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
{
_NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}
-#endif
-#if defined(USE_SSSE3)
-uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
#define vrev32q_u8 vrev32q_s8
-uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
#define vrev32q_u16 vrev32q_s16
-poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
#define vrev32q_p8 vrev32q_u8
-#endif
+
+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+#define vrev32q_p16 vrev32q_u16
//************* 16 bit shuffles **********************
//******************************************************
+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vrev16q_s8(_pM128i(vec));
+ return64(res);
+}
+
+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_u8 vrev16_s8
-#if defined(USE_SSSE3)
-int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_p8 vrev16_u8
+
+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
{
_NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
}
-#endif
-#if defined(USE_SSSE3)
-uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
#define vrev16q_u8 vrev16q_s8
-poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
#define vrev16q_p8 vrev16q_u8
-#endif
//*********************************************************************
//**************** Other single operand arithmetic *******************
@@ -7479,18 +14532,56 @@ poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
//*********** Absolute: Vd[i] = |Va[i]| **********************************
//************************************************************************
+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = _mm_abs_epi8(_pM128i(a));
+ return64(res);
+}
+
+
+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = _mm_abs_epi16(_pM128i(a));
+ return64(res);
+}
-int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = _mm_abs_epi32(_pM128i(a));
+ return64(res);
+}
+
+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
+{
+ float32x4_t res;
+ __m64_128 res64;
+ _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+ res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
+ _M64f(res64, res);
+ return res64;
+}
+
+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
#define vabsq_s8 _mm_abs_epi8
-int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
#define vabsq_s16 _mm_abs_epi16
-int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
#define vabsq_s32 _mm_abs_epi32
-float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
-_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
{
_NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
return _mm_and_ps (a, *(__m128*)c7fffffff);
@@ -7499,74 +14590,131 @@ _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
//**********************************************************************
//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
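// Illustration: a scalar sketch of the saturating absolute value, assuming only
// <stdint.h>; it models the behaviour that the abs/cmpeq/xor sequence below relies on.
#include <stdint.h>
static inline int8_t qabs8_model(int8_t x)
{
    return (x == INT8_MIN) ? INT8_MAX : (int8_t)(x < 0 ? -x : x);
}
// _mm_abs_epi8(-128) wraps back to 0x80; comparing that result with 0x80 yields an
// all-ones mask in exactly those lanes, and the XOR then flips 0x80 into 0x7f (=127)
// while leaving every other lane unchanged.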
+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vqabsq_s8(_pM128i(a));
+ return64(res);
+}
+
+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vqabsq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vqabsq_s32(_pM128i(a));
+ return64(res);
+}
-#if defined(USE_SSSE3)
-int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
-_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
{
__m128i c_128, abs, abs_cmp;
- c_128 = _mm_set1_epi8 (0x80); //-128
+ c_128 = _mm_set1_epi8 (0x80); //-128
abs = _mm_abs_epi8 (a);
abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
return _mm_xor_si128 (abs, abs_cmp);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
-_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
{
__m128i c_32768, abs, abs_cmp;
- c_32768 = _mm_set1_epi16 (0x8000); //-32768
+ c_32768 = _mm_set1_epi16 (0x8000); //-32768
abs = _mm_abs_epi16 (a);
abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
return _mm_xor_si128 (abs, abs_cmp);
}
-#endif
-#if defined(USE_SSSE3)
-int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
-_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
{
__m128i c80000000, abs, abs_cmp;
- c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
+ c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
abs = _mm_abs_epi32 (a);
abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
return _mm_xor_si128 (abs, abs_cmp);
}
-#endif
//*************** Negate: Vd[i] = - Va[i] *************************************
//*****************************************************************************
//several Negate implementations possible for SIMD.
//e.g. the _mm_sign function (a, negative numbers vector), but the following one gives good performance:
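// A minimal sketch of the _mm_sign alternative mentioned above (SSSE3): _mm_sign_epi8
// negates each lane of a wherever the second operand is negative, so a constant -1
// vector turns it into plain negation. The header keeps the subtract-from-zero form below.
#include <tmmintrin.h>
static inline __m128i negq_s8_sign_sketch(__m128i a)
{
    return _mm_sign_epi8(a, _mm_set1_epi8(-1)); // negate every 8-bit lane
}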
+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vnegq_s8(_pM128i(a));
+ return64(res);
+}
+
+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vnegq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vnegq_s32(_pM128i(a));
+ return64(res);
+}
-int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
-_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNE//d0,d0
+{
+ float32x4_t res;
+ __m64_128 res64;
+ _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+ res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
+ _M64f(res64, res);
+ return res64;
+}
+
+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
{
__m128i zero;
zero = _mm_setzero_si128 ();
return _mm_sub_epi8 (zero, a);
-} //or _mm_sign_epi8 (a, negative numbers vector)
+} //or _mm_sign_epi8 (a, negative numbers vector)
-int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
-_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
{
__m128i zero;
zero = _mm_setzero_si128 ();
return _mm_sub_epi16 (zero, a);
-} //or _mm_sign_epi16 (a, negative numbers vector)
+} //or _mm_sign_epi16 (a, negative numbers vector)
-int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
-_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
{
__m128i zero;
zero = _mm_setzero_si128 ();
return _mm_sub_epi32 (zero, a);
-} //or _mm_sign_epi32 (a, negative numbers vector)
+} //or _mm_sign_epi32 (a, negative numbers vector)
-float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
-_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
{
_NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
return _mm_xor_ps (a, *(__m128*) c80000000);
@@ -7575,30 +14723,57 @@ _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
//***************************************************************************************
//For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
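// Illustration: a scalar sketch of saturating negation, assuming only <stdint.h>;
// it mirrors the sub/cmpeq/xor sequence used by the q forms below.
#include <stdint.h>
static inline int32_t qneg32_model(int32_t x)
{
    return (x == INT32_MIN) ? INT32_MAX : -x;
}
// In the SSE version 0 - INT32_MIN wraps back to 0x80000000; the compare with
// 0x80000000 flags exactly those lanes, and the XOR with the resulting mask turns
// 0x80000000 into 0x7fffffff while leaving the other lanes untouched.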
+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vqnegq_s8(_pM128i(a));
+ return64(res);
+}
+
+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vqnegq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vqnegq_s32(_pM128i(a));
+ return64(res);
+}
-int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
-_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
{
__m128i zero;
zero = _mm_setzero_si128 ();
- return _mm_subs_epi8 (zero, a); //saturating substraction
+ return _mm_subs_epi8 (zero, a); //saturating subtraction
}
-int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
-_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
{
__m128i zero;
zero = _mm_setzero_si128 ();
- return _mm_subs_epi16 (zero, a); //saturating substraction
+ return _mm_subs_epi16 (zero, a); //saturating subtraction
}
-int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
-_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
-{ //solution may be not optimal compared with a serial
+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
+{
+ //this solution may not be optimal compared with a serial one
__m128i c80000000, zero, sub, cmp;
- c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
+ c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
zero = _mm_setzero_si128 ();
- sub = _mm_sub_epi32 (zero, a); //substraction
+ sub = _mm_sub_epi32 (zero, a); //subtraction
cmp = _mm_cmpeq_epi32 (a, c80000000);
return _mm_xor_si128 (sub, cmp);
}
@@ -7606,47 +14781,79 @@ _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
//****************** Count leading zeros ********************************
//**************************************************************************
//no corresponding vector intrinsics in IA32, need to implement it. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
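// Illustration: a scalar sketch of the per-byte nibble-table CLZ used by vclzq_s8
// below, assuming only <stdint.h>. The table holds the leading-zero count of a 4-bit
// value; when the high nibble is zero (table value 4) the low-nibble count is added.
#include <stdint.h>
static inline int clz8_model(uint8_t x)
{
    static const int8_t tbl[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
    int hi = tbl[x >> 4];
    return (hi == 4) ? 4 + tbl[x & 0x0f] : hi;
}
// clz8_model(0x00) = 8, clz8_model(0x01) = 7, clz8_model(0x1f) = 3, clz8_model(0x80) = 0.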
+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vclzq_s8(_pM128i(a));
+ return64(res);
+}
+
+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vclzq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vclzq_s32(_pM128i(a));
+ return64(res);
+}
+
+
+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+#define vclz_u8 vclz_s8
-#if defined(USE_SSSE3)
-int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+#define vclz_u16 vclz_s16
+
+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+#define vclz_u32 vclz_s32
+
+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
{
_NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
- /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
- /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
- /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0};
+ /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
+ /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
+ /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
__m128i maskLOW, c4, lowclz, mask, hiclz;
- maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
c4 = _mm_set1_epi8(4);
- lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
- mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
- mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
- hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
- mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
+ lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+ hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
+ mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
lowclz = _mm_and_si128(lowclz,mask);
return _mm_add_epi8(lowclz, hiclz);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
{
__m128i c7, res8x16, res8x16_swap;
_NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
_NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
- c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
+ c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
res8x16 = vclzq_s8(a);
- res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horisontal pairs swap
- res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
- res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
- c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
- res8x16 = _mm_and_si128(res8x16, c7); //lowclz
+ res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
+ res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
+ res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
+ c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
+ res8x16 = _mm_and_si128(res8x16, c7); //lowclz
return _mm_add_epi16(res8x16_swap, res8x16);
}
-#endif
-int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
{
__m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
@@ -7656,49 +14863,47 @@ _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
c3f = _mm_set1_epi32(0x3f);
c32 = _mm_set1_epi32(32);
tmp = _mm_srli_epi32(a, 1);
- res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
+ res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
tmp = _mm_srli_epi32(res, 2);
- res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
tmp = _mm_srli_epi32(res, 4);
- res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
tmp = _mm_srli_epi32(res, 8);
- res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
tmp = _mm_srli_epi32(res, 16);
- res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
+ res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
tmp = _mm_srli_epi32(res, 1);
tmp = _mm_and_si128(tmp, c55555555);
- res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
+ res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
tmp = _mm_srli_epi32(res, 2);
tmp = _mm_and_si128(tmp, c33333333);
tmp1 = _mm_and_si128(res, c33333333);
- res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
+ res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
tmp = _mm_srli_epi32(res, 4);
tmp = _mm_add_epi32(tmp, res);
- res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
+ res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
tmp = _mm_srli_epi32(res, 8);
- res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
tmp = _mm_srli_epi32(res, 16);
- res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
+ res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
- res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
+ res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
- return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
+ return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
}
-#if defined(USE_SSSE3)
-uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
#define vclzq_u8 vclzq_s8
-uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
#define vclzq_u16 vclzq_s16
-#endif
-uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
#define vclzq_u32 vclzq_s32
//************** Count leading sign bits **************************
@@ -7707,17 +14912,42 @@ uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
// the topmost bit, that are the same as the topmost bit, in each element in a vector
//No corresponding vector intrinsics in IA32, need to implement it.
//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
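// Illustration: a scalar sketch of "count leading sign bits", assuming only <stdint.h>;
// like the vector code below it flips negative inputs, counts leading zeros and
// subtracts one for the sign bit itself.
#include <stdint.h>
static inline int cls8_model(int8_t x)
{
    uint8_t u = (x < 0) ? (uint8_t)~x : (uint8_t)x; // make the sign-extension bits zero
    uint8_t m;
    int n = 0;
    for (m = 0x80; m && !(u & m); m >>= 1) n++;     // leading-zero count of u
    return n - 1;                                   // exclude the sign bit
}
// e.g. cls8_model(0) = 7, cls8_model(-1) = 7, cls8_model(1) = 6, cls8_model(-2) = 6.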
+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vclsq_s8(_pM128i(a));
+ return64(res);
+}
-#if defined(USE_SSSE3)
-int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vclsq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vclsq_s32(_pM128i(a));
+ return64(res);
+}
+
+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
{
__m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
- cff = _mm_cmpeq_epi8 (a,a); //0xff
+ cff = _mm_cmpeq_epi8 (a,a); //0xff
c80 = _mm_set1_epi8(0x80);
c1 = _mm_set1_epi8(1);
a_mask = _mm_and_si128(a, c80);
- a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
+ a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
a_neg = _mm_xor_si128(a, cff);
a_neg = _mm_and_si128(a_mask, a_neg);
a_pos = _mm_andnot_si128(a_mask, a);
@@ -7725,18 +14955,16 @@ _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
a_comb = vclzq_s8(a_comb);
return _mm_sub_epi8(a_comb, c1);
}
-#endif
-#if defined(USE_SSSE3)
-int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
{
__m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
cffff = _mm_cmpeq_epi16(a,a);
- c8000 = _mm_slli_epi16(cffff, 15); //0x8000
- c1 = _mm_srli_epi16(cffff,15); //0x1
+ c8000 = _mm_slli_epi16(cffff, 15); //0x8000
+ c1 = _mm_srli_epi16(cffff,15); //0x1
a_mask = _mm_and_si128(a, c8000);
- a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
+ a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
a_neg = _mm_xor_si128(a, cffff);
a_neg = _mm_and_si128(a_mask, a_neg);
a_pos = _mm_andnot_si128(a_mask, a);
@@ -7744,17 +14972,16 @@ _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
a_comb = vclzq_s16(a_comb);
return _mm_sub_epi16(a_comb, c1);
}
-#endif
-int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
{
__m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
cffffffff = _mm_cmpeq_epi32(a,a);
- c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
- c1 = _mm_srli_epi32(cffffffff,31); //0x1
+ c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
+ c1 = _mm_srli_epi32(cffffffff,31); //0x1
a_mask = _mm_and_si128(a, c80000000);
- a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
+ a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
a_neg = _mm_xor_si128(a, cffffffff);
a_neg = _mm_and_si128(a_mask, a_neg);
a_pos = _mm_andnot_si128(a_mask, a);
@@ -7768,261 +14995,488 @@ _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
//No corresponding SIMD solution. One option is to extract the elements, widen each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) on each element
//another option is to do the following algorithm:
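// Illustration: a scalar sketch of that nibble-table algorithm, assuming only
// <stdint.h>; vcntq_u8 below performs the same two lookups with _mm_shuffle_epi8.
#include <stdint.h>
static inline int popcnt8_model(uint8_t x)
{
    static const uint8_t tbl[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
    return tbl[x & 0x0f] + tbl[x >> 4]; // low-nibble bits + high-nibble bits
}
// e.g. popcnt8_model(0xff) = 8, popcnt8_model(0xa5) = 4.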
-#if defined(USE_SSSE3)
-uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
+{
+ uint8x8_t res64;
+ __m128i res;
+ res = vcntq_u8(_pM128i(a));
+ return64(res);
+}
+
+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+#define vcnt_s8 vcnt_u8
+
+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+#define vcnt_p8 vcnt_u8
+
+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
{
_NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
- /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
- /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
- /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
+ /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
+ /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
+ /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 };
__m128i maskLOW, mask, lowpopcnt, hipopcnt;
- maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
+ maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
mask = _mm_and_si128(a, maskLOW);
- lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
- mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
- mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
- hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
+ lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
+ mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+ mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+ hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
return _mm_add_epi8(lowpopcnt, hipopcnt);
}
-#endif
-#if defined(USE_SSSE3)
-int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
#define vcntq_s8 vcntq_u8
-poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
#define vcntq_p8 vcntq_u8
-#endif
//**************************************************************************************
//*********************** Logical operations ****************************************
//**************************************************************************************
//************************** Bitwise not ***********************************
//several Bitwise not implementations are possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance
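// A minimal sketch of the "xor with all ones" variant mentioned above; the header's
// own definitions below use _mm_andnot_si128 with an all-ones mask instead.
#include <emmintrin.h>
static inline __m128i mvnq_xor_sketch(__m128i a)
{
    __m128i ones = _mm_cmpeq_epi8(a, a); // 0xff in every byte
    return _mm_xor_si128(a, ones);       // bitwise NOT
}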
+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vmvnq_s8(_pM128i(a));
+ return64(res);
+}
-int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
-_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
+{
+ int16x4_t res64;
+ __m128i res;
+ res = vmvnq_s16(_pM128i(a));
+ return64(res);
+}
+
+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
+{
+ int32x2_t res64;
+ __m128i res;
+ res = vmvnq_s32(_pM128i(a));
+ return64(res);
+}
+
+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+#define vmvn_u8 vmvn_s8
+
+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+#define vmvn_u16 vmvn_s16
+
+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+#define vmvn_u32 vmvn_s32
+
+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+#define vmvn_p8 vmvn_u8
+
+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
{
__m128i c1;
- c1 = _mm_cmpeq_epi8 (a,a); //0xff
+ c1 = _mm_cmpeq_epi8 (a,a); //0xff
return _mm_andnot_si128 (a, c1);
}
-int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
-_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
{
__m128i c1;
- c1 = _mm_cmpeq_epi16 (a,a); //0xffff
+ c1 = _mm_cmpeq_epi16 (a,a); //0xffff
return _mm_andnot_si128 (a, c1);
}
-int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
-_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
{
__m128i c1;
- c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+ c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
return _mm_andnot_si128 (a, c1);
}
-uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
#define vmvnq_u8 vmvnq_s8
-uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
#define vmvnq_u16 vmvnq_s16
-uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
#define vmvnq_u32 vmvnq_s32
-poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
#define vmvnq_p8 vmvnq_u8
//****************** Bitwise and ***********************
//******************************************************
+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
+ return res;
+}
+
+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+#define vand_u8 vand_s8
+
+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+#define vand_u16 vand_s16
-int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+#define vand_u32 vand_s32
+
+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
+#define vand_u64 vand_s64
+
+
+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
#define vandq_s8 _mm_and_si128
-int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
#define vandq_s16 _mm_and_si128
-int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
#define vandq_s32 _mm_and_si128
-int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
#define vandq_s64 _mm_and_si128
-uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
#define vandq_u8 _mm_and_si128
-uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
#define vandq_u16 _mm_and_si128
-uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
#define vandq_u32 _mm_and_si128
-uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
#define vandq_u64 _mm_and_si128
//******************** Bitwise or *********************************
//******************************************************************
+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
-int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
+ return res;
+}
+
+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+#define vorr_u8 vorr_s8
+
+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+#define vorr_u16 vorr_s16
+
+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+#define vorr_u32 vorr_s32
+
+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
+#define vorr_u64 vorr_s64
+
+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
#define vorrq_s8 _mm_or_si128
-int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
#define vorrq_s16 _mm_or_si128
-int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
#define vorrq_s32 _mm_or_si128
-int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
#define vorrq_s64 _mm_or_si128
-uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
#define vorrq_u8 _mm_or_si128
-uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
#define vorrq_u16 _mm_or_si128
-uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
#define vorrq_u32 _mm_or_si128
-uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
#define vorrq_u64 _mm_or_si128
//************* Bitwise exclusive or (EOR or XOR) ******************
//*******************************************************************
+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
+}
+
+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+#define veor_s16 veor_s8
+
+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+#define veor_s32 veor_s8
+
+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
+ return res;
+}
+
+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+#define veor_u8 veor_s8
+
+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+#define veor_u16 veor_s16
-int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+#define veor_u32 veor_s32
+
+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
+#define veor_u64 veor_s64
+
+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
#define veorq_s8 _mm_xor_si128
-int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
#define veorq_s16 _mm_xor_si128
-int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
#define veorq_s32 _mm_xor_si128
-int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
#define veorq_s64 _mm_xor_si128
-uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
#define veorq_u8 _mm_xor_si128
-uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
#define veorq_u16 _mm_xor_si128
-uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
#define veorq_u32 _mm_xor_si128
-uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
#define veorq_u64 _mm_xor_si128
//********************** Bit Clear **********************************
//*******************************************************************
//Logical AND complement (AND negation or AND NOT)
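// A minimal sketch of why the arguments are "swapped" in the definitions below:
// NEON VBIC computes a & ~b, while _mm_andnot_si128(x, y) computes (~x) & y,
// so the NEON operand order (a, b) becomes the SSE order (b, a).
#include <emmintrin.h>
static inline __m128i bicq_sketch(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b
}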
+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
+}
-//notice arguments "swap"
-
-//notice arguments "swap"
+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+#define vbic_s16 vbic_s8
-//notice arguments "swap"
+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+#define vbic_s32 vbic_s8
-//notice arguments "swap"
+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
+ return res;
+}
-//notice arguments "swap"
+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+#define vbic_u8 vbic_s8
-//notice arguments "swap"
+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+#define vbic_u16 vbic_s16
-//notice arguments "swap"
+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+#define vbic_u32 vbic_s32
-//notice arguments "swap"
+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+#define vbic_u64 vbic_s64
-int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
-#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
-#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
-#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
-#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
-#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
-#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
-#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
-uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
-#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
//**************** Bitwise OR complement ********************************
//*************************************************************************
//no exact IA 32 match, need to implement it as follows
+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
+{
+ int8x8_t res64;
+ return64(vornq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
+{
+ int16x4_t res64;
+ return64(vornq_s16(_pM128i(a), _pM128i(b)));
+}
-int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
+
+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2_t res64;
+ return64(vornq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
+{
+ int64x1_t res;
+ res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
+ return res;
+}
+
+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
+#define vorn_u8 vorn_s8
+
+
+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
+#define vorn_u16 vorn_s16
+
+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
+#define vorn_u32 vorn_s32
+
+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+#define vorn_u64 vorn_s64
+
+
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_s8( b); //bitwise not for b
+ b1 = vmvnq_s8( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_s16( b); //bitwise not for b
+ b1 = vmvnq_s16( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_s32( b); //bitwise not for b
+ b1 = vmvnq_s32( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
{
__m128i c1, b1;
- c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
+ c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
b1 = _mm_andnot_si128 (b, c1);
return _mm_or_si128 (a, b1);
}
-uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_u8( b); //bitwise not for b
+ b1 = vmvnq_u8( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_s16( b); //bitwise not for b
+ b1 = vmvnq_s16( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
{
__m128i b1;
- b1 = vmvnq_u32( b); //bitwise not for b
+ b1 = vmvnq_u32( b); //bitwise not for b
return _mm_or_si128 (a, b1);
}
-uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
#define vornq_u64 vornq_s64
//********************* Bitwise Select *****************************
@@ -8039,9 +15493,61 @@ uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
//VBSL only is implemented for SIMD
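#include <stdint.h>

/* A minimal scalar sketch of the bitwise-select operation that the vbsl*
   intrinsics below emulate (illustration only, not part of the header):
   each result bit is taken from B where the corresponding MASK bit is 1
   and from C where it is 0, exactly as vbsl_s64 spells out element-wise.  */
static uint64_t
bsl_model (uint64_t mask, uint64_t b, uint64_t c)
{
  return (mask & b) | (~mask & c);
}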
+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
+{
+ int8x8_t res64;
+ __m128i res;
+ res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
+ return64(res);
+}
-int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+#define vbsl_s16 vbsl_s8
+
+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+#define vbsl_s32 vbsl_s8
+
+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
+{
+ int64x1_t res;
+ res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
+ return res;
+}
+
+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+#define vbsl_u8 vbsl_s8
+
+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+#define vbsl_u16 vbsl_s8
+
+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+#define vbsl_u32 vbsl_s8
+
+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+#define vbsl_u64 vbsl_s64
+
+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
+{
+ __m128 sel1, sel2;
+ __m64_128 res64;
+ sel1 = _mm_and_ps (_pM128(a), _pM128(b));
+ sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
+ sel1 = _mm_or_ps (sel1, sel2);
+ _M64f(res64, sel1);
+ return res64;
+}
+
+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+#define vbsl_p8 vbsl_s8
+
+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+#define vbsl_p16 vbsl_s8
+
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
{
__m128i sel1, sel2;
sel1 = _mm_and_si128 (a, b);
@@ -8049,29 +15555,29 @@ _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c)
return _mm_or_si128 (sel1, sel2);
}
-int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
#define vbslq_s16 vbslq_s8
-int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
#define vbslq_s32 vbslq_s8
-int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
#define vbslq_s64 vbslq_s8
-uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
#define vbslq_u8 vbslq_s8
-uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
#define vbslq_u16 vbslq_s8
-uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
#define vbslq_u32 vbslq_s8
-uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
#define vbslq_u64 vbslq_s8
-float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
{
__m128 sel1, sel2;
sel1 = _mm_and_ps (*(__m128*)&a, b);
@@ -8079,10 +15585,10 @@ _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t
return _mm_or_ps (sel1, sel2);
}
-poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
#define vbslq_p8 vbslq_u8
-poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
#define vbslq_p16 vbslq_s8
//************************************************************************************
@@ -8092,89 +15598,182 @@ poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL
//************************************************************************************
//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
// making the results look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
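#include <stdint.h>

/* Scalar sketch of the 2 x 2 transpose described above for two 8-lane
   vectors (hypothetical helper, illustration only): even-indexed lanes pair
   up as (a0,b0,a2,b2,a4,b4,a6,b6), odd-indexed lanes as (a1,b1,a3,b3,a5,b5,a7,b7).  */
static void
trn8_model (const int8_t a[8], const int8_t b[8], int8_t r0[8], int8_t r1[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      r0[2 * i] = a[2 * i];
      r0[2 * i + 1] = b[2 * i];
      r1[2 * i] = a[2 * i + 1];
      r1[2 * i + 1] = b[2 * i + 1];
    }
}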
+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
+{
+ int8x8x2_t val;
+ __m128i tmp, val0;
+ _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
+ vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
+ return val;
+}
+
+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
+{
+ int16x4x2_t val;
+ __m128i tmp, val0;
+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
+ return val;
+}
+
+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
+{
+ int32x2x2_t val;
+ __m128i val0;
+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
+ return val;
+}
+
+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+#define vtrn_u8 vtrn_s8
+
+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+#define vtrn_u16 vtrn_s16
+
+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+#define vtrn_u32 vtrn_s32
+
+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
+{
+ float32x2x2_t val;
+ val.val[0].m64_f32[0] = a.m64_f32[0];
+ val.val[0].m64_f32[1] = b.m64_f32[0];
+ val.val[1].m64_f32[0] = a.m64_f32[1];
+ val.val[1].m64_f32[1] = b.m64_f32[1];
+ return val; //a0,b0,a1,b1
+}
+
+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+#define vtrn_p8 vtrn_u8
+
+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+#define vtrn_p16 vtrn_s16
-#if defined(USE_SSSE3)
//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
{
int8x16x2_t r8x16;
__m128i a_sh, b_sh;
_NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
- a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
- b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
- r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
- r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
+ r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
+ r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
return r8x16;
}
-#endif
-#if defined(USE_SSSE3)
-int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
{
int16x8x2_t v16x8;
__m128i a_sh, b_sh;
_NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
- a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
- b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
- v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
- v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
+ v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
+ v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
return v16x8;
}
-#endif
-int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
-{ //may be not optimal solution compared with serial
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
+{
+ //may not be optimal compared with the serial solution
int32x4x2_t v32x4;
__m128i a_sh, b_sh;
- a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
- b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
- v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
- v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
+ v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
+ v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
return v32x4;
}
-#if defined(USE_SSSE3)
-uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
#define vtrnq_u8 vtrnq_s8
-uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
#define vtrnq_u16 vtrnq_s16
-#endif
-uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
#define vtrnq_u32 vtrnq_s32
-float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
-{ //may be not optimal solution compared with serial
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
+{
+ //may not be optimal compared with the serial solution
float32x4x2_t f32x4;
__m128 a_sh, b_sh;
- a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness
- b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness
+ a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
+ b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
- f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
- f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
+ f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
+ f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
return f32x4;
}
-#if defined(USE_SSSE3)
-poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
#define vtrnq_p8 vtrnq_s8
-poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
#define vtrnq_p16 vtrnq_s16
-#endif
//***************** Interleave elements ***************************
//*****************************************************************
//output has (a0,b0,a1,b1, a2,b2,.....)
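#include <stdint.h>

/* Scalar sketch of the interleave (VZIP) layout described above
   (hypothetical helper, illustration only): the two results together hold
   (a0,b0,a1,b1,a2,b2,...), split across r0 and r1.  */
static void
zip8_model (const int8_t a[8], const int8_t b[8], int8_t r0[8], int8_t r1[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      r0[2 * i] = a[i];
      r0[2 * i + 1] = b[i];
      r1[2 * i] = a[i + 4];
      r1[2 * i + 1] = b[i + 4];
    }
}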
+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
+{
+ int8x8x2_t val;
+ __m128i val0;
+ val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
+ vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+ return val;
+}
-int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
+{
+ int16x4x2_t val;
+ __m128i val0;
+ val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+ return val;
+}
+
+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+#define vzip_s32 vtrn_s32
+
+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+#define vzip_u8 vzip_s8
+
+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+#define vzip_u16 vzip_s16
+
+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+#define vzip_u32 vzip_s32
+
+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+#define vzip_f32 vtrn_f32
+
+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+#define vzip_p8 vzip_u8
+
+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+#define vzip_p16 vzip_u16
+
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
{
int8x16x2_t r8x16;
r8x16.val[0] = _mm_unpacklo_epi8(a, b);
@@ -8182,8 +15781,8 @@ _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.
return r8x16;
}
-int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
{
int16x8x2_t r16x8;
r16x8.val[0] = _mm_unpacklo_epi16(a, b);
@@ -8191,8 +15790,8 @@ _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP
return r16x8;
}
-int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
{
int32x4x2_t r32x4;
r32x4.val[0] = _mm_unpacklo_epi32(a, b);
@@ -8200,17 +15799,17 @@ _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP
return r32x4;
}
-uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
#define vzipq_u8 vzipq_s8
-uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
#define vzipq_u16 vzipq_s16
-uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
#define vzipq_u32 vzipq_s32
-float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
{
float32x4x2_t f32x4;
f32x4.val[0] = _mm_unpacklo_ps ( a, b);
@@ -8218,93 +15817,166 @@ _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) /
return f32x4;
}
-poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
#define vzipq_p8 vzipq_u8
-poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
#define vzipq_p16 vzipq_u16
//*********************** De-Interleave elements *************************
//*************************************************************************
//As the result of these functions, the first val contains (a0,a2,a4,...,b0,b2,b4,...) and the second val (a1,a3,a5,...,b1,b3,b5,...)
//no such functions in IA32 SIMD, shuffle is required
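#include <stdint.h>

/* Scalar sketch of the de-interleave (VUZP) layout described above
   (hypothetical helper, illustration only): the first result gathers the
   even-indexed lanes of a then b, the second result the odd-indexed lanes.  */
static void
uzp8_model (const int8_t a[8], const int8_t b[8], int8_t r0[8], int8_t r1[8])
{
  int i;
  for (i = 0; i < 4; i++)
    {
      r0[i] = a[2 * i];
      r0[i + 4] = b[2 * i];
      r1[i] = a[2 * i + 1];
      r1[i + 4] = b[2 * i + 1];
    }
}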
+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
+{
+ int8x8x2_t val;
+ __m128i tmp, val0;
+ _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
+ tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
+ vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+ return val;
+}
+
+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
+{
+ int16x4x2_t val;
+ __m128i tmp, val0;
+ _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
+ tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+ val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
+ vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+ return val;
+}
-#if defined(USE_SSSE3)
-int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
+{
+ int32x2x2_t val;
+ __m128i val0;
+ val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
+ vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+ return val;
+}
+
+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+#define vuzp_u8 vuzp_s8
+
+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+#define vuzp_u16 vuzp_s16
+
+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+#define vuzp_u32 vuzp_s32
+
+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+#define vuzp_f32 vzip_f32
+
+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+#define vuzp_p8 vuzp_u8
+
+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+#define vuzp_p16 vuzp_u16
+
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
{
int8x16x2_t v8x16;
__m128i a_sh, b_sh;
_NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
- a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
- b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
//we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
- v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
- v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
+ v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
+ v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
return v8x16;
}
-#endif
-#if defined(USE_SSSE3)
-int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
{
int16x8x2_t v16x8;
__m128i a_sh, b_sh;
_NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
- a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
- b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
- v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
- v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
+ a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
+ b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
+ v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
+ v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
return v16x8;
}
-#endif
-int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
-{ //may be not optimal solution compared with serial
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
+{
+ //may not be optimal compared with the serial solution
int32x4x2_t v32x4;
__m128i a_sh, b_sh;
- a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
- b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
+ a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+ b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
- v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
- v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
+ v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
+ v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
return v32x4;
}
-#if defined(USE_SSSE3)
-uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
#define vuzpq_u8 vuzpq_s8
-uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
#define vuzpq_u16 vuzpq_s16
-#endif
-uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
#define vuzpq_u32 vuzpq_s32
-float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
{
float32x4x2_t v32x4;
- v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however
- v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however
+ v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
+ v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
return v32x4;
}
-#if defined(USE_SSSE3)
-poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
#define vuzpq_p8 vuzpq_u8
-poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
#define vuzpq_p16 vuzpq_u16
-#endif
//##############################################################################################
//*********************** Reinterpret cast intrinsics.******************************************
//##############################################################################################
// Not a part of the official NEON instruction set but available in gcc compiler *********************
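#include <stdint.h>
#include <string.h>

/* Hypothetical scalar illustration of what a reinterpret cast amounts to
   (not part of the header): the bit pattern is kept unchanged and only the
   nominal type changes, which is why many of the d-register casts below can
   expand to nothing and simply leave their argument expression behind.  */
static uint32_t
reinterpret_u32_bits (float f)
{
  uint32_t u;
  memcpy (&u, &f, sizeof u);  /* bit-for-bit copy, no value conversion */
  return u;
}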
+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
+#define vreinterpret_p8_u32
+
+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
+#define vreinterpret_p8_u16
+
+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
+#define vreinterpret_p8_u8
+
+poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
+#define vreinterpret_p8_s32
+
+poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
+#define vreinterpret_p8_s16
+
+poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
+#define vreinterpret_p8_s8
+
+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
+#define vreinterpret_p8_u64
+
+poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
+#define vreinterpret_p8_s64
+
+poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
+#define vreinterpret_p8_f32
+
+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
+#define vreinterpret_p8_p16
poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
#define vreinterpretq_p8_u32
@@ -8336,6 +16008,36 @@ poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
#define vreinterpretq_p8_p16
+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
+#define vreinterpret_p16_u32
+
+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
+#define vreinterpret_p16_u16
+
+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
+#define vreinterpret_p16_u8
+
+poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
+#define vreinterpret_p16_s32
+
+poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
+#define vreinterpret_p16_s16
+
+poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
+#define vreinterpret_p16_s8
+
+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
+#define vreinterpret_p16_u64
+
+poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
+#define vreinterpret_p16_s64
+
+poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
+#define vreinterpret_p16_f32
+
+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
+#define vreinterpret_p16_p8
+
poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
#define vreinterpretq_p16_u32
@@ -8364,6 +16066,42 @@ poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
//**** Integer to float ******
+float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
+
+
+float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
+#define vreinterpret_f32_u16 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
+#define vreinterpret_f32_u8 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s32 (int32x2_t t);
+#define vreinterpret_f32_s32 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s16 (int16x4_t t);
+#define vreinterpret_f32_s16 vreinterpret_f32_u32
+
+float32x2_t vreinterpret_f32_s8 (int8x8_t t);
+#define vreinterpret_f32_s8 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_u64(uint64x1_t t);
+#define vreinterpret_f32_u64 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s64 (int64x1_t t);
+#define vreinterpret_f32_s64 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
+#define vreinterpret_f32_p16 vreinterpret_f32_u32
+
+float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
+#define vreinterpret_f32_p8 vreinterpret_f32_u32
float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
#define vreinterpretq_f32_u32(t) *(__m128*)&(t)
@@ -8397,6 +16135,35 @@ float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
//*** Integer type conversions ******************
//no conversion necessary for the following functions because it is same data type
+int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
+#define vreinterpret_s64_u32
+
+int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
+#define vreinterpret_s64_u16
+
+int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
+#define vreinterpret_s64_u8
+
+int64x1_t vreinterpret_s64_s32 (int32x2_t t);
+#define vreinterpret_s64_s32
+
+int64x1_t vreinterpret_s64_s16 (int16x4_t t);
+#define vreinterpret_s64_s16
+
+int64x1_t vreinterpret_s64_s8 (int8x8_t t);
+#define vreinterpret_s64_s8
+
+int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
+#define vreinterpret_s64_u64
+
+int64x1_t vreinterpret_s64_f32 (float32x2_t t);
+#define vreinterpret_s64_f32
+
+int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
+#define vreinterpret_s64_p16
+
+int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
+#define vreinterpret_s64_p8
int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
#define vreinterpretq_s64_u32
@@ -8428,6 +16195,36 @@ int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
#define vreinterpretq_s64_p8
+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
+#define vreinterpret_u64_u32
+
+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
+#define vreinterpret_u64_u16
+
+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
+#define vreinterpret_u64_u8
+
+uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
+#define vreinterpret_u64_s32
+
+uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
+#define vreinterpret_u64_s16
+
+uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
+#define vreinterpret_u64_s8
+
+uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
+#define vreinterpret_u64_s64
+
+uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
+#define vreinterpret_u64_f32
+
+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
+#define vreinterpret_u64_p16
+
+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
+#define vreinterpret_u64_p8
+
uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
#define vreinterpretq_u64_u32
@@ -8458,6 +16255,36 @@ uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
#define vreinterpretq_u64_p8
+int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
+#define vreinterpret_s8_u32
+
+int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
+#define vreinterpret_s8_u16
+
+int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
+#define vreinterpret_s8_u8
+
+int8x8_t vreinterpret_s8_s32 (int32x2_t t);
+#define vreinterpret_s8_s32
+
+int8x8_t vreinterpret_s8_s16 (int16x4_t t);
+#define vreinterpret_s8_s16
+
+int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
+#define vreinterpret_s8_u64
+
+int8x8_t vreinterpret_s8_s64 (int64x1_t t);
+#define vreinterpret_s8_s64
+
+int8x8_t vreinterpret_s8_f32 (float32x2_t t);
+#define vreinterpret_s8_f32
+
+int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
+#define vreinterpret_s8_p16
+
+int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
+#define vreinterpret_s8_p8
+
int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
#define vreinterpretq_s8_u32
@@ -8488,6 +16315,37 @@ int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
#define vreinterpretq_s8_p8
+int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
+#define vreinterpret_s16_u32
+
+int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
+#define vreinterpret_s16_u16
+
+int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
+#define vreinterpret_s16_u8
+
+int16x4_t vreinterpret_s16_s32 (int32x2_t t);
+#define vreinterpret_s16_s32
+
+int16x4_t vreinterpret_s16_s8 (int8x8_t t);
+#define vreinterpret_s16_s8
+
+int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
+#define vreinterpret_s16_u64
+
+int16x4_t vreinterpret_s16_s64 (int64x1_t t);
+#define vreinterpret_s16_s64
+
+int16x4_t vreinterpret_s16_f32 (float32x2_t t);
+#define vreinterpret_s16_f32
+
+
+int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
+#define vreinterpret_s16_p16
+
+int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
+#define vreinterpret_s16_p8
+
int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
#define vreinterpretq_s16_u32
@@ -8518,6 +16376,36 @@ int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
#define vreinterpretq_s16_p8
+int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
+#define vreinterpret_s32_u32
+
+int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
+#define vreinterpret_s32_u16
+
+int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
+#define vreinterpret_s32_u8
+
+int32x2_t vreinterpret_s32_s16 (int16x4_t t);
+#define vreinterpret_s32_s16
+
+int32x2_t vreinterpret_s32_s8 (int8x8_t t);
+#define vreinterpret_s32_s8
+
+int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
+#define vreinterpret_s32_u64
+
+int32x2_t vreinterpret_s32_s64 (int64x1_t t);
+#define vreinterpret_s32_s64
+
+int32x2_t vreinterpret_s32_f32 (float32x2_t t);
+#define vreinterpret_s32_f32
+
+int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
+#define vreinterpret_s32_p16
+
+int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
+#define vreinterpret_s32_p8
+
int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
#define vreinterpretq_s32_u32
@@ -8540,7 +16428,7 @@ int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
#define vreinterpretq_s32_s64
int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
-#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
+#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
#define vreinterpretq_s32_p16
@@ -8548,6 +16436,36 @@ int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
#define vreinterpretq_s32_p8
+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
+#define vreinterpret_u8_u32
+
+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
+#define vreinterpret_u8_u16
+
+uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
+#define vreinterpret_u8_s32
+
+uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
+#define vreinterpret_u8_s16
+
+uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
+#define vreinterpret_u8_s8
+
+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
+#define vreinterpret_u8_u64
+
+uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
+#define vreinterpret_u8_s64
+
+uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
+#define vreinterpret_u8_f32
+
+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
+#define vreinterpret_u8_p16
+
+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
+#define vreinterpret_u8_p8
+
uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
#define vreinterpretq_u8_u32
@@ -8572,12 +16490,43 @@ uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
#define vreinterpretq_u8_f32(t) _M128i(t)
+
uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
#define vreinterpretq_u8_p16
uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
#define vreinterpretq_u8_p8
+uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
+#define vreinterpret_u16_u32
+
+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
+#define vreinterpret_u16_u8
+
+uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
+#define vreinterpret_u16_s32
+
+uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
+#define vreinterpret_u16_s16
+
+uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
+#define vreinterpret_u16_s8
+
+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
+#define vreinterpret_u16_u64
+
+uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
+#define vreinterpret_u16_s64
+
+uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
+#define vreinterpret_u16_f32
+
+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
+#define vreinterpret_u16_p16
+
+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
+#define vreinterpret_u16_p8
+
uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32
@@ -8608,6 +16557,36 @@ uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8
+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
+#define vreinterpret_u32_u16
+
+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
+#define vreinterpret_u32_u8
+
+uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
+#define vreinterpret_u32_s32
+
+uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
+#define vreinterpret_u32_s16
+
+uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
+#define vreinterpret_u32_s8
+
+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
+#define vreinterpret_u32_u64
+
+uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
+#define vreinterpret_u32_s64
+
+uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
+#define vreinterpret_u32_f32
+
+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
+#define vreinterpret_u32_p16
+
+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
+#define vreinterpret_u32_p8
+
uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16
diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h
index 21b9e9692d8..7f59c0b23b4 100644
--- a/gcc/config/i386/gnu-user.h
+++ b/gcc/config/i386/gnu-user.h
@@ -65,6 +65,10 @@ along with GCC; see the file COPYING3. If not see
When the -shared link option is used a final link is not being
done. */
+#undef ANDROID_TARGET_CC1_SPEC
+#define ANDROID_TARGET_CC1_SPEC \
+ " -mssse3 -fno-short-enums " \
+
#undef ASM_SPEC
#define ASM_SPEC \
"--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}} " \
diff --git a/gcc/config/i386/gnu-user64.h b/gcc/config/i386/gnu-user64.h
index 1c72b41e43e..39d13d1d1d7 100644
--- a/gcc/config/i386/gnu-user64.h
+++ b/gcc/config/i386/gnu-user64.h
@@ -46,6 +46,11 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define SPEC_X32 "mx32"
#endif
+#undef ANDROID_TARGET_CC1_SPEC
+#define ANDROID_TARGET_CC1_SPEC \
+ "%{m32:-mssse3 -fno-short-enums}" \
+ "%{!m32:-msse4.2 -mpopcnt}"
+
#undef ASM_SPEC
#define ASM_SPEC "%{" SPEC_32 ":--32} \
%{" SPEC_64 ":--64} \
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2c917e6cd44..54942d52080 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -82,6 +82,7 @@ along with GCC; see the file COPYING3. If not see
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
+#include "tree-vectorizer.h"
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
@@ -1739,7 +1740,7 @@ struct processor_costs slm_cost = {
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
+ 4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
@@ -1816,7 +1817,7 @@ struct processor_costs intel_cost = {
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
+ 4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
@@ -13007,7 +13008,13 @@ legitimate_pic_address_disp_p (rtx disp)
&& (SYMBOL_REF_LOCAL_P (op0)
|| (HAVE_LD_PIE_COPYRELOC
&& flag_pie
- && !SYMBOL_REF_WEAK (op0)
+ && !(SYMBOL_REF_WEAK (op0)
+ /* TODO:Temporary fix for weak defined symbols. Weak defined
+ symbols in an executable cannot be overridden even with
+ a non-weak symbol in a shared library.
+ Revert after fix is checked in here:
+ http://gcc.gnu.org/ml/gcc-patches/2015-02/msg00366.html*/
+ && SYMBOL_REF_EXTERNAL_P (op0))
&& !SYMBOL_REF_FUNCTION_P (op0)))
&& ix86_cmodel != CM_LARGE_PIC)
return true;
@@ -25206,13 +25213,19 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
}
else
{
- /* Static functions and indirect calls don't need the pic register. */
+ /* Static functions and indirect calls don't need the pic register. Also,
+ check if PLT was explicitly avoided via -fno-plt or the "noplt" attribute, making
+ it an indirect call. */
if (flag_pic
&& (!TARGET_64BIT
|| (ix86_cmodel == CM_LARGE_PIC
&& DEFAULT_ABI != MS_ABI))
&& GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
- && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
+ && !SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0))
+ && flag_plt
+ && (SYMBOL_REF_DECL ((XEXP (fnaddr, 0))) == NULL_TREE
+ || !lookup_attribute ("noplt",
+ DECL_ATTRIBUTES (SYMBOL_REF_DECL (XEXP (fnaddr, 0))))))
use_reg (&use, pic_offset_table_rtx);
}
@@ -25276,6 +25289,31 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
return call;
}
+/* Return true if the function being called was marked with attribute "noplt"
+ or using -fno-plt and we are compiling for non-PIC and x86_64. We need to
+ handle the non-PIC case in the backend because there is no easy interface
+ for the front-end to force non-PLT calls to use the GOT. This is currently
+ used only with 64-bit ELF targets to call the function marked "noplt"
+ indirectly. */
+
+static bool
+ix86_nopic_noplt_attribute_p (rtx call_op)
+{
+ if (flag_pic || ix86_cmodel == CM_LARGE
+ || !TARGET_64BIT || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
+ || SYMBOL_REF_LOCAL_P (call_op))
+ return false;
+
+ tree symbol_decl = SYMBOL_REF_DECL (call_op);
+
+ if (!flag_plt
+ || (symbol_decl != NULL_TREE
+ && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
+ return true;
+
+ return false;
+}
+
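/* Minimal usage sketch for the "noplt" handling added above (hypothetical
   example, not taken from the testsuite).  With the attribute support from
   this patch, a 64-bit non-PIC compile is expected to emit the call below as
   "call *extern_fn@GOTPCREL(%rip)" rather than a direct PLT call.  */
extern int extern_fn (int) __attribute__ ((noplt));

int
call_extern (int x)
{
  return extern_fn (x);
}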
/* Output the assembly for a call instruction. */
const char *
@@ -25287,7 +25325,9 @@ ix86_output_call_insn (rtx insn, rtx call_op)
if (SIBLING_CALL_P (insn))
{
- if (direct_p)
+ if (direct_p && ix86_nopic_noplt_attribute_p (call_op))
+ xasm = "jmp\t*%p0@GOTPCREL(%%rip)";
+ else if (direct_p)
xasm = "jmp\t%P0";
/* SEH epilogue detection requires the indirect branch case
to include REX.W. */
@@ -25339,7 +25379,9 @@ ix86_output_call_insn (rtx insn, rtx call_op)
seh_nop_p = true;
}
- if (direct_p)
+ if (direct_p && ix86_nopic_noplt_attribute_p (call_op))
+ xasm = "call\t*%p0@GOTPCREL(%%rip)";
+ else if (direct_p)
xasm = "call\t%P0";
else
xasm = "call\t%A0";
@@ -44334,6 +44376,127 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+ with two "and" and "pack" or two "shift" and "pack" insns. We should
+ have already failed all two-instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+ rtx op, dop0, dop1, t, rperm[16];
+ unsigned i, odd, c, s, nelt = d->nelt;
+ bool end_perm = false;
+ machine_mode half_mode;
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_pack) (rtx, rtx, rtx);
+ rtx (*gen_shift) (rtx, rtx, rtx);
+
+ if (d->one_operand_p)
+ return false;
+
+ switch (d->vmode)
+ {
+ case V8HImode:
+ /* Required for "pack". */
+ if (!TARGET_SSE4_1)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V4SImode;
+ gen_and = gen_andv4si3;
+ gen_pack = gen_sse4_1_packusdw;
+ gen_shift = gen_lshrv4si3;
+ break;
+ case V16QImode:
+ /* No check as all instructions are SSE2. */
+ c = 0xff;
+ s = 8;
+ half_mode = V8HImode;
+ gen_and = gen_andv8hi3;
+ gen_pack = gen_sse2_packuswb;
+ gen_shift = gen_lshrv8hi3;
+ break;
+ case V16HImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V8SImode;
+ gen_and = gen_andv8si3;
+ gen_pack = gen_avx2_packusdw;
+ gen_shift = gen_lshrv8si3;
+ end_perm = true;
+ break;
+ case V32QImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xff;
+ s = 8;
+ half_mode = V16HImode;
+ gen_and = gen_andv16hi3;
+ gen_pack = gen_avx2_packuswb;
+ gen_shift = gen_lshrv16hi3;
+ end_perm = true;
+ break;
+ default:
+ /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+ general shuffles. */
+ return false;
+ }
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ dop0 = gen_reg_rtx (half_mode);
+ dop1 = gen_reg_rtx (half_mode);
+ if (odd == 0)
+ {
+ for (i = 0; i < nelt / 2; i++)
+ rperm[i] = GEN_INT (c);
+ t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+ }
+ else
+ {
+ emit_insn (gen_shift (dop0,
+ gen_lowpart (half_mode, d->op0),
+ GEN_INT (s)));
+ emit_insn (gen_shift (dop1,
+ gen_lowpart (half_mode, d->op1),
+ GEN_INT (s)));
+ }
+ /* In the AVX2 256-bit case we need to permute the pack result. */
+ if (TARGET_AVX2 && end_perm)
+ {
+ op = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (V4DImode);
+ emit_insn (gen_pack (op, dop0, dop1));
+ emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+ emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+ }
+ else
+ emit_insn (gen_pack (d->target, dop0, dop1));
+
+ return true;
+}
+
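#include <emmintrin.h>

/* Intrinsics sketch of the "and + pack" and "shift + pack" idea used above,
   shown for the V16QImode case only (illustrative helpers, not the generated
   RTL).  Even elements of the concatenated operands come from masking each
   16-bit lane; odd elements come from shifting each lane right by 8.  */
static __m128i
even_bytes (__m128i a, __m128i b)
{
  const __m128i lo = _mm_set1_epi16 (0x00ff);
  /* Keep the even (low) byte of every 16-bit lane, then narrow with
     unsigned saturation: result is a0,a2,...,a14,b0,b2,...,b14.  */
  return _mm_packus_epi16 (_mm_and_si128 (a, lo), _mm_and_si128 (b, lo));
}

static __m128i
odd_bytes (__m128i a, __m128i b)
{
  /* Shift the odd (high) byte of every 16-bit lane down, then pack:
     result is a1,a3,...,a15,b1,b3,...,b15.  */
  return _mm_packus_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));
}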
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
@@ -44405,7 +44568,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
gcc_unreachable ();
case V8HImode:
- if (TARGET_SSSE3)
+ if (TARGET_SSE4_1)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
@@ -44428,32 +44593,11 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
break;
case V16QImode:
- if (TARGET_SSSE3)
- return expand_vec_perm_pshufb2 (d);
- else
- {
- if (d->testing_p)
- break;
- t1 = gen_reg_rtx (V16QImode);
- t2 = gen_reg_rtx (V16QImode);
- t3 = gen_reg_rtx (V16QImode);
- emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
- emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
- emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
- if (odd)
- t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
- else
- t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
- emit_insn (t3);
- }
- break;
+ return expand_vec_perm_even_odd_pack (d);
case V16HImode:
case V32QImode:
- return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+ return expand_vec_perm_even_odd_pack (d);
case V4DImode:
if (!TARGET_AVX2)
@@ -44823,6 +44967,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
/* Try sequences of three instructions. */
+ if (expand_vec_perm_even_odd_pack (d))
+ return true;
+
if (expand_vec_perm_2vperm2f128_vshuf (d))
return true;
@@ -46600,6 +46747,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
{
int res = 1;
+ /* Vector part. */
+ if (VECTOR_MODE_P (mode))
+ {
+ if (TARGET_VECTOR_PARALLEL_EXECUTION)
+ return 2;
+ else
+ return 1;
+ }
+
+ /* Scalar part. */
if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
res = 2;
else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
@@ -46699,7 +46856,6 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
{
unsigned *cost = (unsigned *) data;
unsigned retval = 0;
-
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
@@ -46710,6 +46866,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
count *= 50; /* FIXME. */
retval = (unsigned) (count * stmt_cost);
+
+ /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
+ for Silvermont, as it has an out-of-order integer pipeline and can execute
+ 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
+ if (TARGET_SILVERMONT || TARGET_INTEL)
+ if (stmt_info && stmt_info->stmt)
+ {
+ tree lhs_op = gimple_get_lhs (stmt_info->stmt);
+ if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
+ retval = (retval * 17) / 10;
+ }
+
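/* A worked instance of the scaling above (illustrative numbers only):
   an integer vector statement with count = 4 and stmt_cost = 1 gives
   retval = 4, and the adjustment yields (4 * 17) / 10 = 6, i.e. roughly
   the 1.7 factor with integer rounding.  */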
cost[where] += retval;
return retval;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index b3b7c8d3090..f6b169c24cd 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -425,6 +425,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
#define TARGET_USE_VECTOR_CONVERTS \
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
+#define TARGET_SLOW_PSHUFB \
+ ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
+#define TARGET_VECTOR_PARALLEL_EXECUTION \
+ ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index de56b9e1dce..2369e4b4098 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12159,18 +12159,52 @@
(set_attr "mode" "<MODE>")])
;; BMI2 instructions.
-(define_insn "bmi2_bzhi_<mode>3"
+(define_expand "bmi2_bzhi_<mode>3"
+ [(parallel
+ [(set (match_operand:SWI48 0 "register_operand")
+ (zero_extract:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand")
+ (umin:SWI48
+ (and:SWI48 (match_operand:SWI48 2 "register_operand")
+ (const_int 255))
+ (match_dup 3))
+ (const_int 0)))
+ (clobber (reg:CC FLAGS_REG))])]
+ "TARGET_BMI2"
+ "operands[3] = GEN_INT (<MODE_SIZE> * BITS_PER_UNIT);")
+
+(define_insn "*bmi2_bzhi_<mode>3"
[(set (match_operand:SWI48 0 "register_operand" "=r")
- (and:SWI48 (lshiftrt:SWI48 (const_int -1)
- (match_operand:SWI48 2 "register_operand" "r"))
- (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+ (zero_extract:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+ (umin:SWI48
+ (and:SWI48 (match_operand:SWI48 2 "register_operand" "r")
+ (const_int 255))
+ (match_operand:SWI48 3 "const_int_operand" "n"))
+ (const_int 0)))
(clobber (reg:CC FLAGS_REG))]
- "TARGET_BMI2"
+ "TARGET_BMI2 && INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
"bzhi\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "bitmanip")
(set_attr "prefix" "vex")
(set_attr "mode" "<MODE>")])
+(define_mode_attr k [(SI "k") (DI "q")])
+(define_insn "*bmi2_bzhi_<mode>3_1"
+ [(set (match_operand:SWI48 0 "register_operand" "=r")
+ (zero_extract:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+ (umin:SWI48
+ (zero_extend:SWI48 (match_operand:QI 2 "register_operand" "r"))
+ (match_operand:SWI48 3 "const_int_operand" "n"))
+ (const_int 0)))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_BMI2 && INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
+ "bzhi\t{%<k>2, %1, %0|%0, %1, %<k>2}"
+ [(set_attr "type" "bitmanip")
+ (set_attr "prefix" "vex")
+ (set_attr "mode" "<MODE>")])
+
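#include <stdint.h>

/* Rough C model of the bzhi semantics the patterns above describe (a sketch,
   not part of GCC): keep the low (count & 255) bits of SRC and clear the
   rest; a count at or above the operand width leaves SRC unchanged.  */
static uint64_t
bzhi64_model (uint64_t src, uint64_t count)
{
  unsigned int n = count & 0xff;
  if (n >= 64)
    return src;
  if (n == 0)
    return 0;
  return src & (UINT64_MAX >> (64 - n));
}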
(define_insn "bmi2_pdep_<mode>3"
[(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")
diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
index 574f096e6ca..d980fb719a3 100644
--- a/gcc/config/i386/linux-common.h
+++ b/gcc/config/i386/linux-common.h
@@ -27,11 +27,6 @@ along with GCC; see the file COPYING3. If not see
} \
while (0)
-#undef ANDROID_TARGET_CC1_SPEC
-#define ANDROID_TARGET_CC1_SPEC \
- "%{m32:-mstackrealign -mssse3 -fno-short-enums}" \
- "%{!m32:-msse4.2 -mpopcnt}"
-
#undef CC1_SPEC
#define CC1_SPEC \
LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 2ef1384246e..8266f3eaf76 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1417,6 +1417,22 @@
return true;
})
+;; Return true if OP is a parallel for a palignr permute.
+(define_predicate "palignr_operand"
+ (and (match_code "parallel")
+ (match_code "const_int" "a"))
+{
+ int elt = INTVAL (XVECEXP (op, 0, 0));
+ int i, nelt = XVECLEN (op, 0);
+
+ /* Check that the element order of the permutation is suitable for palignr.
+ For example, {5 6 7 0 1 2 3 4} is "palignr 5, xmm, xmm". */
+ for (i = 1; i < nelt; ++i)
+ if (INTVAL (XVECEXP (op, 0, i)) != ((elt + i) % nelt))
+ return false;
+ return true;
+})
+
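/* Plain C sketch of the check above (hypothetical helper, illustration
   only): the parallel is acceptable when it describes a rotation, i.e.
   perm[i] == (perm[0] + i) % nelt for every element, which maps onto a
   single palignr with shift count perm[0].  */
static int
is_rotation (const int *perm, int nelt)
{
  int elt = perm[0];
  int i;
  for (i = 1; i < nelt; i++)
    if (perm[i] != (elt + i) % nelt)
      return 0;
  return 1;
}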
;; Return true if OP is a proper third operand to vpblendw256.
(define_predicate "avx2_pblendw_operand"
(match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8d061da958a..4aced2da9b5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14576,6 +14576,35 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*ssse3_palignr<mode>_perm"
+ [(set (match_operand:V_128 0 "register_operand" "=x,x")
+ (vec_select:V_128
+ (match_operand:V_128 1 "register_operand" "0,x")
+ (match_parallel 2 "palignr_operand"
+ [(match_operand 3 "const_int_operand" "n, n")])))]
+ "TARGET_SSSE3"
+{
+ enum machine_mode imode = GET_MODE_INNER (GET_MODE (operands[0]));
+ operands[2] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (imode));
+
+ switch (which_alternative)
+ {
+ case 0:
+ return "palignr\t{%2, %1, %0|%0, %1, %2}";
+ case 1:
+ return "vpalignr\t{%2, %1, %1, %0|%0, %1, %1, %2}";
+ default:
+ gcc_unreachable ();
+ }
+}
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sseishft")
+ (set_attr "atom_unit" "sishuf")
+ (set_attr "prefix_data16" "1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,vex")])
+
(define_expand "avx_vinsertf128<mode>"
[(match_operand:V_256 0 "register_operand")
(match_operand:V_256 1 "register_operand")
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index c5c8c39d303..215c71f4df4 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -58,8 +58,8 @@ DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
that can be partly masked by careful scheduling of moves. */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
- | m_INTEL | m_AMDFAM10 | m_BDVER | m_GENERIC)
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
+ | m_BDVER | m_GENERIC)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
are resolved on SSE register parts instead of whole registers, so we may
@@ -386,6 +386,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
from integer to FP. */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
+/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with a slow pshufb instruction. */
+DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
+ m_BONNELL | m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with the ability to
+ execute 2 or more vector instructions in parallel. */
+DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
+ m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
+
/*****************************************************************************/
/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */
/*****************************************************************************/
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 41adc4adc96..4ec3e3abe6e 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -602,11 +602,6 @@ static const struct attribute_spec ia64_attribute_table[] =
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ia64_vector_mode_supported_p
-/* ia64 architecture manual 4.4.7: ... reads, writes, and flushes may occur
- in an order different from the specified program order. */
-#undef TARGET_RELAXED_ORDERING
-#define TARGET_RELAXED_ORDERING true
-
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ia64_legitimate_constant_p
#undef TARGET_LEGITIMATE_ADDRESS_P
diff --git a/gcc/config/mips/android.h b/gcc/config/mips/android.h
new file mode 100644
index 00000000000..32c539c8d39
--- /dev/null
+++ b/gcc/config/mips/android.h
@@ -0,0 +1,49 @@
+/* Target macros for mips*-*android* targets.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#undef DRIVER_SELF_SPECS
+#define DRIVER_SELF_SPECS \
+ /* Make sure a -mips option is present. This helps us to pick \
+ the right multilib, and also makes the later specs easier \
+ to write. */ \
+ MIPS_ISA_LEVEL_SPEC, \
+ \
+ /* Infer the default float setting from -march. */ \
+ MIPS_ARCH_FLOAT_SPEC, \
+ \
+ /* Infer the -msynci setting from -march if not explicitly set. */ \
+ MIPS_ISA_SYNCI_SPEC, \
+ \
+ /* If no ABI option is specified, infer one from the ISA level \
+ or -mgp setting. */ \
+ "%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=64}}", \
+ \
+ /* If no FP ABI option is specified, infer one from the \
+ ABI/ISA level unless there is a conflicting option. */ \
+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
+ \
+ /* If no odd-spreg option is specified, infer one from the ISA. */ \
+ "%{!modd-spreg: %{mabi=32: %{mips32r6: -mno-odd-spreg}}}", \
+ \
+ /* Base SPECs. */ \
+ BASE_DRIVER_SELF_SPECS, \
+ \
+ /* Use the standard linux specs for everything else. */ \
+ LINUX_DRIVER_SELF_SPECS
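+
+/* For illustration, assuming a 32-bit default architecture: a driver
+   invocation with no explicit -mips/-mabi options ends up roughly equivalent
+   to passing the configured ISA level plus -mabi=32, while a 64-bit ISA
+   selection such as -mips64r2 falls through to -mabi=64 via the
+   MIPS_32BIT_OPTION_SPEC test above.  */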
diff --git a/gcc/config/mips/constraints.md b/gcc/config/mips/constraints.md
index 49e48954f51..7f5a63b4a00 100644
--- a/gcc/config/mips/constraints.md
+++ b/gcc/config/mips/constraints.md
@@ -19,7 +19,7 @@
;; Register constraints
-(define_register_constraint "d" "BASE_REG_CLASS"
+(define_register_constraint "d" "TARGET_MIPS16 ? M16_REGS : GR_REGS"
"An address register. This is equivalent to @code{r} unless
generating MIPS16 code.")
@@ -92,6 +92,9 @@
;; but the DSP version allows any accumulator target.
(define_register_constraint "ka" "ISA_HAS_DSP_MULT ? ACC_REGS : MD_REGS")
+(define_register_constraint "kb" "M16_STORE_REGS"
+ "@internal")
+
(define_constraint "kf"
"@internal"
(match_operand 0 "force_to_mem_operand"))
@@ -305,6 +308,61 @@
"@internal"
(match_operand 0 "low_bitmask_operand"))
+(define_constraint "YI"
+ "@internal
+ A replicated vector const in which the replicated value is a 10-bit
+ signed value."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, -1024, 1023)")))
+
+(define_constraint "YC"
+ "@internal
+ A replicated vector const in which the replicated value has a single
+ bit set."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_bitimm_set_p (op, mode)")))
+
+(define_constraint "YZ"
+ "@internal
+ A replicated vector const in which the replicated value has a single
+ bit clear."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_bitimm_clr_p (op, mode)")))
+
+(define_constraint "Unv5"
+ "@internal
+ A replicated vector const in which the replicated value is a
+ non-positive integer in the range [-31,0]."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, -31, 0)")))
+
+(define_constraint "Uuv5"
+ "@internal
+ A replicated vector const in which the replicated value is an
+ unsigned integer in the range [0,31]."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, 0, 31)")))
+
+(define_constraint "Uuv6"
+ "@internal
+ A replicated vector const in which the replicated value is an unsigned
+ 6-bit integer."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, 0, 63)")))
+
+(define_constraint "Uuv8"
+ "@internal
+ A replicated vector const in which the replicated value is an unsigned
+ 8-bit integer."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, 0, 255)")))
+
+(define_constraint "Ubv8"
+ "@internal
+ A replicated vector const in which the replicated value is an 8-bit byte."
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_byte_p (op, mode)")))
+
(define_memory_constraint "ZC"
"When compiling microMIPS code, this constraint matches a memory operand
whose address is formed from a base register and a 12-bit offset. These
@@ -315,16 +373,18 @@
(if_then_else
(match_test "TARGET_MICROMIPS")
(match_test "umips_12bit_offset_address_p (XEXP (op, 0), mode)")
- (match_test "mips_address_insns (XEXP (op, 0), mode, false)"))))
+ (if_then_else (match_test "ISA_HAS_PREF_LL_9BIT")
+ (match_test "mips_9bit_offset_address_p (XEXP (op, 0), mode)")
+ (match_test "mips_address_insns (XEXP (op, 0), mode, false)")))))
(define_address_constraint "ZD"
- "When compiling microMIPS code, this constraint matches an address operand
- that is formed from a base register and a 12-bit offset. These operands
- can be used for microMIPS instructions such as @code{prefetch}. When
- not compiling for microMIPS code, @code{ZD} is equivalent to @code{p}."
+ "An address suitable for a @code{prefetch} instruction, or for any other
+ instruction with the same addressing mode as @code{prefetch}."
(if_then_else (match_test "TARGET_MICROMIPS")
(match_test "umips_12bit_offset_address_p (op, mode)")
- (match_test "mips_address_insns (op, mode, false)")))
+ (if_then_else (match_test "ISA_HAS_PREF_LL_9BIT")
+ (match_test "mips_9bit_offset_address_p (op, mode)")
+ (match_test "mips_address_insns (op, mode, false)"))))
(define_memory_constraint "ZR"
"@internal
diff --git a/gcc/config/mips/gnu-user.h b/gcc/config/mips/gnu-user.h
index 02c6a3f0b26..b0033e231b5 100644
--- a/gcc/config/mips/gnu-user.h
+++ b/gcc/config/mips/gnu-user.h
@@ -53,20 +53,23 @@ along with GCC; see the file COPYING3. If not see
#undef MIPS_DEFAULT_GVALUE
#define MIPS_DEFAULT_GVALUE 0
-/* Borrowed from sparc/linux.h */
#undef GNU_USER_TARGET_LINK_SPEC
-#define GNU_USER_TARGET_LINK_SPEC \
- "%(endian_spec) \
- %{shared:-shared} \
- %{!EB:%{!meb:-m elf32ltsmip}} %{EB|meb:-m elf32btsmip} \
- %{!shared: \
- %{!static: \
- %{rdynamic:-export-dynamic} \
- -dynamic-linker " GNU_USER_DYNAMIC_LINKER "} \
- %{static:-static}}"
+#define GNU_USER_TARGET_LINK_SPEC "\
+ %{G*} %{EB} %{EL} %{mips*} %{shared} \
+ %{!shared: \
+ %{!static: \
+ %{rdynamic:-export-dynamic} \
+ %{mabi=n32: -dynamic-linker " GNU_USER_DYNAMIC_LINKERN32 "} \
+ %{mabi=64: -dynamic-linker " GNU_USER_DYNAMIC_LINKER64 "} \
+ %{mabi=32: -dynamic-linker " GNU_USER_DYNAMIC_LINKER32 "}} \
+ %{static}} \
+ %{mabi=n32:-m" GNU_USER_LINK_EMULATIONN32 "} \
+ %{mabi=64:-m" GNU_USER_LINK_EMULATION64 "} \
+ %{mabi=32:-m" GNU_USER_LINK_EMULATION32 "}"
+
#undef LINK_SPEC
#define LINK_SPEC GNU_USER_TARGET_LINK_SPEC
-
+
#undef SUBTARGET_ASM_SPEC
#define SUBTARGET_ASM_SPEC \
"%{!mno-abicalls:%{mplt:-call_nonpic;:-KPIC}} " \
@@ -125,10 +128,13 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
specs handling by removing a redundant option. */ \
"%{!mno-shared:%<mplt}", \
/* -mplt likewise has no effect for -mabi=64 without -msym32. */ \
- "%{mabi=64:%{!msym32:%<mplt}}"
+ "%{mabi=64:%{!msym32:%<mplt}}", \
+ "%{!EB:%{!EL:%(endian_spec)}}", \
+ "%{!mabi=*: -" MULTILIB_ABI_DEFAULT "}"
#undef DRIVER_SELF_SPECS
#define DRIVER_SELF_SPECS \
+ MIPS_ISA_LEVEL_SPEC, \
BASE_DRIVER_SELF_SPECS, \
LINUX_DRIVER_SELF_SPECS
@@ -140,3 +146,6 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
#define ENDFILE_SPEC \
GNU_USER_TARGET_MATHFILE_SPEC " " \
GNU_USER_TARGET_ENDFILE_SPEC
+
+#undef LOCAL_LABEL_PREFIX
+#define LOCAL_LABEL_PREFIX (TARGET_OLDABI ? "$" : ".")
diff --git a/gcc/config/mips/gnu-user64.h b/gcc/config/mips/gnu-user64.h
deleted file mode 100644
index b97b4a76848..00000000000
--- a/gcc/config/mips/gnu-user64.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Definitions for MIPS systems using GNU userspace and n32/64 abi.
- Copyright (C) 2002-2014 Free Software Foundation, Inc.
-
-This file is part of GCC.
-
-GCC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
-
-GCC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING3. If not see
-<http://www.gnu.org/licenses/>. */
-
-/* Force the default endianness and ABI flags onto the command line
- in order to make the other specs easier to write. */
-
-#define LINUX64_DRIVER_SELF_SPECS \
- LINUX_DRIVER_SELF_SPECS \
- " %{!EB:%{!EL:%(endian_spec)}}" \
- " %{!mabi=*: -" MULTILIB_ABI_DEFAULT "}"
-
-#undef DRIVER_SELF_SPECS
-#define DRIVER_SELF_SPECS \
- BASE_DRIVER_SELF_SPECS, \
- LINUX64_DRIVER_SELF_SPECS
-
-#undef GNU_USER_TARGET_LINK_SPEC
-#define GNU_USER_TARGET_LINK_SPEC "\
-%{G*} %{EB} %{EL} %{mips1} %{mips2} %{mips3} %{mips4} \
-%{shared} \
- %(endian_spec) \
- %{!shared: \
- %{!static: \
- %{rdynamic:-export-dynamic} \
- %{mabi=n32: -dynamic-linker " GNU_USER_DYNAMIC_LINKERN32 "} \
- %{mabi=64: -dynamic-linker " GNU_USER_DYNAMIC_LINKER64 "} \
- %{mabi=32: -dynamic-linker " GNU_USER_DYNAMIC_LINKER32 "}} \
- %{static:-static}} \
-%{mabi=n32:-m" GNU_USER_LINK_EMULATIONN32 "} \
-%{mabi=64:-m" GNU_USER_LINK_EMULATION64 "} \
-%{mabi=32:-m" GNU_USER_LINK_EMULATION32 "}"
-#undef LINK_SPEC
-#define LINK_SPEC GNU_USER_TARGET_LINK_SPEC
-
-#undef LOCAL_LABEL_PREFIX
-#define LOCAL_LABEL_PREFIX (TARGET_OLDABI ? "$" : ".")
diff --git a/gcc/config/mips/i6400.md b/gcc/config/mips/i6400.md
new file mode 100644
index 00000000000..1950f3a7c20
--- /dev/null
+++ b/gcc/config/mips/i6400.md
@@ -0,0 +1,335 @@
+;; DFA-based pipeline description for i6400.
+
+;; Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "i6400_int_pipe, i6400_mdu_pipe, i6400_fpu_short_pipe,
+ i6400_fpu_long_pipe")
+
+(define_cpu_unit "i6400_gpmuldiv" "i6400_mdu_pipe")
+
+(define_cpu_unit "i6400_agen, i6400_alu1, i6400_lsu" "i6400_int_pipe")
+
+(define_cpu_unit "i6400_control, i6400_ctu, i6400_alu0" "i6400_int_pipe")
+
+;; The floating-point unit queue (FPQ) has short and long pipes.
+
+;; Short FPU pipeline.
+(define_cpu_unit "i6400_fpu_short, i6400_fpu_intadd, i6400_fpu_logic,
+ i6400_fpu_div, i6400_fpu_cmp, i6400_fpu_float, i6400_fpu_store"
+ "i6400_fpu_short_pipe")
+
+;; Long FPU pipeline.
+(define_cpu_unit "i6400_fpu_long, i6400_fpu_logic_l, i6400_fpu_float_l,
+ i6400_fpu_mult, i6400_fpu_apu" "i6400_fpu_long_pipe")
+
+(define_reservation "i6400_control_ctu" "i6400_control, i6400_ctu")
+(define_reservation "i6400_control_alu0" "i6400_control, i6400_alu0")
+(define_reservation "i6400_agen_lsu" "i6400_agen, i6400_lsu")
+(define_reservation "i6400_agen_alu1" "i6400_agen, i6400_alu1")
+
+;;
+;; FPU-MSA pipe
+;;
+
+;; Short pipe
+
+;; addv, subv
+(define_insn_reservation "i6400_msa_add_d" 1
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "!d")
+ (and (eq_attr "alu_type" "add, sub")
+ (eq_attr "msa_execunit" "msa_eu_int_add"))))
+ "i6400_fpu_short, i6400_fpu_intadd")
+
+;; add, hadd, sub, hsub, average, min, max, compare
+(define_insn_reservation "i6400_msa_int_add" 2
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_int_add"))
+ "i6400_fpu_short, i6400_fpu_intadd")
+
+;; sat, pcnt
+(define_insn_reservation "i6400_msa_short_logic3" 3
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_logic3"))
+ "i6400_fpu_short, i6400_fpu_logic")
+
+;; shf.d
+(define_insn_reservation "i6400_msa_shf_d" 1
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "d")
+ (eq_attr "msa_execunit" "msa_eu_logic2")))
+ "i6400_fpu_short, i6400_fpu_logic")
+
+;; shifts, nloc, nlzc, bneg, bclr, shf
+(define_insn_reservation "i6400_msa_short_logic2" 2
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_logic2"))
+ "i6400_fpu_short, i6400_fpu_logic")
+
+;; and, or, xor, ilv, pck, move.v, fill, splat
+(define_insn_reservation "i6400_msa_short_logic" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_logic"))
+ "i6400_fpu_short, i6400_fpu_logic")
+
+;; move.v, ldi
+(define_insn_reservation "i6400_msa_move" 1
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "type" "fmove")
+ (eq_attr "mode" "TI")))
+ "i6400_fpu_short, i6400_fpu_logic")
+
+;; Float compare (new-style CMP.cond.fmt)
+(define_insn_reservation "i6400_msa_cmp" 2
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_cmp"))
+ "i6400_fpu_short, i6400_fpu_cmp")
+
+;; Float min, max, class
+(define_insn_reservation "i6400_msa_short_float2" 2
+ (and (eq_attr "cpu" "i6400")
+ (ior (and (eq_attr "msa_execunit" "msa_eu_float2")
+ (eq_attr "type" "!fmul"))
+ (and (eq_attr "msa_execunit" "msa_eu_float2_l")
+ (eq_attr "type" "fcmp"))))
+ "i6400_fpu_short, i6400_fpu_float")
+
+;; div.d, mod.d (non-pipelined)
+(define_insn_reservation "i6400_msa_div_d" 36
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "d")
+ (eq_attr "msa_execunit" "msa_eu_div")))
+ "i6400_fpu_short+i6400_fpu_div*36")
+
+;; div.w, mod.w (non-pipelined)
+(define_insn_reservation "i6400_msa_div_w" 20
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "w")
+ (eq_attr "msa_execunit" "msa_eu_div")))
+ "i6400_fpu_short+i6400_fpu_div*20")
+
+;; div.h, mod.h (non-pipelined)
+(define_insn_reservation "i6400_msa_div_h" 12
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "h")
+ (eq_attr "msa_execunit" "msa_eu_div")))
+ "i6400_fpu_short+i6400_fpu_div*12")
+
+;; div.b, mod.b (non-pipelined)
+(define_insn_reservation "i6400_msa_div_b" 8
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "datafmt" "b")
+ (eq_attr "msa_execunit" "msa_eu_div")))
+ "i6400_fpu_short+i6400_fpu_div*8")
+
+;; Vector copy
+(define_insn_reservation "i6400_msa_copy" 1
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "msa_execunit" "msa_eu_store4")
+ (eq_attr "type" "mfc, mtc")))
+ "i6400_fpu_short, i6400_fpu_store")
+
+;; Vector bz, bnz
+(define_insn_reservation "i6400_msa_branch" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_store4"))
+ "i6400_control_ctu")
+
+;; Vector store, sdc1, swc1
+(define_insn_reservation "i6400_fpu_msa_store" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fpstore"))
+ "i6400_agen_lsu")
+
+;; Vector load, ldc1, lwc1
+(define_insn_reservation "i6400_fpu_msa_load" 3
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fpload"))
+ "i6400_agen_lsu")
+
+;; mfc, mtc
+(define_insn_reservation "i6400_fpu_move" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "move_type" "mfc, mtc"))
+ "i6400_control_alu0 | i6400_agen_alu1")
+
+;; Long pipe
+
+;; bmz, bmnz, bsel, insert, insve
+(define_insn_reservation "i6400_msa_long_logic1" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_logic_l"))
+ "i6400_fpu_long, i6400_fpu_logic_l")
+
+;; binsl, binsr, vshf, sld
+(define_insn_reservation "i6400_msa_long_logic2" 2
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_logic_l2"))
+ "i6400_fpu_long, i6400_fpu_logic_l")
+
+;; Vector mul, dotp, madd, msub
+(define_insn_reservation "i6400_msa_mult" 5
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_mult"))
+ "i6400_fpu_long, i6400_fpu_mult")
+
+;; Float flog2
+(define_insn_reservation "i6400_msa_long_float2" 2
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "msa_execunit" "msa_eu_float2_l")
+ (eq_attr "type" "fmul")))
+ "i6400_fpu_long, i6400_fpu_float_l")
+
+;; fadd, fsub
+(define_insn_reservation "i6400_msa_long_float4" 4
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_float4"))
+ "i6400_fpu_long, i6400_fpu_float_l")
+
+;; fmul, fexp2
+(define_insn_reservation "i6400_msa_long_float5" 5
+ (and (eq_attr "cpu" "i6400")
+ (ior (eq_attr "msa_execunit" "msa_eu_float5")
+ (and (eq_attr "msa_execunit" "msa_eu_float2")
+ (eq_attr "type" "fmul"))))
+ "i6400_fpu_long, i6400_fpu_float_l")
+
+;; fmadd, fmsub
+(define_insn_reservation "i6400_msa_long_float8" 8
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_float8"))
+ "i6400_fpu_long, i6400_fpu_float_l")
+
+;; fdiv.d
+(define_insn_reservation "i6400_msa_fdiv_df" 30
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "msa_execunit" "msa_eu_fdiv")))
+ "i6400_fpu_long+i6400_fpu_float_l*30")
+
+;; fdiv.w
+(define_insn_reservation "i6400_msa_fdiv_sf" 22
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "msa_execunit" "msa_eu_fdiv"))
+ "i6400_fpu_long+i6400_fpu_float_l*22")
+
+;;
+;; FPU pipe
+;;
+
+;; fabs, fneg
+(define_insn_reservation "i6400_fpu_fabs" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fabs,fneg,fmove"))
+ "i6400_fpu_short, i6400_fpu_apu")
+
+;; fadd, fsub, fcvt
+(define_insn_reservation "i6400_fpu_fadd" 4
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fadd, fcvt"))
+ "i6400_fpu_long, i6400_fpu_apu")
+
+;; fmul
+(define_insn_reservation "i6400_fpu_fmul" 5
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fmul"))
+ "i6400_fpu_long, i6400_fpu_apu")
+
+;; div, sqrt (Double Precision)
+(define_insn_reservation "i6400_fpu_div_df" 30
+ (and (eq_attr "cpu" "i6400")
+ (and (eq_attr "mode" "DF")
+ (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt")))
+ "i6400_fpu_long+i6400_fpu_apu*30")
+
+;; div, sqrt (Single Precision)
+(define_insn_reservation "i6400_fpu_div_sf" 22
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt"))
+ "i6400_fpu_long+i6400_fpu_apu*22")
+
+;;
+;; Integer pipe
+;;
+
+;; and, lui, shifts, seb, seh
+(define_insn_reservation "i6400_int_logical" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "move_type" "logical,const,andi,sll0,signext"))
+ "i6400_control_alu0 | i6400_agen_alu1")
+
+;; addi, addiu, ori, xori, add, addu, sub, nor
+(define_insn_reservation "i6400_int_add" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "alu_type" "add,sub,or,xor,nor"))
+ "i6400_control_alu0 | i6400_agen_alu1")
+
+;; shifts, clo, clz, cond move, arith
+(define_insn_reservation "i6400_int_arith" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "shift,slt,move,clz,condmove,arith"))
+ "i6400_control_alu0 | i6400_agen_alu1")
+
+;; nop
+(define_insn_reservation "i6400_int_nop" 0
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "nop"))
+ "nothing")
+
+;; mult, multu, mul
+(define_insn_reservation "i6400_int_mult" 4
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "imul3,imul"))
+ "i6400_gpmuldiv")
+
+;; divide
+(define_insn_reservation "i6400_int_div" 32
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "idiv"))
+ "i6400_gpmuldiv*32")
+
+;; Load lb, lbu, lh, lhu, lq, lw, lw_i2f, lwxs
+(define_insn_reservation "i6400_int_load" 3
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "move_type" "load"))
+ "i6400_agen_lsu")
+
+;; store
+(define_insn_reservation "i6400_int_store" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "move_type" "store"))
+ "i6400_agen_lsu")
+
+;; prefetch
+(define_insn_reservation "i6400_int_prefetch" 3
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "prefetch"))
+ "i6400_agen_lsu")
+
+;; branch and jump
+(define_insn_reservation "i6400_int_branch" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "type" "branch,jump"))
+ "i6400_control_ctu")
+
+;; call
+(define_insn_reservation "i6400_int_call" 1
+ (and (eq_attr "cpu" "i6400")
+ (eq_attr "jal" "indirect,direct"))
+ "i6400_control_ctu")
diff --git a/gcc/config/mips/linux.h b/gcc/config/mips/linux.h
index e539422d48d..a117f90fb03 100644
--- a/gcc/config/mips/linux.h
+++ b/gcc/config/mips/linux.h
@@ -17,9 +17,27 @@ You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
-#define GLIBC_DYNAMIC_LINKER \
+#define GNU_USER_LINK_EMULATION32 "elf32%{EB:b}%{EL:l}tsmip"
+#define GNU_USER_LINK_EMULATION64 "elf64%{EB:b}%{EL:l}tsmip"
+#define GNU_USER_LINK_EMULATIONN32 "elf32%{EB:b}%{EL:l}tsmipn32"
+
+#define GLIBC_DYNAMIC_LINKER32 \
"%{mnan=2008:/lib/ld-linux-mipsn8.so.1;:/lib/ld.so.1}"
+#define GLIBC_DYNAMIC_LINKER64 \
+ "%{mnan=2008:/lib64/ld-linux-mipsn8.so.1;:/lib64/ld.so.1}"
+#define GLIBC_DYNAMIC_LINKERN32 \
+ "%{mnan=2008:/lib32/ld-linux-mipsn8.so.1;:/lib32/ld.so.1}"
-#undef UCLIBC_DYNAMIC_LINKER
-#define UCLIBC_DYNAMIC_LINKER \
+#undef UCLIBC_DYNAMIC_LINKER32
+#define UCLIBC_DYNAMIC_LINKER32 \
"%{mnan=2008:/lib/ld-uClibc-mipsn8.so.0;:/lib/ld-uClibc.so.0}"
+#undef UCLIBC_DYNAMIC_LINKER64
+#define UCLIBC_DYNAMIC_LINKER64 \
+ "%{mnan=2008:/lib/ld64-uClibc-mipsn8.so.0;:/lib/ld64-uClibc.so.0}"
+#define UCLIBC_DYNAMIC_LINKERN32 \
+ "%{mnan=2008:/lib32/ld-uClibc-mipsn8.so.0;:/lib32/ld-uClibc.so.0}"
+
+#define BIONIC_DYNAMIC_LINKERN32 "/system/bin/linker32"
+#define GNU_USER_DYNAMIC_LINKERN32 \
+ CHOOSE_DYNAMIC_LINKER (GLIBC_DYNAMIC_LINKERN32, UCLIBC_DYNAMIC_LINKERN32, \
+ BIONIC_DYNAMIC_LINKERN32)
diff --git a/gcc/config/mips/linux64.h b/gcc/config/mips/linux64.h
deleted file mode 100644
index 7ad3b2af2b9..00000000000
--- a/gcc/config/mips/linux64.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Definitions for MIPS running Linux-based GNU systems with ELF format
- using n32/64 abi.
- Copyright (C) 2002-2014 Free Software Foundation, Inc.
-
-This file is part of GCC.
-
-GCC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 3, or (at your option)
-any later version.
-
-GCC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING3. If not see
-<http://www.gnu.org/licenses/>. */
-
-#define GNU_USER_LINK_EMULATION32 "elf32%{EB:b}%{EL:l}tsmip"
-#define GNU_USER_LINK_EMULATION64 "elf64%{EB:b}%{EL:l}tsmip"
-#define GNU_USER_LINK_EMULATIONN32 "elf32%{EB:b}%{EL:l}tsmipn32"
-
-#define GLIBC_DYNAMIC_LINKER32 \
- "%{mnan=2008:/lib/ld-linux-mipsn8.so.1;:/lib/ld.so.1}"
-#define GLIBC_DYNAMIC_LINKER64 \
- "%{mnan=2008:/lib64/ld-linux-mipsn8.so.1;:/lib64/ld.so.1}"
-#define GLIBC_DYNAMIC_LINKERN32 \
- "%{mnan=2008:/lib32/ld-linux-mipsn8.so.1;:/lib32/ld.so.1}"
-
-#undef UCLIBC_DYNAMIC_LINKER32
-#define UCLIBC_DYNAMIC_LINKER32 \
- "%{mnan=2008:/lib/ld-uClibc-mipsn8.so.0;:/lib/ld-uClibc.so.0}"
-#undef UCLIBC_DYNAMIC_LINKER64
-#define UCLIBC_DYNAMIC_LINKER64 \
- "%{mnan=2008:/lib/ld64-uClibc-mipsn8.so.0;:/lib/ld64-uClibc.so.0}"
-#define UCLIBC_DYNAMIC_LINKERN32 \
- "%{mnan=2008:/lib32/ld-uClibc-mipsn8.so.0;:/lib32/ld-uClibc.so.0}"
-
-#define BIONIC_DYNAMIC_LINKERN32 "/system/bin/linker32"
-#define GNU_USER_DYNAMIC_LINKERN32 \
- CHOOSE_DYNAMIC_LINKER (GLIBC_DYNAMIC_LINKERN32, UCLIBC_DYNAMIC_LINKERN32, \
- BIONIC_DYNAMIC_LINKERN32)
diff --git a/gcc/config/mips/loongson.md b/gcc/config/mips/loongson.md
index 474033d1e2c..08691313c69 100644
--- a/gcc/config/mips/loongson.md
+++ b/gcc/config/mips/loongson.md
@@ -907,33 +907,3 @@
mips_expand_vec_reduc (operands[0], operands[1], gen_umin<mode>3);
DONE;
})
-
-;; Integer division and modulus. For integer multiplication, see mips.md.
-
-(define_insn "<u>div<mode>3"
- [(set (match_operand:GPR 0 "register_operand" "=&d")
- (any_div:GPR (match_operand:GPR 1 "register_operand" "d")
- (match_operand:GPR 2 "register_operand" "d")))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A"
- {
- if (TARGET_LOONGSON_2EF)
- return mips_output_division ("<d>div<u>.g\t%0,%1,%2", operands);
- else
- return mips_output_division ("gs<d>div<u>\t%0,%1,%2", operands);
- }
- [(set_attr "type" "idiv3")
- (set_attr "mode" "<MODE>")])
-
-(define_insn "<u>mod<mode>3"
- [(set (match_operand:GPR 0 "register_operand" "=&d")
- (any_mod:GPR (match_operand:GPR 1 "register_operand" "d")
- (match_operand:GPR 2 "register_operand" "d")))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A"
- {
- if (TARGET_LOONGSON_2EF)
- return mips_output_division ("<d>mod<u>.g\t%0,%1,%2", operands);
- else
- return mips_output_division ("gs<d>mod<u>\t%0,%1,%2", operands);
- }
- [(set_attr "type" "idiv3")
- (set_attr "mode" "<MODE>")])
diff --git a/gcc/config/mips/mips-cpus.def b/gcc/config/mips/mips-cpus.def
index 07fbf9c7ef4..94e68873e0b 100644
--- a/gcc/config/mips/mips-cpus.def
+++ b/gcc/config/mips/mips-cpus.def
@@ -44,9 +44,19 @@ MIPS_CPU ("mips4", PROCESSOR_R8000, 4, 0)
isn't tuned to a specific processor. */
MIPS_CPU ("mips32", PROCESSOR_4KC, 32, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("mips32r2", PROCESSOR_74KF2_1, 33, PTF_AVOID_BRANCHLIKELY)
+/* mips32r3 is microMIPS, hence it uses the M4K processor entry.
+   mips32r5 uses the P5600 processor entry.  */
+MIPS_CPU ("mips32r3", PROCESSOR_M4K, 34, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("mips32r5", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("mips32r6", PROCESSOR_W32, 37, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("mips64", PROCESSOR_5KC, 64, PTF_AVOID_BRANCHLIKELY)
-/* ??? For now just tune the generic MIPS64r2 for 5KC as well. */
+/* ??? For now just tune the generic MIPS64r2 and above for 5KC as well. */
MIPS_CPU ("mips64r2", PROCESSOR_5KC, 65, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("mips64r3", PROCESSOR_5KC, 66, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("mips64r5", PROCESSOR_5KC, 68, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("mips64r6", PROCESSOR_I6400, 69, PTF_AVOID_BRANCHLIKELY)
/* MIPS I processors. */
MIPS_CPU ("r3000", PROCESSOR_R3000, 1, 0)
@@ -137,6 +147,9 @@ MIPS_CPU ("1004kf2_1", PROCESSOR_24KF2_1, 33, 0)
MIPS_CPU ("1004kf", PROCESSOR_24KF2_1, 33, 0)
MIPS_CPU ("1004kf1_1", PROCESSOR_24KF1_1, 33, 0)
+/* MIPS32 Release 5 processors. */
+MIPS_CPU ("p5600", PROCESSOR_P5600, 36, PTF_AVOID_BRANCHLIKELY)
+
/* MIPS64 processors. */
MIPS_CPU ("5kc", PROCESSOR_5KC, 64, 0)
MIPS_CPU ("5kf", PROCESSOR_5KF, 64, 0)
@@ -151,4 +164,8 @@ MIPS_CPU ("loongson3a", PROCESSOR_LOONGSON_3A, 65, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("octeon", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("octeon+", PROCESSOR_OCTEON, 65, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("octeon2", PROCESSOR_OCTEON2, 65, PTF_AVOID_BRANCHLIKELY)
+MIPS_CPU ("octeon3", PROCESSOR_OCTEON3, 65, PTF_AVOID_BRANCHLIKELY)
MIPS_CPU ("xlp", PROCESSOR_XLP, 65, PTF_AVOID_BRANCHLIKELY)
+
+/* MIPS64 Release 6 processors. */
+MIPS_CPU ("i6400", PROCESSOR_I6400, 69, PTF_AVOID_BRANCHLIKELY)
diff --git a/gcc/config/mips/mips-dsp.md b/gcc/config/mips/mips-dsp.md
index 58c11fe9af0..54d1b2a6867 100644
--- a/gcc/config/mips/mips-dsp.md
+++ b/gcc/config/mips/mips-dsp.md
@@ -1185,8 +1185,21 @@
(label_ref (match_operand 0 "" ""))
(pc)))]
"ISA_HAS_DSP"
- "%*bposge%1\t%0%/"
- [(set_attr "type" "branch")])
+{
+ if (TARGET_DSPR3 && TARGET_CB_MAYBE)
+ return "%*bposge%1%:\t%0";
+ else
+ return "%*bposge%1\t%0%/";
+}
+ [(set_attr "type" "branch")
+ (set (attr "compact_form") (if_then_else (match_test "TARGET_DSPR3
+ && TARGET_CB_MAYBE")
+ (const_string "maybe")
+ (const_string "never")))
+ (set (attr "hazard") (if_then_else (match_test "TARGET_DSPR3
+ && TARGET_CB_MAYBE")
+ (const_string "forbidden_slot")
+ (const_string "none")))])
(define_expand "mips_madd<u>"
[(set (match_operand:DI 0 "register_operand")
diff --git a/gcc/config/mips/mips-ftypes.def b/gcc/config/mips/mips-ftypes.def
index 078a595a8b1..fde206d1cea 100644
--- a/gcc/config/mips/mips-ftypes.def
+++ b/gcc/config/mips/mips-ftypes.def
@@ -36,6 +36,230 @@ along with GCC; see the file COPYING3. If not see
DEF_MIPS_FTYPE (1, (DF, DF))
DEF_MIPS_FTYPE (2, (DF, DF, DF))
+DEF_MIPS_FTYPE (2, (V16QI, V16QI, V16QI))
+DEF_MIPS_FTYPE (2, (V8HI, V8HI, V8HI))
+DEF_MIPS_FTYPE (2, (V4SI, V4SI, V4SI))
+DEF_MIPS_FTYPE (2, (V2DI, V2DI, V2DI))
+
+DEF_MIPS_FTYPE (2, (UV16QI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (2, (UV8HI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (2, (UV4SI, UV4SI, UV4SI))
+DEF_MIPS_FTYPE (2, (UV2DI, UV2DI, UV2DI))
+
+DEF_MIPS_FTYPE (2, (V16QI, V16QI, UQI))
+DEF_MIPS_FTYPE (2, (V8HI, V8HI, UQI))
+DEF_MIPS_FTYPE (2, (V4SI, V4SI, UQI))
+DEF_MIPS_FTYPE (2, (V2DI, V2DI, UQI))
+
+DEF_MIPS_FTYPE (2, (V16QI, V16QI, QI))
+DEF_MIPS_FTYPE (2, (V8HI, V8HI, QI))
+DEF_MIPS_FTYPE (2, (V4SI, V4SI, QI))
+DEF_MIPS_FTYPE (2, (V2DI, V2DI, QI))
+
+DEF_MIPS_FTYPE (2, (UV16QI, UV16QI, UQI))
+DEF_MIPS_FTYPE (2, (UV8HI, UV8HI, UQI))
+DEF_MIPS_FTYPE (2, (UV4SI, UV4SI, UQI))
+DEF_MIPS_FTYPE (2, (UV2DI, UV2DI, UQI))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, V16QI, V16QI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, V8HI, V8HI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, V4SI, V4SI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, V2DI, V2DI))
+DEF_MIPS_FTYPE (3, (V4SF, V4SI, V4SF, V4SF))
+DEF_MIPS_FTYPE (3, (V2DF, V2DI, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, V16QI, UQI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, V8HI, UQI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, V4SI, UQI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, V2DI, UQI))
+
+DEF_MIPS_FTYPE (2, (SI, V16QI, UQI))
+DEF_MIPS_FTYPE (2, (SI, V8HI, UQI))
+DEF_MIPS_FTYPE (2, (SI, V4SI, UQI))
+DEF_MIPS_FTYPE (2, (SF, V4SF, UQI))
+
+DEF_MIPS_FTYPE (2, (DI, V2DI, UQI))
+DEF_MIPS_FTYPE (2, (DF, V2DF, UQI))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, SI, UQI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, SI, UQI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, SI, UQI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, DI, UQI))
+DEF_MIPS_FTYPE (3, (V4SF, V4SF, SF, UQI))
+DEF_MIPS_FTYPE (3, (V2DF, V2DF, DF, UQI))
+
+DEF_MIPS_FTYPE (2, (V8HI, V16QI, V16QI))
+DEF_MIPS_FTYPE (2, (V4SI, V8HI, V8HI))
+DEF_MIPS_FTYPE (2, (V2DI, V4SI, V4SI))
+DEF_MIPS_FTYPE (2, (UV8HI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (2, (UV4SI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (2, (UV2DI, UV4SI, UV4SI))
+
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, V16QI, V16QI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, V8HI, V8HI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, V4SI, V4SI))
+DEF_MIPS_FTYPE (3, (UV8HI, UV8HI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (3, (UV4SI, UV4SI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (3, (UV2DI, UV2DI, UV4SI, UV4SI))
+
+DEF_MIPS_FTYPE (2, (V4SF, V4SF, V4SF))
+DEF_MIPS_FTYPE (2, (V2DF, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (2, (V4SI, V4SF, V4SF))
+DEF_MIPS_FTYPE (2, (V2DI, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (1, (V4SI, V4SF))
+DEF_MIPS_FTYPE (1, (V2DI, V2DF))
+
+DEF_MIPS_FTYPE (2, (V4SF, V4SF, V4SI))
+DEF_MIPS_FTYPE (2, (V2DF, V2DF, V2DI))
+
+DEF_MIPS_FTYPE (1, (V4SF, V4SI))
+DEF_MIPS_FTYPE (1, (V2DF, V2DI))
+
+DEF_MIPS_FTYPE (1, (V4SF, UV4SI))
+DEF_MIPS_FTYPE (1, (V2DF, UV2DI))
+
+DEF_MIPS_FTYPE (1, (V4SF, V8HI))
+DEF_MIPS_FTYPE (1, (V2DF, V4SI))
+
+DEF_MIPS_FTYPE (3, (V4SF, V4SF, V4SF, V4SF))
+DEF_MIPS_FTYPE (3, (V2DF, V2DF, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (1, (UV4SI, V4SF))
+DEF_MIPS_FTYPE (1, (UV2DI, V2DF))
+
+DEF_MIPS_FTYPE (2, (V8HI, V4SF, V4SF))
+DEF_MIPS_FTYPE (2, (V4SI, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (1, (V16QI, V16QI))
+DEF_MIPS_FTYPE (1, (V8HI, V8HI))
+DEF_MIPS_FTYPE (1, (V4SI, V4SI))
+DEF_MIPS_FTYPE (1, (V2DI, V2DI))
+DEF_MIPS_FTYPE (1, (V4SF, V4SF))
+DEF_MIPS_FTYPE (1, (V2DF, V2DF))
+
+DEF_MIPS_FTYPE (2, (UV16QI, V16QI, V16QI))
+DEF_MIPS_FTYPE (2, (UV8HI, V8HI, V8HI))
+DEF_MIPS_FTYPE (2, (UV4SI, V4SI, V4SI))
+DEF_MIPS_FTYPE (2, (UV2DI, V2DI, V2DI))
+
+DEF_MIPS_FTYPE (2, (V16QI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (2, (V8HI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (2, (V4SI, UV4SI, UV4SI))
+DEF_MIPS_FTYPE (2, (V2DI, UV2DI, UV2DI))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, V16QI, SI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, V8HI, SI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, V4SI, SI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, V2DI, SI))
+DEF_MIPS_FTYPE (3, (V4SF, V4SF, V4SF, SI))
+DEF_MIPS_FTYPE (3, (V2DF, V2DF, V2DF, SI))
+
+DEF_MIPS_FTYPE (3, (V4SF, V4SF, V4SF, UQI))
+DEF_MIPS_FTYPE (3, (V2DF, V2DF, V2DF, UQI))
+
+DEF_MIPS_FTYPE (2, (V16QI, V16QI, SI))
+DEF_MIPS_FTYPE (2, (V8HI, V8HI, SI))
+DEF_MIPS_FTYPE (2, (V4SI, V4SI, SI))
+DEF_MIPS_FTYPE (2, (V2DI, V2DI, SI))
+DEF_MIPS_FTYPE (2, (V4SF, V4SF, SI))
+DEF_MIPS_FTYPE (2, (V2DF, V2DF, SI))
+
+DEF_MIPS_FTYPE (2, (V4SF, V4SF, UQI))
+DEF_MIPS_FTYPE (2, (V2DF, V2DF, UQI))
+
+DEF_MIPS_FTYPE (1, (V16QI, SI))
+DEF_MIPS_FTYPE (1, (V8HI, SI))
+DEF_MIPS_FTYPE (1, (V4SI, SI))
+DEF_MIPS_FTYPE (1, (V2DI, DI))
+DEF_MIPS_FTYPE (1, (V4SF, SF))
+DEF_MIPS_FTYPE (1, (V2DF, DF))
+
+DEF_MIPS_FTYPE (1, (V16QI, HI))
+DEF_MIPS_FTYPE (1, (V8HI, HI))
+DEF_MIPS_FTYPE (1, (V4SI, HI))
+DEF_MIPS_FTYPE (1, (V2DI, HI))
+DEF_MIPS_FTYPE (1, (V4SF, HI))
+DEF_MIPS_FTYPE (1, (V2DF, HI))
+
+DEF_MIPS_FTYPE (1, (SI, UQI))
+DEF_MIPS_FTYPE (2, (VOID, UQI, SI))
+
+/* V8HF is not supported yet. */
+/* DEF_MIPS_FTYPE (1, (V4SF, V8HF)) */
+/* DEF_MIPS_FTYPE (2, (V8HF, V4SF, V4SF)) */
+
+DEF_MIPS_FTYPE (1, (V2DF, V4SF))
+DEF_MIPS_FTYPE (2, (V4SF, V2DF, V2DF))
+
+DEF_MIPS_FTYPE (2, (V16QI, POINTER, SI))
+DEF_MIPS_FTYPE (2, (V8HI, POINTER, SI))
+DEF_MIPS_FTYPE (2, (V4SI, POINTER, SI))
+DEF_MIPS_FTYPE (2, (V2DI, POINTER, SI))
+DEF_MIPS_FTYPE (2, (V4SF, POINTER, SI))
+DEF_MIPS_FTYPE (2, (V2DF, POINTER, SI))
+
+DEF_MIPS_FTYPE (3, (VOID, V16QI, POINTER, SI))
+DEF_MIPS_FTYPE (3, (VOID, V8HI, POINTER, SI))
+DEF_MIPS_FTYPE (3, (VOID, V4SI, POINTER, SI))
+DEF_MIPS_FTYPE (3, (VOID, V2DI, POINTER, SI))
+DEF_MIPS_FTYPE (3, (VOID, V4SF, POINTER, SI))
+DEF_MIPS_FTYPE (3, (VOID, V2DF, POINTER, SI))
+
+DEF_MIPS_FTYPE (1, (SI, V16QI))
+DEF_MIPS_FTYPE (1, (SI, V8HI))
+DEF_MIPS_FTYPE (1, (SI, V4SI))
+DEF_MIPS_FTYPE (1, (SI, V2DI))
+DEF_MIPS_FTYPE (1, (SI, V4SF))
+DEF_MIPS_FTYPE (1, (SI, V2DF))
+
+DEF_MIPS_FTYPE (1, (SF, V4SF))
+DEF_MIPS_FTYPE (1, (DF, V2DF))
+
+DEF_MIPS_FTYPE (2, (UV16QI, UV16QI, V16QI))
+DEF_MIPS_FTYPE (2, (UV8HI, UV8HI, V8HI))
+DEF_MIPS_FTYPE (2, (UV4SI, UV4SI, V4SI))
+DEF_MIPS_FTYPE (2, (UV2DI, UV2DI, V2DI))
+
+DEF_MIPS_FTYPE (2, (V8HI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (2, (V4SI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (2, (V2DI, UV4SI, UV4SI))
+
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, UV4SI, UV4SI))
+
+DEF_MIPS_FTYPE (3, (UV16QI, UV16QI, UV16QI, UV16QI))
+DEF_MIPS_FTYPE (3, (UV8HI, UV8HI, UV8HI, UV8HI))
+DEF_MIPS_FTYPE (3, (UV4SI, UV4SI, UV4SI, UV4SI))
+DEF_MIPS_FTYPE (3, (UV2DI, UV2DI, UV2DI, UV2DI))
+
+DEF_MIPS_FTYPE (3, (UV16QI, UV16QI, UV16QI, UQI))
+DEF_MIPS_FTYPE (3, (UV8HI, UV8HI, UV8HI, UQI))
+DEF_MIPS_FTYPE (3, (UV4SI, UV4SI, UV4SI, UQI))
+DEF_MIPS_FTYPE (3, (UV2DI, UV2DI, UV2DI, UQI))
+
+DEF_MIPS_FTYPE (1, (SI, UV16QI))
+DEF_MIPS_FTYPE (1, (SI, UV8HI))
+DEF_MIPS_FTYPE (1, (SI, UV4SI))
+DEF_MIPS_FTYPE (1, (SI, UV2DI))
+
+DEF_MIPS_FTYPE (2, (V16QI, UV16QI, UQI))
+DEF_MIPS_FTYPE (2, (V8HI, UV8HI, UQI))
+DEF_MIPS_FTYPE (2, (V4SI, UV4SI, UQI))
+DEF_MIPS_FTYPE (2, (V2DI, UV2DI, UQI))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, UQI, SI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, UQI, SI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, UQI, SI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, UQI, DI))
+
+DEF_MIPS_FTYPE (3, (V16QI, V16QI, UQI, V16QI))
+DEF_MIPS_FTYPE (3, (V8HI, V8HI, UQI, V8HI))
+DEF_MIPS_FTYPE (3, (V4SI, V4SI, UQI, V4SI))
+DEF_MIPS_FTYPE (3, (V2DI, V2DI, UQI, V2DI))
+
DEF_MIPS_FTYPE (2, (DI, DI, DI))
DEF_MIPS_FTYPE (2, (DI, DI, SI))
DEF_MIPS_FTYPE (3, (DI, DI, SI, SI))
diff --git a/gcc/config/mips/mips-modes.def b/gcc/config/mips/mips-modes.def
index fa1d1e7d682..d87d100192d 100644
--- a/gcc/config/mips/mips-modes.def
+++ b/gcc/config/mips/mips-modes.def
@@ -24,11 +24,17 @@ VECTOR_MODES (INT, 4); /* V4QI V2HI */
VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */
VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */
+/* For MIPS MSA 128 bits. */
+VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
+VECTOR_MODES (FLOAT, 16); /* V4SF V2DF */
+
/* Double-sized vector modes for vec_concat. */
-VECTOR_MODE (INT, QI, 16); /* V16QI */
-VECTOR_MODE (INT, HI, 8); /* V8HI */
-VECTOR_MODE (INT, SI, 4); /* V4SI */
-VECTOR_MODE (FLOAT, SF, 4); /* V4SF */
+VECTOR_MODE (INT, QI, 32); /* V32QI */
+VECTOR_MODE (INT, HI, 16); /* V16HI */
+VECTOR_MODE (INT, SI, 8); /* V8SI */
+VECTOR_MODE (INT, DI, 4); /* V4DI */
+VECTOR_MODE (FLOAT, SF, 8); /* V8SF */
+VECTOR_MODE (FLOAT, DF, 4); /* V4DF */
VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */
VECTOR_MODES (UFRACT, 4); /* V4UQQ V2UHQ */
@@ -46,3 +52,6 @@ ADJUST_ALIGNMENT (CCV4, 16);
/* For MIPS DSP control registers. */
CC_MODE (CCDSP);
+
+/* For floating point conditions in FP registers. */
+CC_MODE (CCF);
diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md
new file mode 100644
index 00000000000..b2da26ddb84
--- /dev/null
+++ b/gcc/config/mips/mips-msa.md
@@ -0,0 +1,3264 @@
+;; Machine Description for MIPS MSA ASE
+;; Based on the MIPS MSA spec Revision 1.07 30/8/2013
+;; Contributed by Chao-ying Fu (fu@mips.com), MIPS Technologies, Inc.
+;;
+;; Copyright (C) 2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+
+(define_c_enum "unspec" [
+ UNSPEC_MSA_ASUB_S
+ UNSPEC_MSA_ASUB_U
+ UNSPEC_MSA_AVE_S
+ UNSPEC_MSA_AVE_U
+ UNSPEC_MSA_AVER_S
+ UNSPEC_MSA_AVER_U
+ UNSPEC_MSA_BCLR
+ UNSPEC_MSA_BCLRI
+ UNSPEC_MSA_BINSL
+ UNSPEC_MSA_BINSLI
+ UNSPEC_MSA_BINSR
+ UNSPEC_MSA_BINSRI
+ UNSPEC_MSA_BMNZ_V
+ UNSPEC_MSA_BMNZI_B
+ UNSPEC_MSA_BMZ_V
+ UNSPEC_MSA_BMZI_B
+ UNSPEC_MSA_BNEG
+ UNSPEC_MSA_BNEGI
+ UNSPEC_MSA_BSEL_V
+ UNSPEC_MSA_BSELI_B
+ UNSPEC_MSA_BSET
+ UNSPEC_MSA_BSETI
+ UNSPEC_MSA_BNZ_V
+ UNSPEC_MSA_BZ_V
+ UNSPEC_MSA_BNZ
+ UNSPEC_MSA_BZ
+ UNSPEC_MSA_CFCMSA
+ UNSPEC_MSA_CMPI
+ UNSPEC_MSA_CTCMSA
+ UNSPEC_MSA_DOTP_S
+ UNSPEC_MSA_DOTP_U
+ UNSPEC_MSA_DPADD_S
+ UNSPEC_MSA_DPADD_U
+ UNSPEC_MSA_DPSUB_S
+ UNSPEC_MSA_DPSUB_U
+ UNSPEC_MSA_FCAF
+ UNSPEC_MSA_FCLASS
+ UNSPEC_MSA_FCUNE
+ UNSPEC_MSA_FEXDO
+ UNSPEC_MSA_FEXP2
+ UNSPEC_MSA_FEXUPL
+ UNSPEC_MSA_FEXUPR
+ UNSPEC_MSA_FFINT_S
+ UNSPEC_MSA_FFINT_U
+ UNSPEC_MSA_FFQL
+ UNSPEC_MSA_FFQR
+ UNSPEC_MSA_FLOG2
+ UNSPEC_MSA_FMAX_A
+ UNSPEC_MSA_FMIN_A
+ UNSPEC_MSA_FRCP
+ UNSPEC_MSA_FRINT
+ UNSPEC_MSA_FRSQRT
+ UNSPEC_MSA_FSAF
+ UNSPEC_MSA_FSEQ
+ UNSPEC_MSA_FSLE
+ UNSPEC_MSA_FSLT
+ UNSPEC_MSA_FSNE
+ UNSPEC_MSA_FSOR
+ UNSPEC_MSA_FSUEQ
+ UNSPEC_MSA_FSULE
+ UNSPEC_MSA_FSULT
+ UNSPEC_MSA_FSUN
+ UNSPEC_MSA_FSUNE
+ UNSPEC_MSA_FTINT_S
+ UNSPEC_MSA_FTINT_U
+ UNSPEC_MSA_FTRUNC_S
+ UNSPEC_MSA_FTRUNC_U
+ UNSPEC_MSA_FTQ
+ UNSPEC_MSA_HADD_S
+ UNSPEC_MSA_HADD_U
+ UNSPEC_MSA_HSUB_S
+ UNSPEC_MSA_HSUB_U
+ UNSPEC_MSA_MADD_Q
+ UNSPEC_MSA_MADDR_Q
+ UNSPEC_MSA_MAX_A
+ UNSPEC_MSA_MAXI_S
+ UNSPEC_MSA_MAXI_U
+ UNSPEC_MSA_MIN_A
+ UNSPEC_MSA_MINI_S
+ UNSPEC_MSA_MINI_U
+ UNSPEC_MSA_MSUB_Q
+ UNSPEC_MSA_MSUBR_Q
+ UNSPEC_MSA_MUL_Q
+ UNSPEC_MSA_MULR_Q
+ UNSPEC_MSA_NLOC
+ UNSPEC_MSA_NLZC
+ UNSPEC_MSA_NORI_B
+ UNSPEC_MSA_ORI_B
+ UNSPEC_MSA_PCKEV
+ UNSPEC_MSA_PCKOD
+ UNSPEC_MSA_PCNT
+ UNSPEC_MSA_SAT_S
+ UNSPEC_MSA_SAT_U
+ UNSPEC_MSA_SHF
+ UNSPEC_MSA_SLD
+ UNSPEC_MSA_SLDI
+ UNSPEC_MSA_SLLI
+ UNSPEC_MSA_SPLAT
+ UNSPEC_MSA_SPLATI
+ UNSPEC_MSA_SRAI
+ UNSPEC_MSA_SRAR
+ UNSPEC_MSA_SRARI
+ UNSPEC_MSA_SRLI
+ UNSPEC_MSA_SRLR
+ UNSPEC_MSA_SRLRI
+ UNSPEC_MSA_SUBS_S
+ UNSPEC_MSA_SUBS_U
+ UNSPEC_MSA_SUBSUU_S
+ UNSPEC_MSA_SUBSUS_U
+ UNSPEC_MSA_SUBVI
+ UNSPEC_MSA_TSTNZ_V
+ UNSPEC_MSA_TSTZ_V
+ UNSPEC_MSA_TSTNZ
+ UNSPEC_MSA_TSTZ
+ UNSPEC_MSA_VSHF
+ UNSPEC_MSA_XORI_B
+ UNSPEC_MSA_CAST_TO_SCALAR
+ UNSPEC_MSA_CAST_TO_VECTOR
+])
+
+;; Attributes to categorize MSA instructions based on execution units
+(define_attr "msa_execunit"
+ "unknown, msa_eu_div, msa_eu_float2, msa_eu_float2_l,
+ msa_eu_float4, msa_eu_float5, msa_eu_float8, msa_eu_logic,
+ msa_eu_logic3, msa_eu_logic_l, msa_eu_mult, msa_eu_cmp,
+ msa_eu_store4, msa_eu_int_add, msa_eu_fdiv, msa_eu_logic_l2, msa_eu_logic2"
+ (const_string "unknown"))
+
+(define_attr "datafmt"
+ "unknown, d, w, h, b"
+ (const_string "unknown"))
+
+;; All vector modes with 128 bits.
+(define_mode_iterator MODE128 [V2DF V4SF V2DI V4SI V8HI V16QI])
+(define_mode_iterator MSA [V2DF V4SF V2DI V4SI V8HI V16QI])
+
+;; Same as MSA. Used by vcond to iterate two modes.
+(define_mode_iterator MSA_2 [V2DF V4SF V2DI V4SI V8HI V16QI])
+
+;; Only integer modes.
+(define_mode_iterator IMSA [V2DI V4SI V8HI V16QI])
+
+;; Modes that can combine a copy+insert into an insve.
+(define_mode_iterator INSVE [V2DI V4SI])
+
+;; Modes that can combine a copy+insert with a subreg into an insve.
+(define_mode_iterator INSVE_2 [V8HI V16QI])
+
+;; As IMSA but excludes V16QI.
+(define_mode_iterator IMSA_X [V2DI V4SI V8HI])
+
+;; As IMSA but excludes V2DI.
+(define_mode_iterator IMSA_X2 [V4SI V8HI V16QI])
+
+;; Only integer modes for fixed-point madd_q/maddr_q.
+(define_mode_iterator QMSA [V4SI V8HI])
+
+;; Only floating-point modes.
+(define_mode_iterator FMSA [V2DF V4SF])
+
+;; Only integer modes for dot product.
+(define_mode_iterator IDOTP128 [V2DI V4SI V8HI])
+
+;; Only used in splitters.
+(define_mode_iterator SPLIT [V2DI V2DF])
+
+(define_mode_attr DMSA
+ [(V2DI "V4DI")
+ (V4SI "V8SI")
+ (V8HI "V16HI")
+ (V16QI "V32QI")])
+
+;; Only used with the SPLIT iterator.
+(define_mode_attr predicate
+ [(V2DI "reg_or_0")
+ (V2DF "register")])
+
+(define_mode_attr VHALFMODE
+ [(V8HI "V16QI")
+ (V4SI "V8HI")
+ (V2DI "V4SI")
+ (V2DF "V4SF")])
+
+(define_mode_attr VDMODE
+ [(V4SI "V2DI")
+ (V8HI "V4SI")
+ (V16QI "V8HI")])
+
+;; This attribute gives the integer vector mode of the same size.
+(define_mode_attr VIMODE
+ [(V2DF "V2DI")
+ (V4SF "V4SI")
+ (V2DI "V2DI")
+ (V4SI "V4SI")
+ (V8HI "V8HI")
+ (V16QI "V16QI")])
+
+;; As above, but in lower case for use in instruction pattern names.
+(define_mode_attr mode_i
+ [(V2DF "v2di")
+ (V4SF "v4si")
+ (V2DI "v2di")
+ (V4SI "v4si")
+ (V8HI "v8hi")
+ (V16QI "v16qi")])
+
+;; This attribute gives the mode of the result for "copy_s_b, copy_u_b" etc.
+(define_mode_attr RES
+ [(V2DF "DF")
+ (V4SF "SF")
+ (V2DI "DI")
+ (V4SI "SI")
+ (V8HI "SI")
+ (V16QI "SI")])
+
+;; This attribute gives the suffix for MSA instructions.
+(define_mode_attr msafmt
+ [(V2DF "d")
+ (V4SF "w")
+ (V2DI "d")
+ (V4SI "w")
+ (V8HI "h")
+ (V16QI "b")])
+
+;; This is used in msa_cast* to output mov.s or mov.d.
+(define_mode_attr unitfmt
+ [(V2DF "d")
+ (V4SF "s")])
+
+;; This attribute gives the define_insn suffix for MSA instructions that
+;; need a distinction between integer and floating point.
+(define_mode_attr msafmt_f
+ [(V2DF "d_f")
+ (V4SF "w_f")
+ (V2DI "d")
+ (V4SI "w")
+ (V8HI "h")
+ (V16QI "b")])
+
+;; The mask for shift amounts.
+(define_mode_attr shift_mask
+ [(V2DI "63")
+ (V4SI "31")
+ (V8HI "15")
+ (V16QI "7")])
+
+;; This is used to form an immediate operand constraint using
+;; "const_<indeximm>_operand".
+(define_mode_attr indeximm
+ [(V2DF "0_or_1")
+ (V4SF "0_to_3")
+ (V2DI "0_or_1")
+ (V4SI "0_to_3")
+ (V8HI "uimm3")
+ (V16QI "uimm4")])
+
+;; To represent the bitmask needed for vec_merge, using "const_<bitmask>_operand".
+(define_mode_attr bitmask
+ [(V2DF "exp_2")
+ (V4SF "exp_4")
+ (V2DI "exp_2")
+ (V4SI "exp_4")
+ (V8HI "exp_8")
+ (V16QI "exp_16")])
+
+;; This attribute is used to form the MODE of an input operand
+;; when some builtins (insert and fill) take an input operand other than
+;; the UNITMODE mode.  See msa_insert and msa_fill for examples.
+(define_mode_attr EXCEPT
+ [(V2DI "DI")
+ (V4SI "SI")
+ (V8HI "SI")
+ (V16QI "SI")])
+
+;; This attribute is used to form an immediate operand constraint using
+;; "const_<bitimm>_operand".
+(define_mode_attr bitimm
+ [(V16QI "uimm3")
+ (V8HI "uimm4")
+ (V4SI "uimm5")
+ (V2DI "uimm6")
+ ])
+
+(define_expand "vec_init<mode>"
+ [(match_operand:MSA 0 "register_operand")
+ (match_operand:MSA 1 "")]
+ "ISA_HAS_MSA"
+{
+ mips_expand_vector_init (operands[0], operands[1]);
+ DONE;
+})
+
+(define_mode_attr hmsafmt
+ [(V2DI "w")
+ (V4SI "h")
+ (V8HI "b")])
+
+(define_expand "fixuns_trunc<FMSA:mode><mode_i>2"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (unsigned_fix:<VIMODE> (match_operand:FMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ {
+ emit_insn (gen_msa_ftrunc_u_<msafmt> (operands[0], operands[1]));
+ DONE;
+ })
+
+(define_expand "fix_trunc<FMSA:mode><mode_i>2"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (fix:<VIMODE> (match_operand:FMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ {
+ emit_insn (gen_msa_ftrunc_s_<msafmt> (operands[0], operands[1]));
+ DONE;
+ })
+
+(define_expand "vec_pack_trunc_v2df"
+ [(set (match_operand:V4SF 0 "register_operand")
+ (vec_concat:V4SF
+ (float_truncate:V2SF (match_operand:V2DF 1 "register_operand"))
+ (float_truncate:V2SF (match_operand:V2DF 2 "register_operand"))))]
+ "ISA_HAS_MSA"
+ "")
+
+;; pckev pattern with implicit type conversion.
+(define_insn "vec_pack_trunc_<mode>"
+ [(set (match_operand:<VHALFMODE> 0 "register_operand" "=f")
+ (unspec:<VHALFMODE> [(match_operand:IMSA_X 1 "register_operand" "f")
+ (match_operand:IMSA_X 2 "register_operand" "f")]
+ UNSPEC_MSA_PCKEV))]
+ "ISA_HAS_MSA"
+ "pckev.<hmsafmt>\t%w0,%w2,%w1"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_expand "vec_unpacks_hi_v4sf"
+ [(set (match_operand:V2DF 0 "register_operand" "=f")
+ (float_extend:V2DF
+ (vec_select:V2SF
+ (match_operand:V4SF 1 "register_operand" "f")
+ (parallel [(const_int 0) (const_int 1)])
+ )))]
+ "ISA_HAS_MSA"
+ {
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_msa_fexupr_d (operands[0], operands[1]));
+ else
+ emit_insn (gen_msa_fexupl_d (operands[0], operands[1]));
+ DONE;
+ })
+
+(define_expand "vec_unpacks_lo_v4sf"
+ [(set (match_operand:V2DF 0 "register_operand" "=f")
+ (float_extend:V2DF
+ (vec_select:V2SF
+ (match_operand:V4SF 1 "register_operand" "f")
+ (parallel [(const_int 0) (const_int 1)])
+ )))]
+ "ISA_HAS_MSA"
+ {
+ if (BYTES_BIG_ENDIAN)
+ emit_insn (gen_msa_fexupl_d (operands[0], operands[1]));
+ else
+ emit_insn (gen_msa_fexupr_d (operands[0], operands[1]));
+ DONE;
+ })
+
+(define_expand "vec_unpacks_hi_<mode>"
+ [(set (match_operand:<VDMODE> 0 "register_operand")
+ (match_operand:IMSA_X2 1 "register_operand"))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_vec_unpack (operands, false/*unsigned_p*/, true/*high_p*/);
+ DONE;
+ })
+
+(define_expand "vec_unpacks_lo_<mode>"
+ [(set (match_operand:<VDMODE> 0 "register_operand")
+ (match_operand:IMSA_X2 1 "register_operand"))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_vec_unpack (operands, false/*unsigned_p*/, false/*high_p*/);
+ DONE;
+ })
+
+(define_expand "vec_unpacku_hi_<mode>"
+ [(set (match_operand:<VDMODE> 0 "register_operand")
+ (match_operand:IMSA_X2 1 "register_operand"))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_vec_unpack (operands, true/*unsigned_p*/, true/*high_p*/);
+ DONE;
+ })
+
+(define_expand "vec_unpacku_lo_<mode>"
+ [(set (match_operand:<VDMODE> 0 "register_operand")
+ (match_operand:IMSA_X2 1 "register_operand"))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_vec_unpack (operands, true/*unsigned_p*/, false/*high_p*/);
+ DONE;
+ })
+
+(define_expand "vec_extract<mode>"
+ [(match_operand:<UNITMODE> 0 "register_operand")
+ (match_operand:IMSA 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")]
+ "ISA_HAS_MSA"
+{
+ if (<UNITMODE>mode == QImode || <UNITMODE>mode == HImode)
+ {
+ rtx dest1 = gen_reg_rtx (SImode);
+ emit_insn (gen_msa_copy_s_<msafmt> (dest1, operands[1], operands[2]));
+ emit_move_insn (operands[0],
+ gen_lowpart (<UNITMODE>mode, dest1));
+ }
+ else
+ emit_insn (gen_msa_copy_s_<msafmt> (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "vec_extract<mode>"
+ [(match_operand:<UNITMODE> 0 "register_operand")
+ (match_operand:FMSA 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")]
+ "ISA_HAS_MSA"
+{
+ rtx temp;
+ HOST_WIDE_INT val = UINTVAL (operands[2]);
+
+ if (val == 0)
+ temp = operands[1];
+ else
+ {
+ /* We need to do the SLDI operation in V16QImode and adjust
+ operand[2] accordingly. */
+ rtx tempb = gen_reg_rtx (V16QImode);
+ rtx op1b = gen_reg_rtx (V16QImode);
+ emit_move_insn (op1b, gen_rtx_SUBREG (V16QImode, operands[1], 0));
+ rtx op2b = GEN_INT (val * GET_MODE_SIZE (<UNITMODE>mode));
+ gcc_assert (UINTVAL (op2b) < GET_MODE_NUNITS (V16QImode));
+ emit_insn (gen_msa_sldi_b (tempb, op1b, op1b, op2b));
+ temp = gen_reg_rtx (<MODE>mode);
+ emit_move_insn (temp, gen_rtx_SUBREG (<MODE>mode, tempb, 0));
+ }
+ emit_insn (gen_msa_cast_to_scalar_<msafmt_f> (operands[0], temp));
+ DONE;
+})
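+
+;; Note on the expander above: with both vector operands the same, sldi.b in
+;; effect rotates the vector by val * element-size bytes (8 bytes for element
+;; 1 of a V2DFmode vector), bringing the requested element down to element 0
+;; before the cast-to-scalar pattern reads it.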
+
+(define_expand "vec_set<mode>"
+ [(match_operand:IMSA 0 "register_operand")
+ (match_operand:<UNITMODE> 1 "reg_or_0_operand")
+ (match_operand 2 "const_<indeximm>_operand")]
+ "ISA_HAS_MSA"
+{
+ emit_insn (gen_msa_insert_<msafmt>_insn (operands[0], operands[1],
+ operands[0], GEN_INT(1 << INTVAL (operands[2]))));
+ DONE;
+})
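+
+;; The vec_merge mask here is a one-hot bitmask: inserting at element index 2,
+;; for instance, passes GEN_INT (1 << 2) == 4, which marks only the third
+;; element of the destination for replacement.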
+
+(define_expand "vec_set<mode>"
+ [(match_operand:FMSA 0 "register_operand")
+ (match_operand:<UNITMODE> 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")]
+ "ISA_HAS_MSA"
+{
+ emit_insn (gen_msa_insve_<msafmt_f>_s (operands[0], operands[0],
+ GEN_INT(1 << INTVAL (operands[2])), operands[1]));
+ DONE;
+})
+
+(define_expand "vcondu<MSA_2:mode><IMSA:mode>"
+ [(set (match_operand:MSA_2 0 "register_operand")
+ (if_then_else:MSA_2
+ (match_operator 3 ""
+ [(match_operand:IMSA 4 "register_operand")
+ (match_operand:IMSA 5 "register_operand")])
+ (match_operand:MSA_2 1 "reg_or_m1_operand")
+ (match_operand:MSA_2 2 "reg_or_0_operand")))]
+ "ISA_HAS_MSA
+ && (GET_MODE_NUNITS (<MSA_2:MODE>mode)
+ == GET_MODE_NUNITS (<IMSA:MODE>mode))"
+{
+ mips_expand_vec_cond_expr (<MSA_2:MODE>mode,
+ <MSA_2:VIMODE>mode,
+ operands,
+ gen_and<MSA_2:mode_i>3,
+ gen_msa_nor_v_<MSA_2:msafmt>,
+ gen_ior<MSA_2:mode_i>3);
+ DONE;
+})
+
+(define_expand "vcond<MSA_2:mode><MSA:mode>"
+ [(set (match_operand:MSA_2 0 "register_operand")
+ (if_then_else:MSA_2
+ (match_operator 3 ""
+ [(match_operand:MSA 4 "register_operand")
+ (match_operand:MSA 5 "register_operand")])
+ (match_operand:MSA_2 1 "reg_or_m1_operand")
+ (match_operand:MSA_2 2 "reg_or_0_operand")))]
+ "ISA_HAS_MSA
+ && (GET_MODE_NUNITS (<MSA_2:MODE>mode)
+ == GET_MODE_NUNITS (<MSA:MODE>mode))"
+{
+ mips_expand_vec_cond_expr (<MSA_2:MODE>mode,
+ <MSA_2:VIMODE>mode,
+ operands,
+ gen_and<MSA_2:mode_i>3,
+ gen_msa_nor_v_<MSA_2:msafmt>,
+ gen_ior<MSA_2:mode_i>3);
+ DONE;
+})
+
+;; Not used directly by builtins; they go via the following define_expand.
+(define_insn "msa_insert_<msafmt>_insn"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (vec_merge:IMSA (vec_duplicate:IMSA
+ (match_operand:<UNITMODE> 1 "reg_or_0_operand" "dJ"))
+ (match_operand:IMSA 2 "register_operand" "0")
+ (match_operand 3 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insert.<msafmt>\t%w0[%K3],%z1"
+ [(set_attr "type" "mtc")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Expand the builtin, catering for HImode and QImode, which take SImode values.
+(define_expand "msa_insert_<msafmt>"
+ [(match_operand:IMSA 0 "register_operand")
+ (match_operand:IMSA 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")
+ (match_operand:<EXCEPT> 3 "reg_or_0_operand")]
+ "ISA_HAS_MSA"
+{
+ if ((GET_MODE_SIZE (<UNITMODE>mode) < GET_MODE_SIZE (<EXCEPT>mode))
+ && (REG_P (operands[3]) || (GET_CODE (operands[3]) == SUBREG
+ && REG_P (SUBREG_REG (operands[3])))))
+ {
+ unsigned int offset = GET_MODE_SIZE (<EXCEPT>mode)
+ - GET_MODE_SIZE (<UNITMODE>mode);
+ operands[3] = simplify_gen_subreg (<UNITMODE>mode, operands[3],
+ GET_MODE (operands[3]),
+ BYTES_BIG_ENDIAN ? offset : 0);
+ }
+ emit_insn (gen_msa_insert_<msafmt>_insn (operands[0], operands[3],
+ operands[1], GEN_INT(1 << INTVAL (operands[2]))));
+ DONE;
+})
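+
+;; The subreg adjustment above handles QImode/HImode inserts whose builtin
+;; argument arrives in SImode: on big-endian targets the narrow value sits at
+;; the high-order end of the wider register, so the subreg byte offset is the
+;; size difference (3 for QImode in SImode); on little-endian targets it is 0.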
+
+(define_expand "msa_insert_<msafmt_f>"
+ [(match_operand:FMSA 0 "register_operand")
+ (match_operand:FMSA 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")
+ (match_operand:<UNITMODE> 3 "reg_or_0_operand")]
+ "ISA_HAS_MSA"
+{
+ emit_insn (gen_msa_insert_<msafmt_f>_insn (operands[0], operands[3],
+ operands[1], GEN_INT(1 << INTVAL (operands[2]))));
+ DONE;
+})
+
+(define_insn "msa_insert_<msafmt_f>_insn"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (vec_merge:FMSA (vec_duplicate:FMSA
+ (match_operand:<UNITMODE> 1 "register_operand" "d"))
+ (match_operand:FMSA 2 "register_operand" "0")
+ (match_operand 3 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insert.<msafmt>\t%w0[%K3],%z1"
+ [(set_attr "type" "mtc")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_split
+ [(set (match_operand:SPLIT 0 "register_operand")
+ (vec_merge:SPLIT
+ (vec_duplicate:SPLIT
+ (match_operand:<UNITMODE> 1 "<SPLIT:predicate>_operand"))
+ (match_operand:SPLIT 2 "register_operand")
+ (match_operand 3 "const_<bitmask>_operand")))]
+ "reload_completed && TARGET_MSA && !TARGET_64BIT"
+ [(const_int 0)]
+{
+ mips_split_msa_insert_d (operands[0], operands[2], operands[3], operands[1]);
+ DONE;
+})
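+
+;; On !TARGET_64BIT a doubleword element cannot be moved through a single GPR,
+;; so the insert above is split after reload by mips_split_msa_insert_d into
+;; word-sized operations.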
+
+;; Used by combine to convert a copy_s + insert into an insve
+(define_insn "msa_insve_s_insn_<msafmt>"
+ [(set (match_operand:INSVE 0 "register_operand" "=f")
+ (vec_merge:INSVE
+ (vec_duplicate:INSVE
+ (sign_extend:<UNITMODE>
+ (vec_select:<UNITMODE>
+ (match_operand:INSVE 3 "register_operand" "f")
+ (parallel [(match_operand 4 "const_0_operand" "")]))))
+ (match_operand:INSVE 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Used by combine to convert a copy_u + insert into an insve
+(define_insn "msa_insve_u_insn_<msafmt>"
+ [(set (match_operand:INSVE 0 "register_operand" "=f")
+ (vec_merge:INSVE
+ (vec_duplicate:INSVE
+ (zero_extend:<UNITMODE>
+ (vec_select:<UNITMODE>
+ (match_operand:INSVE 3 "register_operand" "f")
+ (parallel [(match_operand 4 "const_0_operand" "")]))))
+ (match_operand:INSVE 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Used by combine to convert a copy_s + insert with subreg into an insve
+(define_insn "*msa_insve_sext_insn_<msafmt>"
+ [(set (match_operand:INSVE_2 0 "register_operand" "=f")
+ (vec_merge:INSVE_2
+ (vec_duplicate:INSVE_2
+ (subreg:<UNITMODE>
+ (sign_extend:<RES>
+ (vec_select:<UNITMODE>
+ (match_operand:INSVE_2 3 "register_operand" "f")
+ (parallel [(match_operand 4 "const_0_operand" "")]))) 0))
+ (match_operand:INSVE_2 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Used by combine to convert a copy_u + insert with subreg into an insve
+(define_insn "*msa_insve_zext_insn_<msafmt>"
+ [(set (match_operand:INSVE_2 0 "register_operand" "=f")
+ (vec_merge:INSVE_2
+ (vec_duplicate:INSVE_2
+ (subreg:<UNITMODE>
+ (zero_extend:<RES>
+ (vec_select:<RES>
+ (match_operand:INSVE_2 3 "register_operand" "f")
+ (parallel [(match_operand 4 "const_0_operand" "")]))) 0))
+ (match_operand:INSVE_2 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_expand "msa_insve_<msafmt_f>"
+ [(set (match_operand:MSA 0 "register_operand")
+ (vec_merge:MSA (vec_duplicate:MSA
+ (vec_select:<UNITMODE>
+ (match_operand:MSA 3 "register_operand")
+ (parallel [(const_int 0)])))
+ (match_operand:MSA 1 "register_operand")
+ (match_operand 2 "const_<indeximm>_operand")))]
+ "ISA_HAS_MSA"
+ {
+ operands[2] = GEN_INT ((1 << INTVAL (operands[2])));
+ })
+
+(define_insn "msa_insve_<msafmt_f>_insn"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (vec_merge:MSA (vec_duplicate:MSA
+ (vec_select:<UNITMODE>
+ (match_operand:MSA 3 "register_operand" "f")
+ (parallel [(const_int 0)])))
+ (match_operand:MSA 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Operand 3 is a scalar.
+(define_insn "msa_insve_<msafmt>_f_s"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (vec_merge:FMSA (vec_duplicate:FMSA
+ (match_operand:<UNITMODE> 3 "register_operand" "f"))
+ (match_operand:FMSA 1 "register_operand" "0")
+ (match_operand 2 "const_<bitmask>_operand" "")))]
+ "ISA_HAS_MSA"
+ "insve.<msafmt>\t%w0[%K2],%w3[0]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+;; Note that copy_s.d and copy_s.d_f will be split later if !TARGET_64BIT.
+(define_insn "msa_copy_s_<msafmt_f>"
+ [(set (match_operand:<RES> 0 "register_operand" "=d")
+ (sign_extend:<RES>
+ (vec_select:<UNITMODE>
+ (match_operand:MSA 1 "register_operand" "f")
+ (parallel [(match_operand 2 "const_<indeximm>_operand" "")]))))]
+ "ISA_HAS_MSA"
+ "copy_s.<msafmt>\t%0,%w1[%2]"
+ [(set_attr "type" "mfc")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_split
+ [(set (match_operand:<UNITMODE> 0 "register_operand")
+ (sign_extend:<UNITMODE>
+ (vec_select:<UNITMODE>
+ (match_operand:SPLIT 1 "register_operand")
+ (parallel [(match_operand 2 "const_0_or_1_operand")]))))]
+ "reload_completed && TARGET_MSA && !TARGET_64BIT"
+ [(const_int 0)]
+{
+ mips_split_msa_copy_d (operands[0], operands[1], operands[2], gen_msa_copy_s_w);
+ DONE;
+})
+
+;; Note that copy_u.d and copy_u.d_f will be split later if !TARGET_64BIT.
+(define_insn "msa_copy_u_<msafmt_f>"
+ [(set (match_operand:<RES> 0 "register_operand" "=d")
+ (zero_extend:<RES>
+ (vec_select:<UNITMODE>
+ (match_operand:MSA 1 "register_operand" "f")
+ (parallel [(match_operand 2 "const_<indeximm>_operand" "")]))))]
+ "ISA_HAS_MSA"
+ "copy_u.<msafmt>\t%0,%w1[%2]"
+ [(set_attr "type" "mfc")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_split
+ [(set (match_operand:<UNITMODE> 0 "register_operand")
+ (zero_extend:<UNITMODE>
+ (vec_select:<UNITMODE>
+ (match_operand:SPLIT 1 "register_operand")
+ (parallel [(match_operand 2 "const_0_or_1_operand")]))))]
+ "reload_completed && TARGET_MSA && !TARGET_64BIT"
+ [(const_int 0)]
+{
+ mips_split_msa_copy_d (operands[0], operands[1], operands[2], gen_msa_copy_u_w);
+ DONE;
+})
+
+(define_expand "vec_perm<mode>"
+ [(match_operand:MSA 0 "register_operand")
+ (match_operand:MSA 1 "register_operand")
+ (match_operand:MSA 2 "register_operand")
+ (match_operand:<VIMODE> 3 "register_operand")]
+ "ISA_HAS_MSA"
+{
+ /* The optab semantics are that index 0 selects the first element
+ of operands[1] and the highest index selects the last element
+ of operands[2]. This is the opposite order from "vshf.df wd,ws,wt"
+ where index 0 selects the first element of wt and the highest index
+ selects the last element of ws. We therefore swap the operands here. */
+ emit_insn (gen_msa_vshf<mode> (operands[0], operands[3], operands[2],
+ operands[1]));
+ DONE;
+})
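+;; For illustration only: with V4SI inputs and a selector vector of
+;; {1, 4, 2, 7}, the optab expects {op1[1], op2[0], op1[2], op2[3]}.  After
+;; the swap above, vshf.w sees wt = op1 and ws = op2, so index 1 picks
+;; wt[1] = op1[1] and index 4 picks ws[0] = op2[0], matching the optab.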
+
+(define_expand "vec_perm_const<mode>"
+ [(match_operand:MSA 0 "register_operand")
+ (match_operand:MSA 1 "register_operand")
+ (match_operand:MSA 2 "register_operand")
+ (match_operand:<VIMODE> 3 "")]
+ "ISA_HAS_MSA"
+{
+ if (mips_expand_vec_perm_const (operands))
+ DONE;
+ else
+ FAIL;
+})
+
+(define_expand "abs<mode>2"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (abs:IMSA (match_operand:IMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+{
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_msa_ldi<mode> (reg, const0_rtx));
+ emit_insn (gen_msa_add_a_<msafmt> (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "neg<mode>2"
+ [(match_operand:IMSA 0 "register_operand")
+ (match_operand:IMSA 1 "register_operand")]
+ "ISA_HAS_MSA"
+{
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_msa_ldi<mode> (reg, const0_rtx));
+ emit_insn (gen_sub<mode>3 (operands[0], reg, operands[1]));
+ DONE;
+})
+
+(define_expand "neg<mode>2"
+ [(match_operand:FMSA 0 "register_operand")
+ (match_operand:FMSA 1 "register_operand")]
+ "ISA_HAS_MSA"
+{
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ emit_move_insn (reg, CONST0_RTX (<MODE>mode));
+ emit_insn (gen_sub<mode>3 (operands[0], reg, operands[1]));
+ DONE;
+})
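+;; Note on the expansions above: integer abs<mode>2 is implemented as add_a
+;; against a zero vector (|x| + |0| = |x|), and both neg<mode>2 expansions
+;; subtract the operand from a zero vector.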
+
+(define_expand "msa_ldi<mode>"
+ [(match_operand:IMSA 0 "register_operand")
+ (match_operand 1 "const_imm10_operand")]
+ "ISA_HAS_MSA"
+ {
+ unsigned n_elts = GET_MODE_NUNITS (<MODE>mode);
+ rtvec v = rtvec_alloc (n_elts);
+ HOST_WIDE_INT val = INTVAL (operands[1]);
+ unsigned int i;
+
+ if (<MODE>mode != V16QImode)
+ {
+ unsigned shift = HOST_BITS_PER_WIDE_INT - 10;
+ val = trunc_int_for_mode ((val << shift) >> shift, <UNITMODE>mode);
+ }
+ else
+ val = trunc_int_for_mode (val, <UNITMODE>mode);
+
+ for (i = 0; i < n_elts; i++)
+ RTVEC_ELT (v, i) = GEN_INT (val);
+ emit_move_insn (operands[0],
+ gen_rtx_CONST_VECTOR (<MODE>mode, v));
+ DONE;
+ })
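+;; For illustration only: for non-V16QI modes the low 10 bits of the
+;; immediate are sign-extended to the element width, so an operand whose low
+;; 10 bits are 0x3fd is replicated as -3 (0xfffd) into every halfword of a
+;; V8HI constant; for V16QI the value is simply truncated to QImode.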
+
+(define_insn "msa_vshf<mode>"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (unspec:MSA [(match_operand:<VIMODE> 1 "register_operand" "0")
+ (match_operand:MSA 2 "register_operand" "f")
+ (match_operand:MSA 3 "register_operand" "f")]
+ UNSPEC_MSA_VSHF))]
+ "ISA_HAS_MSA"
+ "vshf.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_expand "mov<mode>"
+ [(set (match_operand:MSA 0)
+ (match_operand:MSA 1))]
+ "TARGET_MSA"
+{
+ if (mips_legitimize_move (<MODE>mode, operands[0], operands[1]))
+ DONE;
+})
+
+(define_expand "movmisalign<mode>"
+ [(set (match_operand:MSA 0)
+ (match_operand:MSA 1))]
+ "TARGET_MSA"
+{
+ if (mips_legitimize_move (<MODE>mode, operands[0], operands[1]))
+ DONE;
+})
+
+;; 128-bit MSA modes can only be held in MSA registers or memory.  An
+;; exception is that MSA modes are allowed in GP registers for arguments
+;; and return values.
+(define_insn "mov<mode>_msa"
+ [(set (match_operand:MSA 0 "nonimmediate_operand" "=f,f,R,*d,*f")
+ (match_operand:MSA 1 "move_operand" "fYGYI,R,f,*f,*d"))]
+ "TARGET_MSA"
+{ return mips_output_move (operands[0], operands[1]); }
+ [(set_attr "move_type" "fmove,fpload,fpstore,mfc,mtc")
+ (set_attr "mode" "TI")])
+
+(define_split
+ [(set (match_operand:MSA 0 "nonimmediate_operand")
+ (match_operand:MSA 1 "move_operand"))]
+ "reload_completed && TARGET_MSA
+ && mips_split_move_insn_p (operands[0], operands[1], insn)"
+ [(const_int 0)]
+{
+ mips_split_move_insn (operands[0], operands[1], curr_insn);
+ DONE;
+})
+
+;; Offset load
+(define_expand "msa_ld_<msafmt_f>"
+ [(match_operand:MSA 0 "register_operand")
+ (match_operand 1 "pmode_register_operand")
+ (match_operand 2 "aq10<msafmt>_operand")]
+ "ISA_HAS_MSA"
+{
+ rtx addr = plus_constant (GET_MODE (operands[1]), operands[1],
+ INTVAL (operands[2]));
+ mips_emit_move (operands[0], gen_rtx_MEM (<MODE>mode, addr));
+ DONE;
+})
+
+;; Offset store
+(define_expand "msa_st_<msafmt_f>"
+ [(match_operand:MSA 0 "register_operand")
+ (match_operand 1 "pmode_register_operand")
+ (match_operand 2 "aq10<msafmt>_operand")]
+ "ISA_HAS_MSA"
+{
+ rtx addr = plus_constant (GET_MODE (operands[1]), operands[1],
+ INTVAL (operands[2]));
+ mips_emit_move (gen_rtx_MEM (<MODE>mode, addr), operands[0]);
+ DONE;
+})
+
+;; Integer operations
+(define_insn "add<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f,f,f")
+ (plus:IMSA (match_operand:IMSA 1 "register_operand" "f,f,f")
+ (match_operand:IMSA 2 "reg_or_vector_same_ximm5_operand" "f,Unv5,Uuv5")))]
+ "ISA_HAS_MSA"
+ {
+ switch (which_alternative)
+ {
+ case 0:
+ return "addv.<msafmt>\t%w0,%w1,%w2";
+ case 1:
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+
+ operands[2] = GEN_INT (-val);
+ return "subvi.<msafmt>\t%w0,%w1,%d2";
+ }
+ case 2:
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+
+ operands[2] = GEN_INT (val);
+ return "addvi.<msafmt>\t%w0,%w1,%d2";
+ }
+ default:
+ gcc_unreachable ();
+ }
+ }
+ [(set_attr "alu_type" "add, unknown, unknown")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "sub<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f,f,f")
+ (minus:IMSA (match_operand:IMSA 1 "register_operand" "f,f,f")
+ (match_operand:IMSA 2 "reg_or_vector_same_ximm5_operand" "f,Unv5,Uuv5")))]
+ "ISA_HAS_MSA"
+ {
+ switch (which_alternative)
+ {
+ case 0:
+ return "subv.<msafmt>\t%w0,%w1,%w2";
+ case 1:
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+
+ operands[2] = GEN_INT (-val);
+ return "addvi.<msafmt>\t%w0,%w1,%d2";
+ }
+ case 2:
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+
+ operands[2] = GEN_INT (val);
+ return "subvi.<msafmt>\t%w0,%w1,%d2";
+ }
+ default:
+ gcc_unreachable ();
+ }
+ }
+ [(set_attr "alu_type" "sub, unknown, unknown")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "mul<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (mult:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "mulv.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "imul3")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_maddv_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (plus:IMSA (mult:IMSA (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f"))
+ (match_operand:IMSA 1 "register_operand" "0")))]
+ "ISA_HAS_MSA"
+ "maddv.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "imadd")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_msubv_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (minus:IMSA (match_operand:IMSA 1 "register_operand" "0")
+ (mult:IMSA (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "msubv.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "imadd")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "div<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (div:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ { return mips_msa_output_division ("div_s.<msafmt>\t%w0,%w1,%w2", operands); }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_div")])
+
+(define_insn "udiv<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (udiv:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ { return mips_msa_output_division ("div_u.<msafmt>\t%w0,%w1,%w2", operands); }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_div")])
+
+(define_insn "mod<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (mod:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ { return mips_msa_output_division ("mod_s.<msafmt>\t%w0,%w1,%w2", operands); }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_div")])
+
+(define_insn "umod<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (umod:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ { return mips_msa_output_division ("mod_u.<msafmt>\t%w0,%w1,%w2", operands); }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set_attr "msa_execunit" "msa_eu_div")])
+
+(define_insn "xorv16qi3"
+ [(set (match_operand:V16QI 0 "register_operand" "=f,f")
+ (xor:V16QI (match_operand:V16QI 1 "register_operand" "f,f")
+ (match_operand:V16QI 2 "reg_or_vector_same_byte_operand" "f,Ubv8")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ operands[2] = CONST_VECTOR_ELT (operands[2], 0);
+ return "xori.b\t%w0,%w1,%B2";
+ }
+ else
+ return "xor.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "xor")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "xor<mode>3"
+ [(set (match_operand:IMSA_X 0 "register_operand" "=f,f")
+ (xor:IMSA_X (match_operand:IMSA_X 1 "register_operand" "f,f")
+ (match_operand:IMSA_X 2 "reg_or_vector_same_<mode>_set_operand" "f,YC")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+ int vlog2 = exact_log2 (val);
+ gcc_assert (vlog2 != -1);
+ operands[2] = GEN_INT (vlog2);
+ return "bnegi.%v0\t%w0,%w1,%2";
+ }
+ else
+ return "xor.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "xor")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (and (eq_attr "cpu" "i6400")
+ (eq_attr "alternative" "1"))
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "iorv16qi3"
+ [(set (match_operand:V16QI 0 "register_operand" "=f,f")
+ (ior:V16QI (match_operand:V16QI 1 "register_operand" "f,f")
+ (match_operand:V16QI 2 "reg_or_vector_same_byte_operand" "f,Ubv8")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ operands[2] = CONST_VECTOR_ELT (operands[2], 0);
+ return "ori.b\t%w0,%w1,%B2";
+ }
+ else
+ return "or.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "or")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "ior<mode>3"
+ [(set (match_operand:IMSA_X 0 "register_operand" "=f,f")
+ (ior:IMSA_X (match_operand:IMSA_X 1 "register_operand" "f,f")
+ (match_operand:IMSA_X 2 "reg_or_vector_same_<mode>_set_operand" "f,YC")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+ int vlog2 = exact_log2 (val);
+ gcc_assert (vlog2 != -1);
+ operands[2] = GEN_INT (vlog2);
+ return "bseti.%v0\t%w0,%w1,%2";
+ }
+ else
+ return "or.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "or")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (and (eq_attr "cpu" "i6400")
+ (eq_attr "alternative" "1"))
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "andv16qi3"
+ [(set (match_operand:V16QI 0 "register_operand" "=f,f")
+ (and:V16QI (match_operand:V16QI 1 "register_operand" "f,f")
+ (match_operand:V16QI 2 "reg_or_vector_same_byte_operand" "f,Ubv8")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ operands[2] = CONST_VECTOR_ELT (operands[2], 0);
+ return "andi.b\t%w0,%w1,%B2";
+ }
+ else
+ return "and.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "and")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "and<mode>3"
+ [(set (match_operand:IMSA_X 0 "register_operand" "=f,f")
+ (and:IMSA_X (match_operand:IMSA_X 1 "register_operand" "f,f")
+ (match_operand:IMSA_X 2 "reg_or_vector_same_<mode>_clr_operand" "f,YZ")))]
+ "ISA_HAS_MSA"
+ {
+ if (which_alternative == 1)
+ {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (operands[2], 0));
+ int vlog2 = exact_log2 (~val);
+ gcc_assert (vlog2 != -1);
+ operands[2] = GEN_INT (vlog2);
+ return "bclri.%v0\t%w0,%w1,%2";
+ }
+ else
+ return "and.v\t%w0,%w1,%w2";
+ }
+ [(set_attr "alu_type" "and")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (and (eq_attr "cpu" "i6400")
+ (eq_attr "alternative" "1"))
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "one_cmpl<mode>2"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (not:IMSA (match_operand:IMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "nor.v\t%w0,%w1,%w1"
+ [(set_attr "alu_type" "nor")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "vlshr<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f,f")
+ (lshiftrt:IMSA (match_operand:IMSA 1 "register_operand" "f,f")
+ (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))]
+ "ISA_HAS_MSA"
+{
+ if (which_alternative == 0)
+ return "srl.<msafmt>\t%w0,%w1,%w2";
+
+ operands[2] = GEN_INT (INTVAL (CONST_VECTOR_ELT (operands[2], 0))
+ & <shift_mask>);
+ return "srli.<msafmt>\t%w0,%w1,%2";
+}
+ [(set_attr "type" "shift")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "vashr<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f,f")
+ (ashiftrt:IMSA (match_operand:IMSA 1 "register_operand" "f,f")
+ (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))]
+ "ISA_HAS_MSA"
+{
+ if (which_alternative == 0)
+ return "sra.<msafmt>\t%w0,%w1,%w2";
+
+ operands[2] = GEN_INT (INTVAL (CONST_VECTOR_ELT (operands[2], 0))
+ & <shift_mask>);
+ return "srai.<msafmt>\t%w0,%w1,%2";
+}
+ [(set_attr "type" "shift")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "vashl<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f,f")
+ (ashift:IMSA (match_operand:IMSA 1 "register_operand" "f,f")
+ (match_operand:IMSA 2 "reg_or_vector_same_uimm6_operand" "f,Uuv6")))]
+ "ISA_HAS_MSA"
+{
+ if (which_alternative == 0)
+ return "sll.<msafmt>\t%w0,%w1,%w2";
+
+ operands[2] = GEN_INT (INTVAL (CONST_VECTOR_ELT (operands[2], 0))
+ & <shift_mask>);
+ return "slli.<msafmt>\t%w0,%w1,%2";
+}
+ [(set_attr "type" "shift")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+;; Floating-point operations
+(define_insn "add<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (plus:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fadd.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "sub<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (minus:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fsub.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "mul<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (mult:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fmul.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fmul")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float5")])
+
+(define_insn "div<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (div:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fdiv.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fdiv")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_fdiv")])
+
+(define_insn "msa_fmadd_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (plus:FMSA (mult:FMSA (match_operand:FMSA 2 "register_operand" "f")
+ (match_operand:FMSA 3 "register_operand" "f"))
+ (match_operand:FMSA 1 "register_operand" "0")))]
+ "ISA_HAS_MSA"
+ "fmadd.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "fmadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float8")])
+
+(define_insn "msa_fmsub_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (minus:FMSA (match_operand:FMSA 1 "register_operand" "0")
+ (mult:FMSA (match_operand:FMSA 2 "register_operand" "f")
+ (match_operand:FMSA 3 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "fmsub.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "fmadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float8")])
+
+(define_insn "sqrt<mode>2"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (sqrt:FMSA (match_operand:FMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fsqrt.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fsqrt")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_fdiv")])
+
+;; Built-in functions
+(define_insn "msa_add_a_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (plus:IMSA (abs:IMSA (match_operand:IMSA 1 "register_operand" "f"))
+ (abs:IMSA (match_operand:IMSA 2 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "add_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_adds_a_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (ss_plus:IMSA (abs:IMSA (match_operand:IMSA 1 "register_operand" "f"))
+ (abs:IMSA (match_operand:IMSA 2 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "adds_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "ssadd<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (ss_plus:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "adds_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "usadd<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (us_plus:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "adds_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_expand "msa_addvi_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand")
+ (plus:IMSA (match_operand:IMSA 1 "register_operand")
+ (match_operand 2 "const_uimm5_operand")))]
+ "ISA_HAS_MSA"
+ {
+ unsigned n_elts = GET_MODE_NUNITS (<MODE>mode);
+ rtvec v = rtvec_alloc (n_elts);
+ HOST_WIDE_INT val = INTVAL (operands[2]);
+ unsigned int i;
+
+ for (i = 0; i < n_elts; i++)
+ RTVEC_ELT (v, i) = GEN_INT (val);
+
+ emit_insn (gen_msa_addvi_<msafmt>_insn (operands[0], operands[1],
+ gen_rtx_CONST_VECTOR (<MODE>mode, v)));
+ DONE;
+ })
+
+(define_expand "msa_andi_b"
+ [(set (match_operand:V16QI 0 "register_operand")
+ (and:V16QI (match_operand:V16QI 1 "register_operand")
+ (match_operand:QI 2 "const_uimm8_operand")))]
+ "ISA_HAS_MSA"
+ {
+ rtvec v = rtvec_alloc (16);
+ HOST_WIDE_INT val = INTVAL (operands[2]);
+ unsigned int i;
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (v, i) = GEN_INT (val);
+
+ emit_insn (gen_msa_andi_b_insn (operands[0], operands[1],
+ gen_rtx_CONST_VECTOR (V16QImode, v)));
+ DONE;
+ })
+
+(define_insn "msa_addvi_<msafmt>_insn"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (plus:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "const_vector_same_uimm5_operand" "")))]
+ "ISA_HAS_MSA"
+ "addvi.<msafmt>\t%w0,%w1,%E2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_andi_b_insn"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (and:V16QI (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand:V16QI 2 "const_vector_same_uimm8_operand" "")))]
+ "ISA_HAS_MSA"
+ "andi.b\t%w0,%w1,%E2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_asub_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_ASUB_S))]
+ "ISA_HAS_MSA"
+ "asub_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_asub_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_ASUB_U))]
+ "ISA_HAS_MSA"
+ "asub_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_ave_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_AVE_S))]
+ "ISA_HAS_MSA"
+ "ave_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_ave_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_AVE_U))]
+ "ISA_HAS_MSA"
+ "ave_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_aver_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_AVER_S))]
+ "ISA_HAS_MSA"
+ "aver_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_aver_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_AVER_U))]
+ "ISA_HAS_MSA"
+ "aver_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_bclr_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_BCLR))]
+ "ISA_HAS_MSA"
+ "bclr.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_bclri_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_BCLRI))]
+ "ISA_HAS_MSA"
+ "bclri.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_binsl_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_BINSL))]
+ "ISA_HAS_MSA"
+ "binsl.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_insn "msa_binsli_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand 3 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_BINSLI))]
+ "ISA_HAS_MSA"
+ "binsli.<msafmt>\t%w0,%w2,%3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_insn "msa_binsr_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_BINSR))]
+ "ISA_HAS_MSA"
+ "binsr.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_insn "msa_binsri_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand 3 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_BINSRI))]
+ "ISA_HAS_MSA"
+ "binsri.<msafmt>\t%w0,%w2,%3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_insn "msa_bmnz_v_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_BMNZ_V))]
+ "ISA_HAS_MSA"
+ "bmnz.v\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bmnzi_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
+ (match_operand:V16QI 2 "register_operand" "f")
+ (match_operand 3 "const_uimm8_operand" "")]
+ UNSPEC_MSA_BMNZI_B))]
+ "ISA_HAS_MSA"
+ "bmnzi.b\t%w0,%w2,%3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bmz_v_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_BMZ_V))]
+ "ISA_HAS_MSA"
+ "bmz.v\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bmzi_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
+ (match_operand:V16QI 2 "register_operand" "f")
+ (match_operand 3 "const_uimm8_operand" "")]
+ UNSPEC_MSA_BMZI_B))]
+ "ISA_HAS_MSA"
+ "bmzi.b\t%w0,%w2,%3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bneg_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_BNEG))]
+ "ISA_HAS_MSA"
+ "bneg.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_bnegi_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_msa_branch_operand" "")]
+ UNSPEC_MSA_BNEGI))]
+ "ISA_HAS_MSA"
+ "bnegi.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_bsel_v_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "0")
+ (match_operand:IMSA 2 "register_operand" "f")
+ (match_operand:IMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_BSEL_V))]
+ "ISA_HAS_MSA"
+ "bsel.v\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bseli_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
+ (match_operand:V16QI 2 "register_operand" "f")
+ (match_operand 3 "const_uimm8_operand" "")]
+ UNSPEC_MSA_BSELI_B))]
+ "ISA_HAS_MSA"
+ "bseli.b\t%w0,%w2,%3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic_l")])
+
+(define_insn "msa_bset_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_BSET))]
+ "ISA_HAS_MSA"
+ "bset.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_bseti_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_BSETI))]
+ "ISA_HAS_MSA"
+ "bseti.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_code_iterator ICC [eq le leu lt ltu])
+
+(define_code_attr icc
+ [(eq "eq")
+ (le "le_s")
+ (leu "le_u")
+ (lt "lt_s")
+ (ltu "lt_u")])
+
+(define_code_attr icci
+ [(eq "eqi")
+ (le "lei_s")
+ (leu "lei_u")
+ (lt "lti_s")
+ (ltu "lti_u")])
+
+(define_code_attr cmpi
+ [(eq "s")
+ (le "s")
+ (leu "u")
+ (lt "s")
+ (ltu "u")])
+
+(define_insn "msa_c<ICC:icc>_<IMSA:msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (ICC:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "c<ICC:icc>.<IMSA:msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_c<ICC:icci>i_<IMSA:msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (ICC:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "const_vector_same_cmp<ICC:cmpi>imm4_operand" "")))]
+ "ISA_HAS_MSA"
+ "c<ICC:icci>.<IMSA:msafmt>\t%w0,%w1,%E2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_c<ICC:icci>_<IMSA:msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(ICC:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_imm5_operand" ""))]
+ UNSPEC_MSA_CMPI))]
+ "ISA_HAS_MSA"
+ "c<ICC:icci>.<IMSA:msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_dotp_s_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:<VHALFMODE> 1 "register_operand" "f")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")]
+ UNSPEC_MSA_DOTP_S))]
+ "ISA_HAS_MSA"
+ "dotp_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_dotp_u_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:<VHALFMODE> 1 "register_operand" "f")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")]
+ UNSPEC_MSA_DOTP_U))]
+ "ISA_HAS_MSA"
+ "dotp_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_dpadd_s_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:IDOTP128 1 "register_operand" "0")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")
+ (match_operand:<VHALFMODE> 3 "register_operand" "f")]
+ UNSPEC_MSA_DPADD_S))]
+ "ISA_HAS_MSA"
+ "dpadd_s.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_dpadd_u_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:IDOTP128 1 "register_operand" "0")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")
+ (match_operand:<VHALFMODE> 3 "register_operand" "f")]
+ UNSPEC_MSA_DPADD_U))]
+ "ISA_HAS_MSA"
+ "dpadd_u.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_dpsub_s_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:IDOTP128 1 "register_operand" "0")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")
+ (match_operand:<VHALFMODE> 3 "register_operand" "f")]
+ UNSPEC_MSA_DPSUB_S))]
+ "ISA_HAS_MSA"
+ "dpsub_s.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_dpsub_u_<msafmt>"
+ [(set (match_operand:IDOTP128 0 "register_operand" "=f")
+ (unspec:IDOTP128 [(match_operand:IDOTP128 1 "register_operand" "0")
+ (match_operand:<VHALFMODE> 2 "register_operand" "f")
+ (match_operand:<VHALFMODE> 3 "register_operand" "f")]
+ UNSPEC_MSA_DPSUB_U))]
+ "ISA_HAS_MSA"
+ "dpsub_u.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_fclass_<msafmt>"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (unspec:<VIMODE> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FCLASS))]
+ "ISA_HAS_MSA"
+ "fclass.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcmp")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2_l")])
+
+(define_insn "msa_fcaf_<msafmt>"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (unspec:<VIMODE> [(match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_FCAF))]
+ "ISA_HAS_MSA"
+ "fcaf.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fcmp")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_cmp")])
+
+(define_insn "msa_fcune_<FMSA:msafmt>"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (unspec:<VIMODE> [(match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_FCUNE))]
+ "ISA_HAS_MSA"
+ "fcune.<FMSA:msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fcmp")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_cmp")])
+
+(define_code_iterator FCC [unordered ordered eq ne le lt uneq unle unlt])
+
+(define_code_attr fcc
+ [(unordered "fcun")
+ (ordered "fcor")
+ (eq "fceq")
+ (ne "fcne")
+ (uneq "fcueq")
+ (unle "fcule")
+ (unlt "fcult")
+ (le "fcle")
+ (lt "fclt")])
+
+(define_int_iterator FSC_UNS [UNSPEC_MSA_FSAF UNSPEC_MSA_FSUN UNSPEC_MSA_FSOR
+ UNSPEC_MSA_FSEQ UNSPEC_MSA_FSNE UNSPEC_MSA_FSUEQ
+ UNSPEC_MSA_FSUNE UNSPEC_MSA_FSULE UNSPEC_MSA_FSULT
+ UNSPEC_MSA_FSLE UNSPEC_MSA_FSLT])
+
+(define_int_attr fsc
+ [(UNSPEC_MSA_FSAF "fsaf")
+ (UNSPEC_MSA_FSUN "fsun")
+ (UNSPEC_MSA_FSOR "fsor")
+ (UNSPEC_MSA_FSEQ "fseq")
+ (UNSPEC_MSA_FSNE "fsne")
+ (UNSPEC_MSA_FSUEQ "fsueq")
+ (UNSPEC_MSA_FSUNE "fsune")
+ (UNSPEC_MSA_FSULE "fsule")
+ (UNSPEC_MSA_FSULT "fsult")
+ (UNSPEC_MSA_FSLE "fsle")
+ (UNSPEC_MSA_FSLT "fslt")])
+
+(define_insn "msa_<FCC:fcc>_<FMSA:msafmt>"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (FCC:<VIMODE> (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "<FCC:fcc>.<FMSA:msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fcmp")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_cmp")])
+
+(define_insn "msa_<fsc>_<FMSA:msafmt>"
+ [(set (match_operand:<VIMODE> 0 "register_operand" "=f")
+ (unspec:<VIMODE> [(match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")]
+ FSC_UNS))]
+ "ISA_HAS_MSA"
+ "<fsc>.<FMSA:msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fcmp")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_cmp")])
+
+(define_insn "msa_fexp2_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:<VIMODE> 2 "register_operand" "f")]
+ UNSPEC_MSA_FEXP2))]
+ "ISA_HAS_MSA"
+ "fexp2.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fmul")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2")])
+
+(define_mode_attr FINT
+ [(V4SF "V4SI")
+ (V2DF "V2DI")])
+
+(define_mode_attr fint
+ [(V4SF "v4si")
+ (V2DF "v2di")])
+
+(define_mode_attr FQ
+ [(V4SF "V8HI")
+ (V2DF "V4SI")])
+
+(define_mode_attr FINTCNV
+ [(V4SF "I2S")
+ (V2DF "I2D")])
+
+(define_mode_attr FINTCNV_2
+ [(V4SF "S2I")
+ (V2DF "D2I")])
+
+(define_insn "float<fint><FMSA:mode>2"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (float:FMSA (match_operand:<FINT> 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "ffint_s.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "floatuns<fint><FMSA:mode>2"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unsigned_float:FMSA (match_operand:<FINT> 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "ffint_u.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_mode_attr FFQ
+ [(V4SF "V8HI")
+ (V2DF "V4SI")])
+
+(define_insn "msa_ffql_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:<FQ> 1 "register_operand" "f")]
+ UNSPEC_MSA_FFQL))]
+ "ISA_HAS_MSA"
+ "ffql.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ffqr_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:<FQ> 1 "register_operand" "f")]
+ UNSPEC_MSA_FFQR))]
+ "ISA_HAS_MSA"
+ "ffqr.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+;; Not used directly by builtins, but via the following define_expand.
+(define_insn "msa_fill_<msafmt>_insn"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (vec_duplicate:IMSA
+ (match_operand:<UNITMODE> 1 "reg_or_0_operand" "dJ")))]
+ "ISA_HAS_MSA"
+ "fill.<msafmt>\t%w0,%z1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+;; Expand the builtin, catering for HImode and QImode elements whose
+;; builtins take an SImode argument.
+(define_expand "msa_fill_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand")
+ (vec_duplicate:IMSA
+ (match_operand:<EXCEPT> 1 "reg_or_0_operand")))]
+ "ISA_HAS_MSA"
+{
+ if ((GET_MODE_SIZE (<UNITMODE>mode) < GET_MODE_SIZE (<EXCEPT>mode))
+ && (REG_P (operands[1]) || (GET_CODE (operands[1]) == SUBREG
+ && REG_P (SUBREG_REG (operands[1])))))
+ {
+ unsigned int offset = GET_MODE_SIZE (<EXCEPT>mode)
+ - GET_MODE_SIZE (<UNITMODE>mode);
+ operands[1] = simplify_gen_subreg (<UNITMODE>mode, operands[1],
+ GET_MODE (operands[1]),
+ BYTES_BIG_ENDIAN ? offset : 0);
+ }
+ emit_insn (gen_msa_fill_<msafmt>_insn (operands[0], operands[1]));
+ DONE;
+ })
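+;; For illustration only (assuming <EXCEPT> maps the narrow element modes to
+;; SImode, as described above): on a big-endian target the QImode value
+;; passed to the fill.b builtin lives in the last byte of its SImode
+;; register, so the expand takes a subreg at byte offset 3 (4 - 1) before
+;; emitting fill.b; on little-endian the offset is 0.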
+
+(define_insn "msa_fill_<msafmt_f>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (vec_duplicate:FMSA
+ (match_operand:<UNITMODE> 1 "reg_or_0_operand" "dJ")))]
+ "ISA_HAS_MSA"
+ "fill.<msafmt>\t%w0,%z1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+;; Note that fill.d and fill.d_f will be split later if !TARGET_64BIT.
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_duplicate:V2DI
+ (match_operand:DI 1 "reg_or_0_operand")))]
+ "reload_completed && TARGET_MSA && !TARGET_64BIT"
+ [(const_int 0)]
+{
+ mips_split_msa_fill_d (operands[0], operands[1]);
+ DONE;
+})
+
+(define_split
+ [(set (match_operand:V2DF 0 "register_operand")
+ (vec_duplicate:V2DF
+ (match_operand:DF 1 "register_operand")))]
+ "reload_completed && TARGET_MSA && !TARGET_64BIT"
+ [(const_int 0)]
+{
+ mips_split_msa_fill_d (operands[0], operands[1]);
+ DONE;
+})
+
+(define_insn "msa_flog2_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FLOG2))]
+ "ISA_HAS_MSA"
+ "flog2.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fmul")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2_l")])
+
+;;UNSPEC_MSA_FMAX
+(define_insn "smax<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (smax:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fmax.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2")])
+
+;;UNSPEC_MSA_FMAX_A
+(define_insn "umax<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (umax:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fmax_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2")])
+
+;;UNSPEC_MSA_FMIN
+(define_insn "smin<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (smin:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fmin.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2")])
+
+;;UNSPEC_MSA_FMIN_A
+(define_insn "umin<mode>3"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (umin:FMSA (match_operand:FMSA 1 "register_operand" "f")
+ (match_operand:FMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "fmin_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "fadd")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float2")])
+
+(define_insn "msa_frcp_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FRCP))]
+ "ISA_HAS_MSA"
+ "frcp.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "frdiv")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_fdiv")])
+
+(define_insn "msa_frint_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FRINT))]
+ "ISA_HAS_MSA"
+ "frint.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fmul")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_frsqrt_<msafmt>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FRSQRT))]
+ "ISA_HAS_MSA"
+ "frsqrt.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "frsqrt")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_fdiv")])
+
+(define_insn "msa_ftint_s_<msafmt>"
+ [(set (match_operand:<FINT> 0 "register_operand" "=f")
+ (unspec:<FINT> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FTINT_S))]
+ "ISA_HAS_MSA"
+ "ftint_s.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV_2>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ftint_u_<msafmt>"
+ [(set (match_operand:<FINT> 0 "register_operand" "=f")
+ (unspec:<FINT> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FTINT_U))]
+ "ISA_HAS_MSA"
+ "ftint_u.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV_2>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ftrunc_s_<msafmt>"
+ [(set (match_operand:<FINT> 0 "register_operand" "=f")
+ (unspec:<FINT> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FTRUNC_S))]
+ "ISA_HAS_MSA"
+ "ftrunc_s.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV_2>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ftrunc_u_<msafmt>"
+ [(set (match_operand:<FINT> 0 "register_operand" "=f")
+ (unspec:<FINT> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_FTRUNC_U))]
+ "ISA_HAS_MSA"
+ "ftrunc_u.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "<FINTCNV_2>")
+ (set_attr "mode" "<UNITMODE>")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ftq_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (unspec:V8HI [(match_operand:V4SF 1 "register_operand" "f")
+ (match_operand:V4SF 2 "register_operand" "f")]
+ UNSPEC_MSA_FTQ))]
+ "ISA_HAS_MSA"
+ "ftq.h\t%w0,%w1,%w2"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "S2I")
+ (set_attr "mode" "SF")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_ftq_w"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (unspec:V4SI [(match_operand:V2DF 1 "register_operand" "f")
+ (match_operand:V2DF 2 "register_operand" "f")]
+ UNSPEC_MSA_FTQ))]
+ "ISA_HAS_MSA"
+ "ftq.w\t%w0,%w1,%w2"
+ [(set_attr "type" "fcvt")
+ (set_attr "cnv_mode" "D2I")
+ (set_attr "mode" "DF")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_mode_iterator IZMODE [V8HI V4SI V2DI])
+(define_mode_attr IZDOUBLE
+ [(V8HI "V16QI")
+ (V4SI "V8HI")
+ (V2DI "V4SI")])
+
+(define_insn "msa_hadd_s_<msafmt>"
+ [(set (match_operand:IZMODE 0 "register_operand" "=f")
+ (unspec:IZMODE [(match_operand:<IZDOUBLE> 1 "register_operand" "f")
+ (match_operand:<IZDOUBLE> 2 "register_operand" "f")]
+ UNSPEC_MSA_HADD_S))]
+ "ISA_HAS_MSA"
+ "hadd_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_hadd_u_<msafmt>"
+ [(set (match_operand:IZMODE 0 "register_operand" "=f")
+ (unspec:IZMODE [(match_operand:<IZDOUBLE> 1 "register_operand" "f")
+ (match_operand:<IZDOUBLE> 2 "register_operand" "f")]
+ UNSPEC_MSA_HADD_U))]
+ "ISA_HAS_MSA"
+ "hadd_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_hsub_s_<msafmt>"
+ [(set (match_operand:IZMODE 0 "register_operand" "=f")
+ (unspec:IZMODE [(match_operand:<IZDOUBLE> 1 "register_operand" "f")
+ (match_operand:<IZDOUBLE> 2 "register_operand" "f")]
+ UNSPEC_MSA_HSUB_S))]
+ "ISA_HAS_MSA"
+ "hsub_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_hsub_u_<msafmt>"
+ [(set (match_operand:IZMODE 0 "register_operand" "=f")
+ (unspec:IZMODE [(match_operand:<IZDOUBLE> 1 "register_operand" "f")
+ (match_operand:<IZDOUBLE> 2 "register_operand" "f")]
+ UNSPEC_MSA_HSUB_U))]
+ "ISA_HAS_MSA"
+ "hsub_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_ilvev_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (vec_select:V16QI (vec_concat:V32QI
+ (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand:V16QI 2 "register_operand" "f"))
+ (parallel [(const_int 16) (const_int 0)
+ (const_int 18) (const_int 2)
+ (const_int 20) (const_int 4)
+ (const_int 22) (const_int 6)
+ (const_int 24) (const_int 8)
+ (const_int 26) (const_int 10)
+ (const_int 28) (const_int 12)
+ (const_int 30) (const_int 14)])))]
+ "ISA_HAS_MSA"
+ "ilvev.b\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvev_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (vec_select:V8HI (vec_concat:V16HI
+ (match_operand:V8HI 1 "register_operand" "f")
+ (match_operand:V8HI 2 "register_operand" "f"))
+ (parallel [(const_int 8) (const_int 0)
+ (const_int 10) (const_int 2)
+ (const_int 12) (const_int 4)
+ (const_int 14) (const_int 6)])))]
+ "ISA_HAS_MSA"
+ "ilvev.h\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvev_w"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (vec_select:V4SI (vec_concat:V8SI
+ (match_operand:V4SI 1 "register_operand" "f")
+ (match_operand:V4SI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 0)
+ (const_int 6) (const_int 2)])))]
+ "ISA_HAS_MSA"
+ "ilvev.w\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvl_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (vec_select:V16QI (vec_concat:V32QI
+ (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand:V16QI 2 "register_operand" "f"))
+ (parallel [(const_int 24) (const_int 8)
+ (const_int 25) (const_int 9)
+ (const_int 26) (const_int 10)
+ (const_int 27) (const_int 11)
+ (const_int 28) (const_int 12)
+ (const_int 29) (const_int 13)
+ (const_int 30) (const_int 14)
+ (const_int 31) (const_int 15)])))]
+ "ISA_HAS_MSA"
+ "ilvl.b\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvl_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (vec_select:V8HI (vec_concat:V16HI
+ (match_operand:V8HI 1 "register_operand" "f")
+ (match_operand:V8HI 2 "register_operand" "f"))
+ (parallel [(const_int 12) (const_int 4)
+ (const_int 13) (const_int 5)
+ (const_int 14) (const_int 6)
+ (const_int 15) (const_int 7)])))]
+ "ISA_HAS_MSA"
+ "ilvl.h\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvl_w"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (vec_select:V4SI (vec_concat:V8SI
+ (match_operand:V4SI 1 "register_operand" "f")
+ (match_operand:V4SI 2 "register_operand" "f"))
+ (parallel [(const_int 6) (const_int 2)
+ (const_int 7) (const_int 3)])))]
+ "ISA_HAS_MSA"
+ "ilvl.w\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvl_d"
+ [(set (match_operand:V2DI 0 "register_operand" "=f")
+ (vec_select:V2DI (vec_concat:V4DI
+ (match_operand:V2DI 1 "register_operand" "f")
+ (match_operand:V2DI 2 "register_operand" "f"))
+ (parallel [(const_int 3) (const_int 1)])))]
+ "ISA_HAS_MSA"
+ "ilvl.d\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvod_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (vec_select:V16QI (vec_concat:V32QI
+ (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand:V16QI 2 "register_operand" "f"))
+ (parallel [(const_int 17) (const_int 1)
+ (const_int 19) (const_int 3)
+ (const_int 21) (const_int 5)
+ (const_int 23) (const_int 7)
+ (const_int 25) (const_int 9)
+ (const_int 27) (const_int 11)
+ (const_int 29) (const_int 13)
+ (const_int 31) (const_int 15)])))]
+ "ISA_HAS_MSA"
+ "ilvod.b\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvod_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (vec_select:V8HI (vec_concat:V16HI
+ (match_operand:V8HI 1 "register_operand" "f")
+ (match_operand:V8HI 2 "register_operand" "f"))
+ (parallel [(const_int 9) (const_int 1)
+ (const_int 11) (const_int 3)
+ (const_int 13) (const_int 5)
+ (const_int 15) (const_int 7)])))]
+ "ISA_HAS_MSA"
+ "ilvod.h\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvod_w"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (vec_select:V4SI (vec_concat:V8SI
+ (match_operand:V4SI 1 "register_operand" "f")
+ (match_operand:V4SI 2 "register_operand" "f"))
+ (parallel [(const_int 5) (const_int 1)
+ (const_int 7) (const_int 3)])))]
+ "ISA_HAS_MSA"
+ "ilvod.w\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvr_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (vec_select:V16QI (vec_concat:V32QI
+ (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand:V16QI 2 "register_operand" "f"))
+ (parallel [(const_int 16) (const_int 0)
+ (const_int 17) (const_int 1)
+ (const_int 18) (const_int 2)
+ (const_int 19) (const_int 3)
+ (const_int 20) (const_int 4)
+ (const_int 21) (const_int 5)
+ (const_int 22) (const_int 6)
+ (const_int 23) (const_int 7)])))]
+ "ISA_HAS_MSA"
+ "ilvr.b\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvr_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (vec_select:V8HI (vec_concat:V16HI
+ (match_operand:V8HI 1 "register_operand" "f")
+ (match_operand:V8HI 2 "register_operand" "f"))
+ (parallel [(const_int 8) (const_int 0)
+ (const_int 9) (const_int 1)
+ (const_int 10) (const_int 2)
+ (const_int 11) (const_int 3)])))]
+ "ISA_HAS_MSA"
+ "ilvr.h\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvr_w"
+ [(set (match_operand:V4SI 0 "register_operand" "=f")
+ (vec_select:V4SI (vec_concat:V8SI
+ (match_operand:V4SI 1 "register_operand" "f")
+ (match_operand:V4SI 2 "register_operand" "f"))
+ (parallel [(const_int 4) (const_int 0)
+ (const_int 5) (const_int 1)])))]
+ "ISA_HAS_MSA"
+ "ilvr.w\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ilvr_d"
+ [(set (match_operand:V2DI 0 "register_operand" "=f")
+ (vec_select:V2DI (vec_concat:V4DI
+ (match_operand:V2DI 1 "register_operand" "f")
+ (match_operand:V2DI 2 "register_operand" "f"))
+ (parallel [(const_int 2) (const_int 0)])))]
+ "ISA_HAS_MSA"
+ "ilvr.d\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
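+;; Q-format fixed-point multiply-add.  Operand 1 is the accumulator and
+;; is tied to the destination; maddr_q is the rounding form.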
+(define_insn "msa_madd_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "0")
+ (match_operand:QMSA 2 "register_operand" "f")
+ (match_operand:QMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_MADD_Q))]
+ "ISA_HAS_MSA"
+ "madd_q.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_maddr_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "0")
+ (match_operand:QMSA 2 "register_operand" "f")
+ (match_operand:QMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_MADDR_Q))]
+ "ISA_HAS_MSA"
+ "maddr_q.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
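+;; max_a picks, element by element, the input with the larger absolute
+;; value.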
+(define_insn "msa_max_a_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_MAX_A))]
+ "ISA_HAS_MSA"
+ "max_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "smax<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (smax:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "max_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "umax<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (umax:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "max_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
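+;; maxi_s/maxi_u (and mini_s/mini_u below) compare against a 5-bit
+;; signed/unsigned immediate instead of a second vector.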
+(define_insn "msa_maxi_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_imm5_operand" "")]
+ UNSPEC_MSA_MAXI_S))]
+ "ISA_HAS_MSA"
+ "maxi_s.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_maxi_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_uimm5_operand" "")]
+ UNSPEC_MSA_MAXI_U))]
+ "ISA_HAS_MSA"
+ "maxi_u.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_min_a_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_MIN_A))]
+ "ISA_HAS_MSA"
+ "min_a.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "smin<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (smin:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "min_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "umin<mode>3"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (umin:IMSA (match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "min_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_mini_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_imm5_operand" "")]
+ UNSPEC_MSA_MINI_S))]
+ "ISA_HAS_MSA"
+ "mini_s.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_mini_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_uimm5_operand" "")]
+ UNSPEC_MSA_MINI_U))]
+ "ISA_HAS_MSA"
+ "mini_u.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_msub_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "0")
+ (match_operand:QMSA 2 "register_operand" "f")
+ (match_operand:QMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_MSUB_Q))]
+ "ISA_HAS_MSA"
+ "msub_q.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_msubr_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "0")
+ (match_operand:QMSA 2 "register_operand" "f")
+ (match_operand:QMSA 3 "register_operand" "f")]
+ UNSPEC_MSA_MSUBR_Q))]
+ "ISA_HAS_MSA"
+ "msubr_q.<msafmt>\t%w0,%w2,%w3"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_mul_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "f")
+ (match_operand:QMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_MUL_Q))]
+ "ISA_HAS_MSA"
+ "mul_q.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
+(define_insn "msa_mulr_q_<msafmt>"
+ [(set (match_operand:QMSA 0 "register_operand" "=f")
+ (unspec:QMSA [(match_operand:QMSA 1 "register_operand" "f")
+ (match_operand:QMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_MULR_Q))]
+ "ISA_HAS_MSA"
+ "mulr_q.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_mult")])
+
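+;; nloc/nlzc count the leading one/zero bits of each element; nlzc is
+;; exposed through the generic clz<mode>2 name.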
+(define_insn "msa_nloc_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_NLOC))]
+ "ISA_HAS_MSA"
+ "nloc.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "clz<mode>2"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (clz:IMSA (match_operand:IMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "nlzc.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_nor_v_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (and:IMSA (not:IMSA (match_operand:IMSA 1 "register_operand" "f"))
+ (not:IMSA (match_operand:IMSA 2 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "nor.v\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "nor")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_nori_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "f")
+ (match_operand 2 "const_uimm8_operand" "")]
+ UNSPEC_MSA_NORI_B))]
+ "ISA_HAS_MSA"
+ "nori.b\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_ori_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (ior:V16QI (match_operand:V16QI 1 "register_operand" "f")
+ (match_operand 2 "const_uimm8_operand" "")))]
+ "ISA_HAS_MSA"
+ "ori.b\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
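+;; pckev/pckod pack the even/odd-indexed elements of the two sources
+;; into the destination.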
+(define_insn "msa_pckev_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_PCKEV))]
+ "ISA_HAS_MSA"
+ "pckev.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_pckod_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_PCKOD))]
+ "ISA_HAS_MSA"
+ "pckod.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "alu_type" "add")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "popcount<mode>2"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (popcount:IMSA (match_operand:IMSA 1 "register_operand" "f")))]
+ "ISA_HAS_MSA"
+ "pcnt.<msafmt>\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic3")
+ (const_string "msa_eu_logic")))])
+
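+;; sat_s/sat_u saturate each element to the signed/unsigned range whose
+;; width is selected by the immediate operand.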
+(define_insn "msa_sat_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SAT_S))]
+ "ISA_HAS_MSA"
+ "sat_s.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic3")])
+
+(define_insn "msa_sat_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SAT_U))]
+ "ISA_HAS_MSA"
+ "sat_u.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic3")])
+
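+;; shf reorders the elements within each group of four according to the
+;; 8-bit immediate control; a separate pattern covers the V4SF case.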
+(define_insn "msa_shf_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_uimm8_operand" "")]
+ UNSPEC_MSA_SHF))]
+ "ISA_HAS_MSA"
+ "shf.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "datafmt" "<msafmt>")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_shf_w_f"
+ [(set (match_operand:V4SF 0 "register_operand" "=f")
+ (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "f")
+ (match_operand 2 "const_uimm8_operand" "")]
+ UNSPEC_MSA_SHF))]
+ "ISA_HAS_MSA"
+ "shf.w\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
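+;; Element shifts by immediate (slli/srai/srli) and the rounding shifts
+;; srar/srlr together with their immediate forms srari/srlri.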
+(define_insn "msa_slli_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SLLI))]
+ "ISA_HAS_MSA"
+ "slli.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srai_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SRAI))]
+ "ISA_HAS_MSA"
+ "srai.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srar_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SRAR))]
+ "ISA_HAS_MSA"
+ "srar.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srari_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SRARI))]
+ "ISA_HAS_MSA"
+ "srari.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srli_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SRLI))]
+ "ISA_HAS_MSA"
+ "srli.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srlr_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SRLR))]
+ "ISA_HAS_MSA"
+ "srlr.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
+(define_insn "msa_srlri_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_<bitimm>_operand" "")]
+ UNSPEC_MSA_SRLRI))]
+ "ISA_HAS_MSA"
+ "srlri.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic2")
+ (const_string "msa_eu_logic")))])
+
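+;; Saturating subtraction: subs_s/subs_u saturate as signed/unsigned,
+;; while subsuu_s and subsus_u handle the mixed-signedness cases.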
+(define_insn "msa_subs_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SUBS_S))]
+ "ISA_HAS_MSA"
+ "subs_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_subs_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SUBS_U))]
+ "ISA_HAS_MSA"
+ "subs_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_subsuu_s_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SUBSUU_S))]
+ "ISA_HAS_MSA"
+ "subsuu_s.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_subsus_u_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand:IMSA 2 "register_operand" "f")]
+ UNSPEC_MSA_SUBSUS_U))]
+ "ISA_HAS_MSA"
+ "subsus_u.<msafmt>\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_subvi_<msafmt>"
+ [(set (match_operand:IMSA 0 "register_operand" "=f")
+ (unspec:IMSA [(match_operand:IMSA 1 "register_operand" "f")
+ (match_operand 2 "const_uimm5_operand" "")]
+ UNSPEC_MSA_SUBVI))]
+ "ISA_HAS_MSA"
+ "subvi.<msafmt>\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_int_add")])
+
+(define_insn "msa_xori_b"
+ [(set (match_operand:V16QI 0 "register_operand" "=f")
+ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "f")
+ (match_operand 2 "const_uimm8_operand" "")]
+ UNSPEC_MSA_XORI_B))]
+ "ISA_HAS_MSA"
+ "xori.b\t%w0,%w1,%2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
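+;; sld/sldi slide the vector elements by a column count taken from a GPR
+;; (sld, where %z3 allows $0) or an immediate (sldi); operand 1 is tied
+;; to the destination, which also acts as a source.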
+(define_insn "msa_sld_<msafmt_f>"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (unspec:MSA [(match_operand:MSA 1 "register_operand" "0")
+ (match_operand:MSA 2 "register_operand" "f")
+ (match_operand:SI 3 "reg_or_0_operand" "dJ")]
+ UNSPEC_MSA_SLD))]
+ "ISA_HAS_MSA"
+ "sld.<msafmt>\t%w0,%w2[%z3]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
+(define_insn "msa_sldi_<msafmt_f>"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (unspec:MSA [(match_operand:MSA 1 "register_operand" "0")
+ (match_operand:MSA 2 "register_operand" "f")
+ (match_operand 3 "const_<indeximm>_operand" "")]
+ UNSPEC_MSA_SLDI))]
+ "ISA_HAS_MSA"
+ "sldi.<msafmt>\t%w0,%w2[%3]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set (attr "msa_execunit")
+ (if_then_else (eq_attr "cpu" "i6400")
+ (const_string "msa_eu_logic_l2")
+ (const_string "msa_eu_logic_l")))])
+
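+;; splat/splati replicate the element selected by a GPR or an immediate
+;; across the whole destination register.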
+(define_insn "msa_splat_<msafmt_f>"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (unspec:MSA [(match_operand:MSA 1 "register_operand" "f")
+ (match_operand:SI 2 "reg_or_0_operand" "dJ")]
+ UNSPEC_MSA_SPLAT))]
+ "ISA_HAS_MSA"
+ "splat.<msafmt>\t%w0,%w1[%z2]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+(define_insn "msa_splati_<msafmt_f>"
+ [(set (match_operand:MSA 0 "register_operand" "=f")
+ (unspec:MSA [(match_operand:MSA 1 "register_operand" "f")
+ (match_operand 2 "const_<indeximm>_operand" "")]
+ UNSPEC_MSA_SPLATI))]
+ "ISA_HAS_MSA"
+ "splati.<msafmt>\t%w0,%w1[%2]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
+;; Operand 1 is a scalar.
+(define_insn "msa_splati_<msafmt_f>_s"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:<UNITMODE> 1 "register_operand" "f")
+ (match_operand 2 "const_<indeximm>_operand" "")]
+ UNSPEC_MSA_SPLATI))]
+ "ISA_HAS_MSA"
+ "splati.<msafmt>\t%w0,%w1[%2]"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_logic")])
+
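+;; Moves between GPRs and the MSA control registers, modelled as
+;; volatile unspecs so that they are not optimized away.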
+(define_insn "msa_cfcmsa"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (unspec_volatile:SI [(match_operand 1 "const_uimm5_operand" "")]
+ UNSPEC_MSA_CFCMSA))]
+ "ISA_HAS_MSA"
+ "cfcmsa\t%0,$%1"
+ [(set_attr "type" "mfc")
+ (set_attr "mode" "SI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_insn "msa_ctcmsa"
+ [(unspec_volatile [(match_operand 0 "const_uimm5_operand" "")
+ (match_operand:SI 1 "register_operand" "d")]
+ UNSPEC_MSA_CTCMSA)]
+ "ISA_HAS_MSA"
+ "ctcmsa\t$%0,%1"
+ [(set_attr "type" "mtc")
+ (set_attr "mode" "SI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
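+;; fexdo packs two wider floating-point vectors into one narrower
+;; vector; fexupl/fexupr widen the left/right half of the source.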
+(define_insn "msa_fexdo_h"
+ [(set (match_operand:V8HI 0 "register_operand" "=f")
+ (unspec:V8HI [(match_operand:V4SF 1 "register_operand" "f")
+ (match_operand:V4SF 2 "register_operand" "f")]
+ UNSPEC_MSA_FEXDO))]
+ "ISA_HAS_MSA"
+ "fexdo.h\t%w0,%w1,%w2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_fexdo_w"
+ [(set (match_operand:V4SF 0 "register_operand" "=f")
+ (vec_concat:V4SF
+ (float_truncate:V2SF (match_operand:V2DF 1 "register_operand" "f"))
+ (float_truncate:V2SF (match_operand:V2DF 2 "register_operand" "f"))))]
+ "ISA_HAS_MSA"
+ "fexdo.w\t%w0,%w2,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_fexupl_w"
+ [(set (match_operand:V4SF 0 "register_operand" "=f")
+ (unspec:V4SF [(match_operand:V8HI 1 "register_operand" "f")]
+ UNSPEC_MSA_FEXUPL))]
+ "ISA_HAS_MSA"
+ "fexupl.w\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_fexupl_d"
+ [(set (match_operand:V2DF 0 "register_operand" "=f")
+ (unspec:V2DF [(match_operand:V4SF 1 "register_operand" "f")]
+ UNSPEC_MSA_FEXUPL))]
+ "ISA_HAS_MSA"
+ "fexupl.d\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_fexupr_w"
+ [(set (match_operand:V4SF 0 "register_operand" "=f")
+ (unspec:V4SF [(match_operand:V8HI 1 "register_operand" "f")]
+ UNSPEC_MSA_FEXUPR))]
+ "ISA_HAS_MSA"
+ "fexupr.w\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
+(define_insn "msa_fexupr_d"
+ [(set (match_operand:V2DF 0 "register_operand" "=f")
+ (unspec:V2DF [(match_operand:V4SF 1 "register_operand" "f")]
+ UNSPEC_MSA_FEXUPR))]
+ "ISA_HAS_MSA"
+ "fexupr.d\t%w0,%w1"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_float4")])
+
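+;; Branches on the contents of an MSA register.  The .v forms test the
+;; register as a whole; the per-format forms test it element by element.
+;; The bnz/bz expanders reuse these branch patterns through
+;; mips_expand_msa_branch.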
+(define_insn "msa_branch_nz_v_<msafmt_f>"
+ [(set (pc) (if_then_else
+ (ne (unspec:SI [(match_operand:MSA 1 "register_operand" "f")]
+ UNSPEC_MSA_BNZ_V)
+ (match_operand:SI 2 "const_0_operand"))
+ (label_ref (match_operand 0))
+ (pc)))]
+ "ISA_HAS_MSA"
+ {
+ return mips_output_conditional_branch (insn, operands,
+ MIPS_BRANCH ("bnz.v", "%w1,%0"),
+ MIPS_BRANCH ("bz.v", "%w1,%0"));
+ }
+ [(set_attr "type" "branch")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_expand "msa_bnz_v_<msafmt_f>"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (unspec:SI [(match_operand:MSA 1 "register_operand" "f")]
+ UNSPEC_MSA_TSTNZ_V))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_msa_branch (operands, gen_msa_branch_nz_v_<MSA:msafmt_f>);
+ DONE;
+ })
+
+(define_insn "msa_branchz_v_<msafmt_f>"
+ [(set (pc) (if_then_else
+ (eq (unspec:SI [(match_operand:MSA 1 "register_operand" "f")]
+ UNSPEC_MSA_BZ_V)
+ (match_operand:SI 2 "const_0_operand"))
+ (label_ref (match_operand 0))
+ (pc)))]
+ "ISA_HAS_MSA"
+ {
+ return mips_output_conditional_branch (insn, operands,
+ MIPS_BRANCH ("bz.v", "%w1,%0"),
+ MIPS_BRANCH ("bnz.v", "%w1,%0"));
+ }
+ [(set_attr "type" "branch")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_expand "msa_bz_v_<msafmt_f>"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (unspec:SI [(match_operand:MSA 1 "register_operand" "f")]
+ UNSPEC_MSA_TSTZ_V))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_msa_branch (operands, gen_msa_branchz_v_<MSA:msafmt_f>);
+ DONE;
+ })
+
+(define_insn "msa_branchnz_<msafmt_f>"
+ [(set (pc) (if_then_else
+ (ne (unspec:SI [(match_operand:MSA 1 "register_operand" "f")]
+ UNSPEC_MSA_BNZ)
+ (match_operand:SI 2 "const_0_operand"))
+ (label_ref (match_operand 0))
+ (pc)))]
+ "ISA_HAS_MSA"
+ {
+ return mips_output_conditional_branch (insn, operands,
+ MIPS_BRANCH ("bnz.<msafmt>", "%w1,%0"),
+ MIPS_BRANCH ("bz.<msafmt>", "%w1,%0"));
+ }
+ [(set_attr "type" "branch")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_expand "msa_bnz_<msafmt>"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (unspec:SI [(match_operand:IMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_TSTNZ))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_msa_branch (operands, gen_msa_branchnz_<IMSA:msafmt>);
+ DONE;
+ })
+
+(define_insn "msa_branchz_<msafmt>"
+ [(set (pc) (if_then_else
+ (eq (unspec:SI [(match_operand:IMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_BZ)
+ (match_operand:SI 2 "const_0_operand"))
+ (label_ref (match_operand 0))
+ (pc)))]
+ "ISA_HAS_MSA"
+ {
+ return mips_output_conditional_branch (insn, operands,
+ MIPS_BRANCH ("bz.<msafmt>", "%w1,%0"),
+ MIPS_BRANCH ("bnz.<msafmt>","%w1,%0"));
+ }
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")
+ (set_attr "msa_execunit" "msa_eu_store4")])
+
+(define_expand "msa_bz_<msafmt>"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (unspec:SI [(match_operand:IMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_TSTZ))]
+ "ISA_HAS_MSA"
+ {
+ mips_expand_msa_branch (operands, gen_msa_branchz_<IMSA:msafmt>);
+ DONE;
+ })
+
+;; Note that this instruction freely reinterprets a scalar register as a
+;; vector register.
+(define_insn "msa_cast_to_vector_<msafmt_f>"
+ [(set (match_operand:FMSA 0 "register_operand" "=f")
+ (unspec:FMSA [(match_operand:<UNITMODE> 1 "register_operand" "f")]
+ UNSPEC_MSA_CAST_TO_VECTOR))]
+ "ISA_HAS_MSA"
+{
+ if (REGNO (operands[0]) == REGNO (operands[1]))
+ return "nop\t# Cast %1 to %w0";
+ else
+ return "mov.<unitfmt>\t%0,%1\t# Cast %1 to %w0";
+}
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")])
+
+;; Note that this instruction freely reinterprets a vector register as a
+;; scalar register.
+(define_insn "msa_cast_to_scalar_<msafmt_f>"
+ [(set (match_operand:<UNITMODE> 0 "register_operand" "=f")
+ (unspec:<UNITMODE> [(match_operand:FMSA 1 "register_operand" "f")]
+ UNSPEC_MSA_CAST_TO_SCALAR))]
+ "ISA_HAS_MSA"
+{
+ if (REGNO (operands[0]) == REGNO (operands[1]))
+ return "nop\t# Cast %w1 to %0";
+ else
+ return "mov.<unitfmt>\t%0,%1\t# Cast %w1 to %0";
+}
+ [(set_attr "type" "arith")
+ (set_attr "mode" "TI")])
diff --git a/gcc/config/mips/mips-opts.h b/gcc/config/mips/mips-opts.h
index be288d64c0c..9d0de49211f 100644
--- a/gcc/config/mips/mips-opts.h
+++ b/gcc/config/mips/mips-opts.h
@@ -47,4 +47,10 @@ enum mips_r10k_cache_barrier_setting {
#define MIPS_ARCH_OPTION_FROM_ABI -1
#define MIPS_ARCH_OPTION_NATIVE -2
+/* Enumerates the setting of the -mcompact-branches= option. */
+enum mips_cb_setting {
+ MIPS_CB_NEVER,
+ MIPS_CB_OPTIMAL,
+ MIPS_CB_ALWAYS
+};
#endif
diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h
index 3d59b7b51e0..6ce3d70e6e9 100644
--- a/gcc/config/mips/mips-protos.h
+++ b/gcc/config/mips/mips-protos.h
@@ -192,47 +192,54 @@ enum mips_split_type {
extern bool mips_symbolic_constant_p (rtx, enum mips_symbol_context,
enum mips_symbol_type *);
-extern int mips_regno_mode_ok_for_base_p (int, enum machine_mode, bool);
-extern bool mips_stack_address_p (rtx, enum machine_mode);
-extern int mips_address_insns (rtx, enum machine_mode, bool);
+extern int mips_regno_mode_ok_for_base_p (int, machine_mode, bool);
+extern bool mips_stack_address_p (rtx, machine_mode);
+extern int mips_address_insns (rtx, machine_mode, bool);
extern int mips_const_insns (rtx);
extern int mips_split_const_insns (rtx);
+extern int mips_split_128bit_const_insns (rtx);
extern int mips_load_store_insns (rtx, rtx);
extern int mips_idiv_insns (void);
+extern int mips_msa_idiv_insns (void);
extern rtx mips_emit_move (rtx, rtx);
#ifdef RTX_CODE
extern void mips_emit_binary (enum rtx_code, rtx, rtx, rtx);
#endif
extern rtx mips_pic_base_register (rtx);
extern rtx mips_got_load (rtx, rtx, enum mips_symbol_type);
-extern bool mips_split_symbol (rtx, rtx, enum machine_mode, rtx *);
+extern bool mips_split_symbol (rtx, rtx, machine_mode, rtx *);
extern rtx mips_unspec_address (rtx, enum mips_symbol_type);
extern rtx mips_strip_unspec_address (rtx);
extern void mips_move_integer (rtx, rtx, unsigned HOST_WIDE_INT);
-extern bool mips_legitimize_move (enum machine_mode, rtx, rtx);
+extern bool mips_legitimize_move (machine_mode, rtx, rtx);
extern rtx mips_subword (rtx, bool);
extern bool mips_split_move_p (rtx, rtx, enum mips_split_type);
extern void mips_split_move (rtx, rtx, enum mips_split_type);
extern bool mips_split_move_insn_p (rtx, rtx, rtx);
extern void mips_split_move_insn (rtx, rtx, rtx);
+extern void mips_split_128bit_move (rtx, rtx);
+extern bool mips_split_128bit_move_p (rtx, rtx);
+extern void mips_split_msa_copy_d (rtx, rtx, rtx, rtx (*)(rtx, rtx, rtx));
+extern void mips_split_msa_insert_d (rtx, rtx, rtx, rtx);
+extern void mips_split_msa_fill_d (rtx, rtx);
extern const char *mips_output_move (rtx, rtx);
extern bool mips_cfun_has_cprestore_slot_p (void);
extern bool mips_cprestore_address_p (rtx, bool);
extern void mips_save_gp_to_cprestore_slot (rtx, rtx, rtx, rtx);
extern void mips_restore_gp_from_cprestore_slot (rtx);
-#ifdef RTX_CODE
extern void mips_expand_scc (rtx *);
extern void mips_expand_conditional_branch (rtx *);
+#ifdef RTX_CODE
extern void mips_expand_vcondv2sf (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
+#endif
extern void mips_expand_conditional_move (rtx *);
extern void mips_expand_conditional_trap (rtx);
-#endif
+extern void mips_expand_msa_branch (rtx *operands, rtx (*gen_fn)(rtx, rtx, rtx));
extern bool mips_use_pic_fn_addr_reg_p (const_rtx);
extern rtx mips_expand_call (enum mips_call_type, rtx, rtx, rtx, rtx, bool);
extern void mips_split_call (rtx, rtx);
extern bool mips_get_pic_call_symbol (rtx *, int);
-extern void mips_expand_fcc_reload (rtx, rtx, rtx);
extern void mips_set_return_address (rtx, rtx);
extern bool mips_move_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int);
extern bool mips_store_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int);
@@ -240,15 +247,14 @@ extern bool mips_expand_block_move (rtx, rtx, rtx);
extern void mips_expand_synci_loop (rtx, rtx);
extern void mips_init_cumulative_args (CUMULATIVE_ARGS *, tree);
-extern bool mips_pad_arg_upward (enum machine_mode, const_tree);
-extern bool mips_pad_reg_upward (enum machine_mode, tree);
+extern bool mips_pad_arg_upward (machine_mode, const_tree);
+extern bool mips_pad_reg_upward (machine_mode, tree);
extern bool mips_expand_ext_as_unaligned_load (rtx, rtx, HOST_WIDE_INT,
HOST_WIDE_INT, bool);
extern bool mips_expand_ins_as_unaligned_store (rtx, rtx, HOST_WIDE_INT,
HOST_WIDE_INT);
-extern bool mips_mem_fits_mode_p (enum machine_mode mode, rtx x);
-extern void mips_order_regs_for_local_alloc (void);
+extern bool mips_mem_fits_mode_p (machine_mode mode, rtx x);
extern HOST_WIDE_INT mips_debugger_offset (rtx, HOST_WIDE_INT);
extern void mips_push_asm_switch (struct mips_asm_switch *);
@@ -278,27 +284,40 @@ extern void mips_expand_prologue (void);
extern void mips_expand_before_return (void);
extern void mips_expand_epilogue (bool);
extern bool mips_can_use_return_insn (void);
-
-extern bool mips_cannot_change_mode_class (enum machine_mode,
- enum machine_mode, enum reg_class);
+extern bool mips_const_vector_same_val_p (rtx, machine_mode);
+extern bool mips_const_vector_same_byte_p (rtx, machine_mode);
+extern bool mips_const_vector_same_int_p (rtx, machine_mode, HOST_WIDE_INT,
+ HOST_WIDE_INT);
+extern bool mips_const_vector_bitimm_set_p (rtx, machine_mode);
+extern bool mips_const_vector_bitimm_clr_p (rtx, machine_mode);
+extern bool mips_secondary_memory_needed (enum reg_class, enum reg_class,
+ machine_mode);
+extern bool mips_cannot_change_mode_class (machine_mode,
+ machine_mode, enum reg_class);
extern bool mips_dangerous_for_la25_p (rtx);
-extern bool mips_modes_tieable_p (enum machine_mode, enum machine_mode);
+extern bool mips_modes_tieable_p (machine_mode, machine_mode);
extern enum reg_class mips_secondary_reload_class (enum reg_class,
- enum machine_mode,
+ machine_mode,
rtx, bool);
-extern int mips_class_max_nregs (enum reg_class, enum machine_mode);
+extern int mips_class_max_nregs (enum reg_class, machine_mode);
+extern machine_mode mips_hard_regno_caller_save_mode (unsigned int,
+ unsigned int,
+ machine_mode);
extern int mips_adjust_insn_length (rtx, int);
extern void mips_output_load_label (rtx);
extern const char *mips_output_conditional_branch (rtx, rtx *, const char *,
const char *);
+extern const char *mips_output_jump (rtx *, int, int, bool);
+extern const char *mips_output_equal_conditional_branch (rtx, rtx *, bool);
extern const char *mips_output_order_conditional_branch (rtx, rtx *, bool);
extern const char *mips_output_sync (void);
extern const char *mips_output_sync_loop (rtx, rtx *);
extern unsigned int mips_sync_loop_insns (rtx, rtx *);
extern const char *mips_output_division (const char *, rtx *);
+extern const char *mips_msa_output_division (const char *, rtx *);
extern const char *mips_output_probe_stack_range (rtx, rtx);
-extern unsigned int mips_hard_regno_nregs (int, enum machine_mode);
+extern unsigned int mips_hard_regno_nregs (int, machine_mode);
extern bool mips_linked_madd_p (rtx, rtx);
extern bool mips_store_data_bypass_p (rtx, rtx);
extern int mips_dspalu_bypass_p (rtx, rtx);
@@ -312,9 +331,10 @@ extern const char *mips16e_output_save_restore (rtx, HOST_WIDE_INT);
extern bool mips16e_save_restore_pattern_p (rtx, HOST_WIDE_INT,
struct mips16e_save_restore_info *);
-extern bool mask_low_and_shift_p (enum machine_mode, rtx, rtx, int);
-extern int mask_low_and_shift_len (enum machine_mode, rtx, rtx);
-extern bool and_operands_ok (enum machine_mode, rtx, rtx);
+extern bool mask_low_and_shift_p (machine_mode, rtx, rtx, int);
+extern int mask_low_and_shift_len (machine_mode, rtx, rtx);
+extern bool and_operands_ok (machine_mode, rtx, rtx);
+extern bool mips_fmadd_bypass (rtx, rtx);
union mips_gen_fn_ptrs
{
@@ -333,6 +353,7 @@ extern void mips_expand_vec_reduc (rtx, rtx, rtx (*)(rtx, rtx, rtx));
extern void mips_expand_vec_minmax (rtx, rtx, rtx,
rtx (*) (rtx, rtx, rtx), bool);
+extern int mips_ldst_scaled_shift (machine_mode);
extern bool mips_signed_immediate_p (unsigned HOST_WIDE_INT, int, int);
extern bool mips_unsigned_immediate_p (unsigned HOST_WIDE_INT, int, int);
extern const char *umips_output_save_restore (bool, rtx);
@@ -340,9 +361,10 @@ extern bool umips_save_restore_pattern_p (bool, rtx);
extern bool umips_load_store_pair_p (bool, rtx *);
extern void umips_output_load_store_pair (bool, rtx *);
extern bool umips_movep_target_p (rtx, rtx);
-extern bool umips_12bit_offset_address_p (rtx, enum machine_mode);
-extern bool lwsp_swsp_address_p (rtx, enum machine_mode);
-extern bool m16_based_address_p (rtx, enum machine_mode,
+extern bool umips_12bit_offset_address_p (rtx, machine_mode);
+extern bool mips_9bit_offset_address_p (rtx, machine_mode);
+extern bool lwsp_swsp_address_p (rtx, machine_mode);
+extern bool m16_based_address_p (rtx, machine_mode,
int (*)(rtx_def*, machine_mode));
extern rtx mips_expand_thread_pointer (rtx);
extern void mips16_expand_get_fcsr (rtx);
@@ -353,10 +375,18 @@ extern bool mips_epilogue_uses (unsigned int);
extern void mips_final_prescan_insn (rtx, rtx *, int);
extern int mips_trampoline_code_size (void);
extern void mips_function_profiler (FILE *);
+extern bool mips_load_store_bonding_p (rtx *, machine_mode, bool);
typedef rtx (*mulsidi3_gen_fn) (rtx, rtx, rtx);
#ifdef RTX_CODE
extern mulsidi3_gen_fn mips_mulsidi3_gen_fn (enum rtx_code);
#endif
+extern void mips_expand_vec_cond_expr (machine_mode,
+ machine_mode,
+ rtx *,
+ rtx (*)(rtx, rtx, rtx),
+ rtx (*)(rtx, rtx, rtx),
+ rtx (*)(rtx, rtx, rtx));
+
#endif /* ! GCC_MIPS_PROTOS_H */
diff --git a/gcc/config/mips/mips-tables.opt b/gcc/config/mips/mips-tables.opt
index 760b764e3ea..36156fd60cd 100644
--- a/gcc/config/mips/mips-tables.opt
+++ b/gcc/config/mips/mips-tables.opt
@@ -70,575 +70,620 @@ EnumValue
Enum(mips_mips_opt_value) String(32r2) Value(5)
EnumValue
-Enum(mips_arch_opt_value) String(mips64) Value(6) Canonical
+Enum(mips_arch_opt_value) String(mips32r3) Value(6) Canonical
EnumValue
-Enum(mips_mips_opt_value) String(64) Value(6)
+Enum(mips_mips_opt_value) String(32r3) Value(6)
EnumValue
-Enum(mips_arch_opt_value) String(mips64r2) Value(7) Canonical
+Enum(mips_arch_opt_value) String(mips32r5) Value(7) Canonical
EnumValue
-Enum(mips_mips_opt_value) String(64r2) Value(7)
+Enum(mips_mips_opt_value) String(32r5) Value(7)
EnumValue
-Enum(mips_arch_opt_value) String(r3000) Value(8) Canonical
+Enum(mips_arch_opt_value) String(mips32r6) Value(8) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r3k) Value(8)
+Enum(mips_mips_opt_value) String(32r6) Value(8)
EnumValue
-Enum(mips_arch_opt_value) String(3000) Value(8)
+Enum(mips_arch_opt_value) String(mips64) Value(9) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(3k) Value(8)
+Enum(mips_mips_opt_value) String(64) Value(9)
EnumValue
-Enum(mips_arch_opt_value) String(r2000) Value(9) Canonical
+Enum(mips_arch_opt_value) String(mips64r2) Value(10) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r2k) Value(9)
+Enum(mips_mips_opt_value) String(64r2) Value(10)
EnumValue
-Enum(mips_arch_opt_value) String(2000) Value(9)
+Enum(mips_arch_opt_value) String(mips64r3) Value(11) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(2k) Value(9)
+Enum(mips_mips_opt_value) String(64r3) Value(11)
EnumValue
-Enum(mips_arch_opt_value) String(r3900) Value(10) Canonical
+Enum(mips_arch_opt_value) String(mips64r5) Value(12) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(3900) Value(10)
+Enum(mips_mips_opt_value) String(64r5) Value(12)
EnumValue
-Enum(mips_arch_opt_value) String(r6000) Value(11) Canonical
+Enum(mips_arch_opt_value) String(mips64r6) Value(13) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r6k) Value(11)
+Enum(mips_mips_opt_value) String(64r6) Value(13)
EnumValue
-Enum(mips_arch_opt_value) String(6000) Value(11)
+Enum(mips_arch_opt_value) String(r3000) Value(14) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(6k) Value(11)
+Enum(mips_arch_opt_value) String(r3k) Value(14)
EnumValue
-Enum(mips_arch_opt_value) String(r4000) Value(12) Canonical
+Enum(mips_arch_opt_value) String(3000) Value(14)
EnumValue
-Enum(mips_arch_opt_value) String(r4k) Value(12)
+Enum(mips_arch_opt_value) String(3k) Value(14)
EnumValue
-Enum(mips_arch_opt_value) String(4000) Value(12)
+Enum(mips_arch_opt_value) String(r2000) Value(15) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4k) Value(12)
+Enum(mips_arch_opt_value) String(r2k) Value(15)
EnumValue
-Enum(mips_arch_opt_value) String(vr4100) Value(13) Canonical
+Enum(mips_arch_opt_value) String(2000) Value(15)
EnumValue
-Enum(mips_arch_opt_value) String(4100) Value(13)
+Enum(mips_arch_opt_value) String(2k) Value(15)
EnumValue
-Enum(mips_arch_opt_value) String(r4100) Value(13)
+Enum(mips_arch_opt_value) String(r3900) Value(16) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(vr4111) Value(14) Canonical
+Enum(mips_arch_opt_value) String(3900) Value(16)
EnumValue
-Enum(mips_arch_opt_value) String(4111) Value(14)
+Enum(mips_arch_opt_value) String(r6000) Value(17) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r4111) Value(14)
+Enum(mips_arch_opt_value) String(r6k) Value(17)
EnumValue
-Enum(mips_arch_opt_value) String(vr4120) Value(15) Canonical
+Enum(mips_arch_opt_value) String(6000) Value(17)
EnumValue
-Enum(mips_arch_opt_value) String(4120) Value(15)
+Enum(mips_arch_opt_value) String(6k) Value(17)
EnumValue
-Enum(mips_arch_opt_value) String(r4120) Value(15)
+Enum(mips_arch_opt_value) String(r4000) Value(18) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(vr4130) Value(16) Canonical
+Enum(mips_arch_opt_value) String(r4k) Value(18)
EnumValue
-Enum(mips_arch_opt_value) String(4130) Value(16)
+Enum(mips_arch_opt_value) String(4000) Value(18)
EnumValue
-Enum(mips_arch_opt_value) String(r4130) Value(16)
+Enum(mips_arch_opt_value) String(4k) Value(18)
EnumValue
-Enum(mips_arch_opt_value) String(vr4300) Value(17) Canonical
+Enum(mips_arch_opt_value) String(vr4100) Value(19) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4300) Value(17)
+Enum(mips_arch_opt_value) String(4100) Value(19)
EnumValue
-Enum(mips_arch_opt_value) String(r4300) Value(17)
+Enum(mips_arch_opt_value) String(r4100) Value(19)
EnumValue
-Enum(mips_arch_opt_value) String(r4400) Value(18) Canonical
+Enum(mips_arch_opt_value) String(vr4111) Value(20) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4400) Value(18)
+Enum(mips_arch_opt_value) String(4111) Value(20)
EnumValue
-Enum(mips_arch_opt_value) String(r4600) Value(19) Canonical
+Enum(mips_arch_opt_value) String(r4111) Value(20)
EnumValue
-Enum(mips_arch_opt_value) String(4600) Value(19)
+Enum(mips_arch_opt_value) String(vr4120) Value(21) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(orion) Value(20) Canonical
+Enum(mips_arch_opt_value) String(4120) Value(21)
EnumValue
-Enum(mips_arch_opt_value) String(r4650) Value(21) Canonical
+Enum(mips_arch_opt_value) String(r4120) Value(21)
EnumValue
-Enum(mips_arch_opt_value) String(4650) Value(21)
+Enum(mips_arch_opt_value) String(vr4130) Value(22) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r4700) Value(22) Canonical
+Enum(mips_arch_opt_value) String(4130) Value(22)
EnumValue
-Enum(mips_arch_opt_value) String(4700) Value(22)
+Enum(mips_arch_opt_value) String(r4130) Value(22)
EnumValue
-Enum(mips_arch_opt_value) String(r5900) Value(23) Canonical
+Enum(mips_arch_opt_value) String(vr4300) Value(23) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(5900) Value(23)
+Enum(mips_arch_opt_value) String(4300) Value(23)
EnumValue
-Enum(mips_arch_opt_value) String(loongson2e) Value(24) Canonical
+Enum(mips_arch_opt_value) String(r4300) Value(23)
EnumValue
-Enum(mips_arch_opt_value) String(loongson2f) Value(25) Canonical
+Enum(mips_arch_opt_value) String(r4400) Value(24) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r8000) Value(26) Canonical
+Enum(mips_arch_opt_value) String(4400) Value(24)
EnumValue
-Enum(mips_arch_opt_value) String(r8k) Value(26)
+Enum(mips_arch_opt_value) String(r4600) Value(25) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(8000) Value(26)
+Enum(mips_arch_opt_value) String(4600) Value(25)
EnumValue
-Enum(mips_arch_opt_value) String(8k) Value(26)
+Enum(mips_arch_opt_value) String(orion) Value(26) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r10000) Value(27) Canonical
+Enum(mips_arch_opt_value) String(r4650) Value(27) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r10k) Value(27)
+Enum(mips_arch_opt_value) String(4650) Value(27)
EnumValue
-Enum(mips_arch_opt_value) String(10000) Value(27)
+Enum(mips_arch_opt_value) String(r4700) Value(28) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(10k) Value(27)
+Enum(mips_arch_opt_value) String(4700) Value(28)
EnumValue
-Enum(mips_arch_opt_value) String(r12000) Value(28) Canonical
+Enum(mips_arch_opt_value) String(r5900) Value(29) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r12k) Value(28)
+Enum(mips_arch_opt_value) String(5900) Value(29)
EnumValue
-Enum(mips_arch_opt_value) String(12000) Value(28)
+Enum(mips_arch_opt_value) String(loongson2e) Value(30) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(12k) Value(28)
+Enum(mips_arch_opt_value) String(loongson2f) Value(31) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r14000) Value(29) Canonical
+Enum(mips_arch_opt_value) String(r8000) Value(32) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r14k) Value(29)
+Enum(mips_arch_opt_value) String(r8k) Value(32)
EnumValue
-Enum(mips_arch_opt_value) String(14000) Value(29)
+Enum(mips_arch_opt_value) String(8000) Value(32)
EnumValue
-Enum(mips_arch_opt_value) String(14k) Value(29)
+Enum(mips_arch_opt_value) String(8k) Value(32)
EnumValue
-Enum(mips_arch_opt_value) String(r16000) Value(30) Canonical
+Enum(mips_arch_opt_value) String(r10000) Value(33) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r16k) Value(30)
+Enum(mips_arch_opt_value) String(r10k) Value(33)
EnumValue
-Enum(mips_arch_opt_value) String(16000) Value(30)
+Enum(mips_arch_opt_value) String(10000) Value(33)
EnumValue
-Enum(mips_arch_opt_value) String(16k) Value(30)
+Enum(mips_arch_opt_value) String(10k) Value(33)
EnumValue
-Enum(mips_arch_opt_value) String(vr5000) Value(31) Canonical
+Enum(mips_arch_opt_value) String(r12000) Value(34) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(vr5k) Value(31)
+Enum(mips_arch_opt_value) String(r12k) Value(34)
EnumValue
-Enum(mips_arch_opt_value) String(5000) Value(31)
+Enum(mips_arch_opt_value) String(12000) Value(34)
EnumValue
-Enum(mips_arch_opt_value) String(5k) Value(31)
+Enum(mips_arch_opt_value) String(12k) Value(34)
EnumValue
-Enum(mips_arch_opt_value) String(r5000) Value(31)
+Enum(mips_arch_opt_value) String(r14000) Value(35) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r5k) Value(31)
+Enum(mips_arch_opt_value) String(r14k) Value(35)
EnumValue
-Enum(mips_arch_opt_value) String(vr5400) Value(32) Canonical
+Enum(mips_arch_opt_value) String(14000) Value(35)
EnumValue
-Enum(mips_arch_opt_value) String(5400) Value(32)
+Enum(mips_arch_opt_value) String(14k) Value(35)
EnumValue
-Enum(mips_arch_opt_value) String(r5400) Value(32)
+Enum(mips_arch_opt_value) String(r16000) Value(36) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(vr5500) Value(33) Canonical
+Enum(mips_arch_opt_value) String(r16k) Value(36)
EnumValue
-Enum(mips_arch_opt_value) String(5500) Value(33)
+Enum(mips_arch_opt_value) String(16000) Value(36)
EnumValue
-Enum(mips_arch_opt_value) String(r5500) Value(33)
+Enum(mips_arch_opt_value) String(16k) Value(36)
EnumValue
-Enum(mips_arch_opt_value) String(rm7000) Value(34) Canonical
+Enum(mips_arch_opt_value) String(vr5000) Value(37) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(rm7k) Value(34)
+Enum(mips_arch_opt_value) String(vr5k) Value(37)
EnumValue
-Enum(mips_arch_opt_value) String(7000) Value(34)
+Enum(mips_arch_opt_value) String(5000) Value(37)
EnumValue
-Enum(mips_arch_opt_value) String(7k) Value(34)
+Enum(mips_arch_opt_value) String(5k) Value(37)
EnumValue
-Enum(mips_arch_opt_value) String(r7000) Value(34)
+Enum(mips_arch_opt_value) String(r5000) Value(37)
EnumValue
-Enum(mips_arch_opt_value) String(r7k) Value(34)
+Enum(mips_arch_opt_value) String(r5k) Value(37)
EnumValue
-Enum(mips_arch_opt_value) String(rm9000) Value(35) Canonical
+Enum(mips_arch_opt_value) String(vr5400) Value(38) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(rm9k) Value(35)
+Enum(mips_arch_opt_value) String(5400) Value(38)
EnumValue
-Enum(mips_arch_opt_value) String(9000) Value(35)
+Enum(mips_arch_opt_value) String(r5400) Value(38)
EnumValue
-Enum(mips_arch_opt_value) String(9k) Value(35)
+Enum(mips_arch_opt_value) String(vr5500) Value(39) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r9000) Value(35)
+Enum(mips_arch_opt_value) String(5500) Value(39)
EnumValue
-Enum(mips_arch_opt_value) String(r9k) Value(35)
+Enum(mips_arch_opt_value) String(r5500) Value(39)
EnumValue
-Enum(mips_arch_opt_value) String(4kc) Value(36) Canonical
+Enum(mips_arch_opt_value) String(rm7000) Value(40) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r4kc) Value(36)
+Enum(mips_arch_opt_value) String(rm7k) Value(40)
EnumValue
-Enum(mips_arch_opt_value) String(4km) Value(37) Canonical
+Enum(mips_arch_opt_value) String(7000) Value(40)
EnumValue
-Enum(mips_arch_opt_value) String(r4km) Value(37)
+Enum(mips_arch_opt_value) String(7k) Value(40)
EnumValue
-Enum(mips_arch_opt_value) String(4kp) Value(38) Canonical
+Enum(mips_arch_opt_value) String(r7000) Value(40)
EnumValue
-Enum(mips_arch_opt_value) String(r4kp) Value(38)
+Enum(mips_arch_opt_value) String(r7k) Value(40)
EnumValue
-Enum(mips_arch_opt_value) String(4ksc) Value(39) Canonical
+Enum(mips_arch_opt_value) String(rm9000) Value(41) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r4ksc) Value(39)
+Enum(mips_arch_opt_value) String(rm9k) Value(41)
EnumValue
-Enum(mips_arch_opt_value) String(m4k) Value(40) Canonical
+Enum(mips_arch_opt_value) String(9000) Value(41)
EnumValue
-Enum(mips_arch_opt_value) String(m14kc) Value(41) Canonical
+Enum(mips_arch_opt_value) String(9k) Value(41)
EnumValue
-Enum(mips_arch_opt_value) String(m14k) Value(42) Canonical
+Enum(mips_arch_opt_value) String(r9000) Value(41)
EnumValue
-Enum(mips_arch_opt_value) String(m14ke) Value(43) Canonical
+Enum(mips_arch_opt_value) String(r9k) Value(41)
EnumValue
-Enum(mips_arch_opt_value) String(m14kec) Value(44) Canonical
+Enum(mips_arch_opt_value) String(4kc) Value(42) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4kec) Value(45) Canonical
+Enum(mips_arch_opt_value) String(r4kc) Value(42)
EnumValue
-Enum(mips_arch_opt_value) String(r4kec) Value(45)
+Enum(mips_arch_opt_value) String(4km) Value(43) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4kem) Value(46) Canonical
+Enum(mips_arch_opt_value) String(r4km) Value(43)
EnumValue
-Enum(mips_arch_opt_value) String(r4kem) Value(46)
+Enum(mips_arch_opt_value) String(4kp) Value(44) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4kep) Value(47) Canonical
+Enum(mips_arch_opt_value) String(r4kp) Value(44)
EnumValue
-Enum(mips_arch_opt_value) String(r4kep) Value(47)
+Enum(mips_arch_opt_value) String(4ksc) Value(45) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(4ksd) Value(48) Canonical
+Enum(mips_arch_opt_value) String(r4ksc) Value(45)
EnumValue
-Enum(mips_arch_opt_value) String(r4ksd) Value(48)
+Enum(mips_arch_opt_value) String(m4k) Value(46) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(24kc) Value(49) Canonical
+Enum(mips_arch_opt_value) String(m14kc) Value(47) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kc) Value(49)
+Enum(mips_arch_opt_value) String(m14k) Value(48) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(24kf2_1) Value(50) Canonical
+Enum(mips_arch_opt_value) String(m14ke) Value(49) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kf2_1) Value(50)
+Enum(mips_arch_opt_value) String(m14kec) Value(50) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(24kf) Value(51) Canonical
+Enum(mips_arch_opt_value) String(4kec) Value(51) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kf) Value(51)
+Enum(mips_arch_opt_value) String(r4kec) Value(51)
EnumValue
-Enum(mips_arch_opt_value) String(24kf1_1) Value(52) Canonical
+Enum(mips_arch_opt_value) String(4kem) Value(52) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kf1_1) Value(52)
+Enum(mips_arch_opt_value) String(r4kem) Value(52)
EnumValue
-Enum(mips_arch_opt_value) String(24kfx) Value(53) Canonical
+Enum(mips_arch_opt_value) String(4kep) Value(53) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kfx) Value(53)
+Enum(mips_arch_opt_value) String(r4kep) Value(53)
EnumValue
-Enum(mips_arch_opt_value) String(24kx) Value(54) Canonical
+Enum(mips_arch_opt_value) String(4ksd) Value(54) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kx) Value(54)
+Enum(mips_arch_opt_value) String(r4ksd) Value(54)
EnumValue
-Enum(mips_arch_opt_value) String(24kec) Value(55) Canonical
+Enum(mips_arch_opt_value) String(24kc) Value(55) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kec) Value(55)
+Enum(mips_arch_opt_value) String(r24kc) Value(55)
EnumValue
-Enum(mips_arch_opt_value) String(24kef2_1) Value(56) Canonical
+Enum(mips_arch_opt_value) String(24kf2_1) Value(56) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kef2_1) Value(56)
+Enum(mips_arch_opt_value) String(r24kf2_1) Value(56)
EnumValue
-Enum(mips_arch_opt_value) String(24kef) Value(57) Canonical
+Enum(mips_arch_opt_value) String(24kf) Value(57) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kef) Value(57)
+Enum(mips_arch_opt_value) String(r24kf) Value(57)
EnumValue
-Enum(mips_arch_opt_value) String(24kef1_1) Value(58) Canonical
+Enum(mips_arch_opt_value) String(24kf1_1) Value(58) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kef1_1) Value(58)
+Enum(mips_arch_opt_value) String(r24kf1_1) Value(58)
EnumValue
-Enum(mips_arch_opt_value) String(24kefx) Value(59) Canonical
+Enum(mips_arch_opt_value) String(24kfx) Value(59) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kefx) Value(59)
+Enum(mips_arch_opt_value) String(r24kfx) Value(59)
EnumValue
-Enum(mips_arch_opt_value) String(24kex) Value(60) Canonical
+Enum(mips_arch_opt_value) String(24kx) Value(60) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r24kex) Value(60)
+Enum(mips_arch_opt_value) String(r24kx) Value(60)
EnumValue
-Enum(mips_arch_opt_value) String(34kc) Value(61) Canonical
+Enum(mips_arch_opt_value) String(24kec) Value(61) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kc) Value(61)
+Enum(mips_arch_opt_value) String(r24kec) Value(61)
EnumValue
-Enum(mips_arch_opt_value) String(34kf2_1) Value(62) Canonical
+Enum(mips_arch_opt_value) String(24kef2_1) Value(62) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kf2_1) Value(62)
+Enum(mips_arch_opt_value) String(r24kef2_1) Value(62)
EnumValue
-Enum(mips_arch_opt_value) String(34kf) Value(63) Canonical
+Enum(mips_arch_opt_value) String(24kef) Value(63) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kf) Value(63)
+Enum(mips_arch_opt_value) String(r24kef) Value(63)
EnumValue
-Enum(mips_arch_opt_value) String(34kf1_1) Value(64) Canonical
+Enum(mips_arch_opt_value) String(24kef1_1) Value(64) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kf1_1) Value(64)
+Enum(mips_arch_opt_value) String(r24kef1_1) Value(64)
EnumValue
-Enum(mips_arch_opt_value) String(34kfx) Value(65) Canonical
+Enum(mips_arch_opt_value) String(24kefx) Value(65) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kfx) Value(65)
+Enum(mips_arch_opt_value) String(r24kefx) Value(65)
EnumValue
-Enum(mips_arch_opt_value) String(34kx) Value(66) Canonical
+Enum(mips_arch_opt_value) String(24kex) Value(66) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kx) Value(66)
+Enum(mips_arch_opt_value) String(r24kex) Value(66)
EnumValue
-Enum(mips_arch_opt_value) String(34kn) Value(67) Canonical
+Enum(mips_arch_opt_value) String(34kc) Value(67) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r34kn) Value(67)
+Enum(mips_arch_opt_value) String(r34kc) Value(67)
EnumValue
-Enum(mips_arch_opt_value) String(74kc) Value(68) Canonical
+Enum(mips_arch_opt_value) String(34kf2_1) Value(68) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kc) Value(68)
+Enum(mips_arch_opt_value) String(r34kf2_1) Value(68)
EnumValue
-Enum(mips_arch_opt_value) String(74kf2_1) Value(69) Canonical
+Enum(mips_arch_opt_value) String(34kf) Value(69) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kf2_1) Value(69)
+Enum(mips_arch_opt_value) String(r34kf) Value(69)
EnumValue
-Enum(mips_arch_opt_value) String(74kf) Value(70) Canonical
+Enum(mips_arch_opt_value) String(34kf1_1) Value(70) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kf) Value(70)
+Enum(mips_arch_opt_value) String(r34kf1_1) Value(70)
EnumValue
-Enum(mips_arch_opt_value) String(74kf1_1) Value(71) Canonical
+Enum(mips_arch_opt_value) String(34kfx) Value(71) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kf1_1) Value(71)
+Enum(mips_arch_opt_value) String(r34kfx) Value(71)
EnumValue
-Enum(mips_arch_opt_value) String(74kfx) Value(72) Canonical
+Enum(mips_arch_opt_value) String(34kx) Value(72) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kfx) Value(72)
+Enum(mips_arch_opt_value) String(r34kx) Value(72)
EnumValue
-Enum(mips_arch_opt_value) String(74kx) Value(73) Canonical
+Enum(mips_arch_opt_value) String(34kn) Value(73) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kx) Value(73)
+Enum(mips_arch_opt_value) String(r34kn) Value(73)
EnumValue
-Enum(mips_arch_opt_value) String(74kf3_2) Value(74) Canonical
+Enum(mips_arch_opt_value) String(74kc) Value(74) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r74kf3_2) Value(74)
+Enum(mips_arch_opt_value) String(r74kc) Value(74)
EnumValue
-Enum(mips_arch_opt_value) String(1004kc) Value(75) Canonical
+Enum(mips_arch_opt_value) String(74kf2_1) Value(75) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r1004kc) Value(75)
+Enum(mips_arch_opt_value) String(r74kf2_1) Value(75)
EnumValue
-Enum(mips_arch_opt_value) String(1004kf2_1) Value(76) Canonical
+Enum(mips_arch_opt_value) String(74kf) Value(76) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r1004kf2_1) Value(76)
+Enum(mips_arch_opt_value) String(r74kf) Value(76)
EnumValue
-Enum(mips_arch_opt_value) String(1004kf) Value(77) Canonical
+Enum(mips_arch_opt_value) String(74kf1_1) Value(77) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r1004kf) Value(77)
+Enum(mips_arch_opt_value) String(r74kf1_1) Value(77)
EnumValue
-Enum(mips_arch_opt_value) String(1004kf1_1) Value(78) Canonical
+Enum(mips_arch_opt_value) String(74kfx) Value(78) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r1004kf1_1) Value(78)
+Enum(mips_arch_opt_value) String(r74kfx) Value(78)
EnumValue
-Enum(mips_arch_opt_value) String(5kc) Value(79) Canonical
+Enum(mips_arch_opt_value) String(74kx) Value(79) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r5kc) Value(79)
+Enum(mips_arch_opt_value) String(r74kx) Value(79)
EnumValue
-Enum(mips_arch_opt_value) String(5kf) Value(80) Canonical
+Enum(mips_arch_opt_value) String(74kf3_2) Value(80) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r5kf) Value(80)
+Enum(mips_arch_opt_value) String(r74kf3_2) Value(80)
EnumValue
-Enum(mips_arch_opt_value) String(20kc) Value(81) Canonical
+Enum(mips_arch_opt_value) String(1004kc) Value(81) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(r20kc) Value(81)
+Enum(mips_arch_opt_value) String(r1004kc) Value(81)
EnumValue
-Enum(mips_arch_opt_value) String(sb1) Value(82) Canonical
+Enum(mips_arch_opt_value) String(1004kf2_1) Value(82) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(sb1a) Value(83) Canonical
+Enum(mips_arch_opt_value) String(r1004kf2_1) Value(82)
EnumValue
-Enum(mips_arch_opt_value) String(sr71000) Value(84) Canonical
+Enum(mips_arch_opt_value) String(1004kf) Value(83) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(sr71k) Value(84)
+Enum(mips_arch_opt_value) String(r1004kf) Value(83)
EnumValue
-Enum(mips_arch_opt_value) String(xlr) Value(85) Canonical
+Enum(mips_arch_opt_value) String(1004kf1_1) Value(84) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(loongson3a) Value(86) Canonical
+Enum(mips_arch_opt_value) String(r1004kf1_1) Value(84)
EnumValue
-Enum(mips_arch_opt_value) String(octeon) Value(87) Canonical
+Enum(mips_arch_opt_value) String(p5600) Value(85) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon+) Value(88) Canonical
+Enum(mips_arch_opt_value) String(5kc) Value(86) Canonical
EnumValue
-Enum(mips_arch_opt_value) String(octeon2) Value(89) Canonical
+Enum(mips_arch_opt_value) String(r5kc) Value(86)
EnumValue
-Enum(mips_arch_opt_value) String(xlp) Value(90) Canonical
+Enum(mips_arch_opt_value) String(5kf) Value(87) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(r5kf) Value(87)
+
+EnumValue
+Enum(mips_arch_opt_value) String(20kc) Value(88) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(r20kc) Value(88)
+
+EnumValue
+Enum(mips_arch_opt_value) String(sb1) Value(89) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(sb1a) Value(90) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(sr71000) Value(91) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(sr71k) Value(91)
+
+EnumValue
+Enum(mips_arch_opt_value) String(xlr) Value(92) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(loongson3a) Value(93) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(octeon) Value(94) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(octeon+) Value(95) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(octeon2) Value(96) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(octeon3) Value(97) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(xlp) Value(98) Canonical
+
+EnumValue
+Enum(mips_arch_opt_value) String(i6400) Value(99) Canonical
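
For readers unfamiliar with the .opt format: EnumValue records that share a Value are aliases for the same -march= selection, and the entry marked Canonical is the preferred spelling. The following minimal C sketch, an illustration only and not GCC's option machinery, shows the kind of name-to-value lookup these records describe (names and values taken from the table above).

/* Illustration only: a simplified stand-in for the EnumValue mapping above.
   Entries that share a value are aliases; e.g. "r5kf" is accepted as an
   alias of the canonical name "5kf" (both Value 87).  */
#include <stdio.h>
#include <string.h>

struct arch_entry { const char *name; int value; };

static const struct arch_entry arch_table[] = {
  { "5kf", 87 }, { "r5kf", 87 },       /* canonical name and its alias */
  { "octeon3", 97 },
  { "i6400", 99 },
};

static int arch_value (const char *name)
{
  for (size_t i = 0; i < sizeof arch_table / sizeof arch_table[0]; i++)
    if (strcmp (arch_table[i].name, name) == 0)
      return arch_table[i].value;
  return -1;                            /* unknown architecture name */
}

int main (void)
{
  printf ("-march=r5kf -> %d, -march=i6400 -> %d\n",
          arch_value ("r5kf"), arch_value ("i6400"));
  return 0;
}
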
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 143169bc150..524c6d52d93 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -72,6 +72,17 @@ along with GCC; see the file COPYING3. If not see
#include "tree-pass.h"
#include "context.h"
+/* Definitions used in ready queue reordering for the first scheduling pass. */
+
+static int *level = NULL;
+static int *consumer_luid = NULL;
+
+#define LEVEL(INSN) \
+ level[INSN_UID ((INSN))]
+
+#define CONSUMER_LUID(INSN) \
+ consumer_luid[INSN_UID ((INSN))]
+
/* True if X is an UNSPEC wrapper around a SYMBOL_REF or LABEL_REF. */
#define UNSPEC_ADDRESS_P(X) \
(GET_CODE (X) == UNSPEC \
@@ -162,9 +173,11 @@ along with GCC; see the file COPYING3. If not see
#define MIPS_LUI(DEST, VALUE) \
((0xf << 26) | ((DEST) << 16) | (VALUE))
-/* Return the opcode to jump to register DEST. */
+/* Return the opcode to jump to register DEST. When the JR opcode is not
+ available, use JALR $0, DEST. */
#define MIPS_JR(DEST) \
- (((DEST) << 21) | 0x8)
+ (TARGET_CB_ALWAYS ? ((0x1b << 27) | ((DEST) << 16)) \
+ : (((DEST) << 21) | (ISA_HAS_JR ? 0x8 : 0x9)))
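
The new MIPS_JR definition picks between three encodings: classic JR (function code 0x8), JALR $0, DEST (function code 0x9) when JR is unavailable, and a compact-branch form when compact branches are always used. A standalone sketch follows; the configuration tests TARGET_CB_ALWAYS and ISA_HAS_JR are replaced here by plain booleans, purely as an assumption for illustration.

/* Standalone sketch of the encoding choice made by MIPS_JR above.  */
#include <stdio.h>

static unsigned mips_jr_encoding (unsigned dest, int cb_always, int has_jr)
{
  if (cb_always)
    return (0x1bu << 27) | (dest << 16);          /* compact-branch form */
  return (dest << 21) | (has_jr ? 0x8u : 0x9u);   /* JR or JALR $0, DEST */
}

int main (void)
{
  /* jr $31 on pre-R6 is 0x03e00008; jalr $0, $31 on R6 is 0x03e00009.  */
  printf ("%#010x\n", mips_jr_encoding (31, 0, 1));
  printf ("%#010x\n", mips_jr_encoding (31, 0, 0));
  return 0;
}
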
/* Return the opcode for:
@@ -585,6 +598,10 @@ const struct mips_cpu_info *mips_tune_info;
/* The ISA level associated with mips_arch. */
int mips_isa;
+/* The ISA revision level. This is 0 for MIPS I to V and N for
+ MIPS{32,64}rN. */
+int mips_isa_rev;
+
/* The architecture selected by -mipsN, or null if -mipsN wasn't used. */
static const struct mips_cpu_info *mips_isa_option_info;
@@ -648,14 +665,15 @@ static mips_one_only_stub *mips16_set_fcsr_stub;
/* Index R is the smallest register class that contains register R. */
const enum reg_class mips_regno_to_class[FIRST_PSEUDO_REGISTER] = {
- LEA_REGS, LEA_REGS, M16_REGS, V1_REG,
- M16_REGS, M16_REGS, M16_REGS, M16_REGS,
- LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
- LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
- M16_REGS, M16_REGS, LEA_REGS, LEA_REGS,
- LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
- T_REG, PIC_FN_ADDR_REG, LEA_REGS, LEA_REGS,
- LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
+ LEA_REGS, LEA_REGS, M16_STORE_REGS, V1_REG,
+ M16_STORE_REGS, M16_STORE_REGS, M16_STORE_REGS, M16_STORE_REGS,
+ LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
+ LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
+ M16_REGS, M16_STORE_REGS, LEA_REGS, LEA_REGS,
+ LEA_REGS, LEA_REGS, LEA_REGS, LEA_REGS,
+ T_REG, PIC_FN_ADDR_REG, LEA_REGS, LEA_REGS,
+ LEA_REGS, M16_SP_REGS, LEA_REGS, LEA_REGS,
+
FP_REGS, FP_REGS, FP_REGS, FP_REGS,
FP_REGS, FP_REGS, FP_REGS, FP_REGS,
FP_REGS, FP_REGS, FP_REGS, FP_REGS,
@@ -951,6 +969,20 @@ static const struct mips_rtx_cost_data
4, /* branch_cost */
4 /* memory_latency */
},
+ /* Octeon III */
+ {
+ COSTS_N_INSNS (6), /* fp_add */
+ COSTS_N_INSNS (6), /* fp_mult_sf */
+ COSTS_N_INSNS (7), /* fp_mult_df */
+ COSTS_N_INSNS (25), /* fp_div_sf */
+ COSTS_N_INSNS (48), /* fp_div_df */
+ COSTS_N_INSNS (6), /* int_mult_si */
+ COSTS_N_INSNS (6), /* int_mult_di */
+ COSTS_N_INSNS (18), /* int_div_si */
+ COSTS_N_INSNS (35), /* int_div_di */
+ 4, /* branch_cost */
+ 4 /* memory_latency */
+ },
{ /* R3900 */
COSTS_N_INSNS (2), /* fp_add */
COSTS_N_INSNS (4), /* fp_mult_sf */
@@ -1173,13 +1205,66 @@ static const struct mips_rtx_cost_data
COSTS_N_INSNS (68), /* int_div_di */
1, /* branch_cost */
4 /* memory_latency */
+ },
+ { /* P5600 */
+ COSTS_N_INSNS (4), /* fp_add */
+ COSTS_N_INSNS (5), /* fp_mult_sf */
+ COSTS_N_INSNS (5), /* fp_mult_df */
+ COSTS_N_INSNS (17), /* fp_div_sf */
+ COSTS_N_INSNS (17), /* fp_div_df */
+ COSTS_N_INSNS (5), /* int_mult_si */
+ COSTS_N_INSNS (5), /* int_mult_di */
+ COSTS_N_INSNS (8), /* int_div_si */
+ COSTS_N_INSNS (8), /* int_div_di */
+ 2, /* branch_cost */
+ 4 /* memory_latency */
+ },
+ { /* W32 */
+ COSTS_N_INSNS (4), /* fp_add */
+ COSTS_N_INSNS (4), /* fp_mult_sf */
+ COSTS_N_INSNS (5), /* fp_mult_df */
+ COSTS_N_INSNS (17), /* fp_div_sf */
+ COSTS_N_INSNS (32), /* fp_div_df */
+ COSTS_N_INSNS (5), /* int_mult_si */
+ COSTS_N_INSNS (5), /* int_mult_di */
+ COSTS_N_INSNS (41), /* int_div_si */
+ COSTS_N_INSNS (41), /* int_div_di */
+ 1, /* branch_cost */
+ 4 /* memory_latency */
+ },
+ { /* W64 */
+ COSTS_N_INSNS (4), /* fp_add */
+ COSTS_N_INSNS (4), /* fp_mult_sf */
+ COSTS_N_INSNS (5), /* fp_mult_df */
+ COSTS_N_INSNS (17), /* fp_div_sf */
+ COSTS_N_INSNS (32), /* fp_div_df */
+ COSTS_N_INSNS (5), /* int_mult_si */
+ COSTS_N_INSNS (5), /* int_mult_di */
+ COSTS_N_INSNS (41), /* int_div_si */
+ COSTS_N_INSNS (41), /* int_div_di */
+ 1, /* branch_cost */
+ 4 /* memory_latency */
+ },
+ { /* I6400 */
+ COSTS_N_INSNS (4), /* fp_add */
+ COSTS_N_INSNS (5), /* fp_mult_sf */
+ COSTS_N_INSNS (5), /* fp_mult_df */
+ COSTS_N_INSNS (32), /* fp_div_sf */
+ COSTS_N_INSNS (32), /* fp_div_df */
+ COSTS_N_INSNS (5), /* int_mult_si */
+ COSTS_N_INSNS (5), /* int_mult_di */
+ COSTS_N_INSNS (36), /* int_div_si */
+ COSTS_N_INSNS (36), /* int_div_di */
+ 2, /* branch_cost */
+ 4 /* memory_latency */
}
};
static rtx mips_find_pic_call_symbol (rtx, rtx, bool);
-static int mips_register_move_cost (enum machine_mode, reg_class_t,
+static int mips_register_move_cost (machine_mode, reg_class_t,
reg_class_t);
-static unsigned int mips_function_arg_boundary (enum machine_mode, const_tree);
+static unsigned int mips_function_arg_boundary (machine_mode, const_tree);
+static machine_mode mips_get_reg_raw_mode (int regno);
/* This hash table keeps track of implicit "mips16" and "nomips16" attributes
for -mflip_mips16. It maps decl names onto a boolean mode setting. */
@@ -1607,7 +1692,7 @@ mips_build_integer (struct mips_integer_op *codes,
/* Implement TARGET_LEGITIMATE_CONSTANT_P. */
static bool
-mips_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+mips_legitimate_constant_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
return mips_const_insns (x) > 0;
}
@@ -1777,11 +1862,119 @@ mips_symbol_binds_local_p (const_rtx x)
: SYMBOL_REF_LOCAL_P (x));
}
+bool
+mips_const_vector_bitimm_set_p (rtx op, machine_mode mode)
+{
+ if (GET_CODE (op) == CONST_VECTOR && op != const0_rtx)
+ {
+ rtx elt0 = CONST_VECTOR_ELT (op, 0);
+ HOST_WIDE_INT val = INTVAL (elt0);
+ int vlog2 = exact_log2 (val);
+
+ if (vlog2 != -1)
+ {
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+ if (!(0 <= vlog2 && vlog2 <= GET_MODE_UNIT_SIZE (mode) - 1))
+ return false;
+
+ return mips_const_vector_same_int_p (op, mode, 0, val);
+ }
+ }
+
+ return false;
+}
+
+bool
+mips_const_vector_bitimm_clr_p (rtx op, machine_mode mode)
+{
+ if (GET_CODE (op) == CONST_VECTOR && op != constm1_rtx)
+ {
+ rtx elt0 = CONST_VECTOR_ELT (op, 0);
+ HOST_WIDE_INT val = INTVAL (elt0);
+ int vlog2 = exact_log2 (~val);
+
+ if (vlog2 != -1)
+ {
+ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+ if (!(0 <= vlog2 && vlog2 <= GET_MODE_UNIT_SIZE (mode) - 1))
+ return false;
+
+ return mips_const_vector_same_val_p (op, mode);
+ }
+ }
+
+ return false;
+}
+
+/* Return true if OP is a constant vector with the number of units in MODE,
+ and each unit has the same value. */
+
+bool
+mips_const_vector_same_val_p (rtx op, machine_mode mode)
+{
+ int i, nunits = GET_MODE_NUNITS (mode);
+ rtx first;
+
+ if (GET_CODE (op) != CONST_VECTOR || GET_MODE (op) != mode)
+ return false;
+
+ first = CONST_VECTOR_ELT (op, 0);
+ for (i = 1; i < nunits; i++)
+ if (!rtx_equal_p (first, CONST_VECTOR_ELT (op, i)))
+ return false;
+
+ return true;
+}
+
+/* Return true if OP is a constant vector with the number of units in MODE,
+ and every unit has the same byte value. */
+
+bool
+mips_const_vector_same_byte_p (rtx op, machine_mode mode)
+{
+ int i, nunits = GET_MODE_NUNITS (mode);
+ rtx first;
+
+ gcc_assert (mode == V16QImode);
+
+ if (GET_CODE (op) != CONST_VECTOR || GET_MODE (op) != mode)
+ return false;
+
+ first = CONST_VECTOR_ELT (op, 0);
+ for (i = 1; i < nunits; i++)
+ if (!rtx_equal_p (first, CONST_VECTOR_ELT (op, i)))
+ return false;
+
+ /* It's an 8-bit mode; we don't care whether the value is signed or unsigned. */
+ return true;
+}
+
+/* Return true if OP is a constant vector with the number of units in MODE,
+ and each unit has the same integer value in the range [LOW, HIGH]. */
+
+bool
+mips_const_vector_same_int_p (rtx op, machine_mode mode, HOST_WIDE_INT low,
+ HOST_WIDE_INT high)
+{
+ HOST_WIDE_INT value;
+ rtx elem0;
+
+ if (!mips_const_vector_same_val_p (op, mode))
+ return false;
+
+ elem0 = CONST_VECTOR_ELT (op, 0);
+ if (!CONST_INT_P (elem0))
+ return false;
+
+ value = INTVAL (elem0);
+ return (value >= low && value <= high);
+}
+
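
Stripped of the rtx plumbing, the predicates above reduce to simple scalar checks on the replicated element: all lanes equal, the value within a signed range such as [-512, 511], or the value (or its complement) having a single bit set. A hedged standalone sketch of those scalar checks, outside GCC's rtl representation:

/* Scalar illustration of the checks performed by the MSA const-vector
   predicates above; not GCC code, purely for illustration.  */
#include <stdbool.h>
#include <stdio.h>

/* Does VAL fit the signed immediate range accepted by an LDI-style set?  */
static bool replicated_value_in_range (long long val, long long low,
                                       long long high)
{
  return val >= low && val <= high;
}

/* Is VAL a single set bit, as required for the bit-set immediate form?  */
static bool single_bit_set (unsigned long long val)
{
  return val != 0 && (val & (val - 1)) == 0;
}

int main (void)
{
  printf ("%d %d\n", replicated_value_in_range (-512, -512, 511),
          single_bit_set (64));
  return 0;
}
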
/* Return true if rtx constants of mode MODE should be put into a small
data section. */
static bool
-mips_rtx_constant_in_small_data_p (enum machine_mode mode)
+mips_rtx_constant_in_small_data_p (machine_mode mode)
{
return (!TARGET_EMBEDDED_DATA
&& TARGET_LOCAL_SDATA
@@ -2034,7 +2227,7 @@ mips_symbolic_constant_p (rtx x, enum mips_symbol_context context,
extended ones. */
static int
-mips_symbol_insns_1 (enum mips_symbol_type type, enum machine_mode mode)
+mips_symbol_insns_1 (enum mips_symbol_type type, machine_mode mode)
{
if (mips_use_pcrel_pool_p[(int) type])
{
@@ -2146,8 +2339,13 @@ mips_symbol_insns_1 (enum mips_symbol_type type, enum machine_mode mode)
In both cases, instruction counts are based off BASE_INSN_LENGTH. */
static int
-mips_symbol_insns (enum mips_symbol_type type, enum machine_mode mode)
+mips_symbol_insns (enum mips_symbol_type type, machine_mode mode)
{
+ /* MSA LD.* and ST.* cannot support loading symbols via an immediate
+ operand. */
+ if (MSA_SUPPORTED_MODE_P (mode))
+ return 0;
+
return mips_symbol_insns_1 (type, mode) * (TARGET_MIPS16 ? 2 : 1);
}
@@ -2163,7 +2361,7 @@ mips_tls_symbol_ref_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */
static bool
-mips_cannot_force_const_mem (enum machine_mode mode, rtx x)
+mips_cannot_force_const_mem (machine_mode mode, rtx x)
{
enum mips_symbol_type type;
rtx base, offset;
@@ -2213,7 +2411,7 @@ mips_cannot_force_const_mem (enum machine_mode mode, rtx x)
constants when we're using a per-function constant pool. */
static bool
-mips_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
+mips_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
const_rtx x ATTRIBUTE_UNUSED)
{
return !TARGET_MIPS16_PCREL_LOADS;
@@ -2223,7 +2421,7 @@ mips_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
STRICT_P is true if REG_OK_STRICT is in effect. */
int
-mips_regno_mode_ok_for_base_p (int regno, enum machine_mode mode,
+mips_regno_mode_ok_for_base_p (int regno, machine_mode mode,
bool strict_p)
{
if (!HARD_REGISTER_NUM_P (regno))
@@ -2240,22 +2438,9 @@ mips_regno_mode_ok_for_base_p (int regno, enum machine_mode mode,
return true;
/* In MIPS16 mode, the stack pointer can only address word and doubleword
- values, nothing smaller. There are two problems here:
-
- (a) Instantiating virtual registers can introduce new uses of the
- stack pointer. If these virtual registers are valid addresses,
- the stack pointer should be too.
-
- (b) Most uses of the stack pointer are not made explicit until
- FRAME_POINTER_REGNUM and ARG_POINTER_REGNUM have been eliminated.
- We don't know until that stage whether we'll be eliminating to the
- stack pointer (which needs the restriction) or the hard frame
- pointer (which doesn't).
-
- All in all, it seems more consistent to only enforce this restriction
- during and after reload. */
+ values, nothing smaller. */
if (TARGET_MIPS16 && regno == STACK_POINTER_REGNUM)
- return !strict_p || GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8;
+ return GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8;
return TARGET_MIPS16 ? M16_REG_P (regno) : GP_REG_P (regno);
}
@@ -2264,7 +2449,7 @@ mips_regno_mode_ok_for_base_p (int regno, enum machine_mode mode,
STRICT_P is true if REG_OK_STRICT is in effect. */
static bool
-mips_valid_base_register_p (rtx x, enum machine_mode mode, bool strict_p)
+mips_valid_base_register_p (rtx x, machine_mode mode, bool strict_p)
{
if (!strict_p && GET_CODE (x) == SUBREG)
x = SUBREG_REG (x);
@@ -2277,7 +2462,7 @@ mips_valid_base_register_p (rtx x, enum machine_mode mode, bool strict_p)
can address a value of mode MODE. */
static bool
-mips_valid_offset_p (rtx x, enum machine_mode mode)
+mips_valid_offset_p (rtx x, machine_mode mode)
{
/* Check that X is a signed 16-bit number. */
if (!const_arith_operand (x, Pmode))
@@ -2289,6 +2474,12 @@ mips_valid_offset_p (rtx x, enum machine_mode mode)
&& !SMALL_OPERAND (INTVAL (x) + GET_MODE_SIZE (mode) - UNITS_PER_WORD))
return false;
+ /* MSA LD.* and ST.* support 10-bit signed offsets. */
+ if (MSA_SUPPORTED_MODE_P (mode)
+ && !mips_signed_immediate_p (INTVAL (x), 10,
+ mips_ldst_scaled_shift (mode)))
+ return false;
+
return true;
}
@@ -2296,7 +2487,7 @@ mips_valid_offset_p (rtx x, enum machine_mode mode)
LO_SUM symbol has type SYMBOL_TYPE. */
static bool
-mips_valid_lo_sum_p (enum mips_symbol_type symbol_type, enum machine_mode mode)
+mips_valid_lo_sum_p (enum mips_symbol_type symbol_type, machine_mode mode)
{
/* Check that symbols of type SYMBOL_TYPE can be used to access values
of mode MODE. */
@@ -2315,6 +2506,10 @@ mips_valid_lo_sum_p (enum mips_symbol_type symbol_type, enum machine_mode mode)
&& GET_MODE_BITSIZE (mode) > GET_MODE_ALIGNMENT (mode))
return false;
+ /* MSA LD.* and ST.* cannot support loading symbols via %lo($base). */
+ if (MSA_SUPPORTED_MODE_P (mode))
+ return false;
+
return true;
}
@@ -2324,7 +2519,7 @@ mips_valid_lo_sum_p (enum mips_symbol_type symbol_type, enum machine_mode mode)
static bool
mips_classify_address (struct mips_address_info *info, rtx x,
- enum machine_mode mode, bool strict_p)
+ machine_mode mode, bool strict_p)
{
switch (GET_CODE (x))
{
@@ -2383,7 +2578,7 @@ mips_classify_address (struct mips_address_info *info, rtx x,
/* Implement TARGET_LEGITIMATE_ADDRESS_P. */
static bool
-mips_legitimate_address_p (enum machine_mode mode, rtx x, bool strict_p)
+mips_legitimate_address_p (machine_mode mode, rtx x, bool strict_p)
{
struct mips_address_info addr;
@@ -2393,7 +2588,7 @@ mips_legitimate_address_p (enum machine_mode mode, rtx x, bool strict_p)
/* Return true if X is a legitimate $sp-based address for mode MODE. */
bool
-mips_stack_address_p (rtx x, enum machine_mode mode)
+mips_stack_address_p (rtx x, machine_mode mode)
{
struct mips_address_info addr;
@@ -2430,7 +2625,7 @@ mips_lwxs_address_p (rtx addr)
sense, because their use is so restricted. */
static bool
-mips_lx_address_p (rtx addr, enum machine_mode mode)
+mips_lx_address_p (rtx addr, machine_mode mode)
{
if (GET_CODE (addr) != PLUS
|| !REG_P (XEXP (addr, 0))
@@ -2444,6 +2639,8 @@ mips_lx_address_p (rtx addr, enum machine_mode mode)
return true;
if (ISA_HAS_LDX && mode == DImode)
return true;
+ if (MSA_SUPPORTED_MODE_P (mode))
+ return true;
return false;
}
@@ -2457,7 +2654,7 @@ mips_lx_address_p (rtx addr, enum machine_mode mode)
an 8-bit immediate field that's shifted left twice. */
static bool
-mips16_unextended_reference_p (enum machine_mode mode, rtx base,
+mips16_unextended_reference_p (machine_mode mode, rtx base,
unsigned HOST_WIDE_INT offset)
{
if (mode != BLKmode && offset % GET_MODE_SIZE (mode) == 0)
@@ -2477,10 +2674,11 @@ mips16_unextended_reference_p (enum machine_mode mode, rtx base,
enough. */
int
-mips_address_insns (rtx x, enum machine_mode mode, bool might_split_p)
+mips_address_insns (rtx x, machine_mode mode, bool might_split_p)
{
struct mips_address_info addr;
int factor;
+ bool msa_p = (!might_split_p && MSA_SUPPORTED_MODE_P (mode));
/* BLKmode is used for single unaligned loads and stores and should
not count as a multiword mode. (GET_MODE_SIZE (BLKmode) is pretty
@@ -2495,6 +2693,15 @@ mips_address_insns (rtx x, enum machine_mode mode, bool might_split_p)
switch (addr.type)
{
case ADDRESS_REG:
+ if (msa_p)
+ {
+ /* MSA LD.* and ST.* support 10-bit signed offsets. */
+ if (mips_signed_immediate_p (INTVAL (addr.offset), 10,
+ mips_ldst_scaled_shift (mode)))
+ return 1;
+ else
+ return 0;
+ }
if (TARGET_MIPS16
&& !mips16_unextended_reference_p (mode, addr.reg,
UINTVAL (addr.offset)))
@@ -2502,13 +2709,13 @@ mips_address_insns (rtx x, enum machine_mode mode, bool might_split_p)
return factor;
case ADDRESS_LO_SUM:
- return TARGET_MIPS16 ? factor * 2 : factor;
+ return msa_p ? 0 : TARGET_MIPS16 ? factor * 2 : factor;
case ADDRESS_CONST_INT:
- return factor;
+ return msa_p ? 0 : factor;
case ADDRESS_SYMBOLIC:
- return factor * mips_symbol_insns (addr.symbol_type, mode);
+ return msa_p ? 0 : factor * mips_symbol_insns (addr.symbol_type, mode);
}
return 0;
}
@@ -2532,12 +2739,25 @@ mips_signed_immediate_p (unsigned HOST_WIDE_INT x, int bits, int shift = 0)
return mips_unsigned_immediate_p (x, bits, shift);
}
+/* Return the scale shift applied to an MSA LD/ST address offset. */
+
+int
+mips_ldst_scaled_shift (machine_mode mode)
+{
+ int shift = exact_log2 (GET_MODE_UNIT_SIZE (mode));
+
+ if (shift < 0 || shift > 8)
+ gcc_unreachable ();
+
+ return shift;
+}
+
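
MSA LD.* and ST.* encode a signed 10-bit offset that is scaled by the element size, so a byte offset is only encodable if it is element-aligned and the scaled value fits in [-512, 511]. The sketch below mirrors that check outside GCC; it is an illustration, not the mips_signed_immediate_p helper itself.

/* Standalone sketch of the MSA load/store offset check used above.  */
#include <stdbool.h>
#include <stdio.h>

static bool msa_ldst_offset_ok (long long offset, int unit_size)
{
  int shift = __builtin_ctz (unit_size);      /* log2 of 1, 2, 4 or 8 */
  if (offset & ((1 << shift) - 1))            /* must be element-aligned */
    return false;
  long long scaled = offset >> shift;
  return scaled >= -512 && scaled <= 511;     /* signed 10-bit range */
}

int main (void)
{
  printf ("%d %d\n", msa_ldst_offset_ok (4088, 8),   /* 511 * 8: encodable */
          msa_ldst_offset_ok (4096, 8));             /* 512 * 8: out of range */
  return 0;
}
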
/* Return true if X is legitimate for accessing values of mode MODE,
if it is based on a MIPS16 register, and if the offset satisfies
OFFSET_PREDICATE. */
bool
-m16_based_address_p (rtx x, enum machine_mode mode,
+m16_based_address_p (rtx x, machine_mode mode,
insn_operand_predicate_fn offset_predicate)
{
struct mips_address_info addr;
@@ -2552,7 +2772,7 @@ m16_based_address_p (rtx x, enum machine_mode mode,
for a microMIPS LWSP or SWSP insn. */
bool
-lwsp_swsp_address_p (rtx x, enum machine_mode mode)
+lwsp_swsp_address_p (rtx x, machine_mode mode)
{
struct mips_address_info addr;
@@ -2566,7 +2786,7 @@ lwsp_swsp_address_p (rtx x, enum machine_mode mode)
MODE is the mode of the value being accessed. */
bool
-umips_12bit_offset_address_p (rtx x, enum machine_mode mode)
+umips_12bit_offset_address_p (rtx x, machine_mode mode)
{
struct mips_address_info addr;
@@ -2576,6 +2796,20 @@ umips_12bit_offset_address_p (rtx x, enum machine_mode mode)
&& UMIPS_12BIT_OFFSET_P (INTVAL (addr.offset)));
}
+/* Return true if X is a legitimate address with a 9-bit offset.
+ MODE is the mode of the value being accessed. */
+
+bool
+mips_9bit_offset_address_p (rtx x, machine_mode mode)
+{
+ struct mips_address_info addr;
+
+ return (mips_classify_address (&addr, x, mode, false)
+ && addr.type == ADDRESS_REG
+ && CONST_INT_P (addr.offset)
+ && MIPS_9BIT_OFFSET_P (INTVAL (addr.offset)));
+}
+
/* Return the number of instructions needed to load constant X,
assuming that BASE_INSN_LENGTH is the length of one instruction.
Return 0 if X isn't a valid constant. */
@@ -2613,8 +2847,12 @@ mips_const_insns (rtx x)
return mips_build_integer (codes, INTVAL (x));
- case CONST_DOUBLE:
case CONST_VECTOR:
+ if (TARGET_MSA
+ && mips_const_vector_same_int_p (x, GET_MODE (x), -512, 511))
+ return 1;
+ /* Fall through. */
+ case CONST_DOUBLE:
/* Allow zeros for normal mode, where we can use $0. */
return !TARGET_MIPS16 && x == CONST0_RTX (GET_MODE (x)) ? 1 : 0;
@@ -2674,6 +2912,26 @@ mips_split_const_insns (rtx x)
return low + high;
}
+/* Return one word of the 128-bit value OP, taking into account the fixed
+ endianness of certain registers. BYTE is the byte offset of the word
+ to return. */
+
+rtx
+mips_subword_at_byte (rtx op, unsigned int byte)
+{
+ machine_mode mode;
+
+ mode = GET_MODE (op);
+ if (mode == VOIDmode)
+ mode = TImode;
+
+ gcc_assert (!FP_REG_RTX_P (op));
+
+ if (MEM_P (op))
+ return mips_rewrite_small_data (adjust_address (op, word_mode, byte));
+
+ return simplify_gen_subreg (word_mode, op, mode, byte);
+}
+
/* Return the number of instructions needed to implement INSN,
given that it loads from or stores to MEM. Assume that
BASE_INSN_LENGTH is the length of one instruction. */
@@ -2681,7 +2939,7 @@ mips_split_const_insns (rtx x)
int
mips_load_store_insns (rtx mem, rtx insn)
{
- enum machine_mode mode;
+ machine_mode mode;
bool might_split_p;
rtx set;
@@ -2721,6 +2979,17 @@ mips_idiv_insns (void)
count++;
return count;
}
+
+/* Return the number of instructions needed for an MSA integer division. */
+
+int
+mips_msa_idiv_insns (void)
+{
+ if (TARGET_CHECK_ZERO_DIV)
+ return 3;
+ else
+ return 1;
+}
/* Emit a move from SRC to DEST. Assume that the move expanders can
handle all moves if !can_create_pseudo_p (). The distinction is
@@ -2761,7 +3030,7 @@ mips_emit_unary (enum rtx_code code, rtx target, rtx op0)
Return that new register. */
static rtx
-mips_force_unary (enum machine_mode mode, enum rtx_code code, rtx op0)
+mips_force_unary (machine_mode mode, enum rtx_code code, rtx op0)
{
rtx reg;
@@ -2783,7 +3052,7 @@ mips_emit_binary (enum rtx_code code, rtx target, rtx op0, rtx op1)
of mode MODE. Return that new register. */
static rtx
-mips_force_binary (enum machine_mode mode, enum rtx_code code, rtx op0, rtx op1)
+mips_force_binary (machine_mode mode, enum rtx_code code, rtx op0, rtx op1)
{
rtx reg;
@@ -3031,7 +3300,7 @@ mips_got_load (rtx temp, rtx addr, enum mips_symbol_type type)
is guaranteed to be a legitimate address for mode MODE. */
bool
-mips_split_symbol (rtx temp, rtx addr, enum machine_mode mode, rtx *low_out)
+mips_split_symbol (rtx temp, rtx addr, machine_mode mode, rtx *low_out)
{
enum mips_symbol_context context;
enum mips_symbol_type symbol_type;
@@ -3291,7 +3560,7 @@ mips16_expand_set_fcsr (rtx newval)
/* If X is not a valid address for mode MODE, force it into a register. */
static rtx
-mips_force_address (rtx x, enum machine_mode mode)
+mips_force_address (rtx x, machine_mode mode)
{
if (!mips_legitimate_address_p (mode, x, false))
x = force_reg (Pmode, x);
@@ -3305,7 +3574,7 @@ mips_force_address (rtx x, enum machine_mode mode)
static rtx
mips_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
- enum machine_mode mode)
+ machine_mode mode)
{
rtx base, addr;
HOST_WIDE_INT offset;
@@ -3336,7 +3605,7 @@ void
mips_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
{
struct mips_integer_op codes[MIPS_MAX_INTEGER_OPS];
- enum machine_mode mode;
+ machine_mode mode;
unsigned int i, num_ops;
rtx x;
@@ -3366,7 +3635,7 @@ mips_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
move_operand. */
static void
-mips_legitimize_const_move (enum machine_mode mode, rtx dest, rtx src)
+mips_legitimize_const_move (machine_mode mode, rtx dest, rtx src)
{
rtx base, offset;
@@ -3417,12 +3686,16 @@ mips_legitimize_const_move (enum machine_mode mode, rtx dest, rtx src)
sequence that is valid. */
bool
-mips_legitimize_move (enum machine_mode mode, rtx dest, rtx src)
+mips_legitimize_move (machine_mode mode, rtx dest, rtx src)
{
- if (!register_operand (dest, mode) && !reg_or_0_operand (src, mode))
+ if (!register_operand (dest, mode) && !register_operand (src, mode))
{
- mips_emit_move (dest, force_reg (mode, src));
- return true;
+ if (TARGET_MIPS16 || !const_0_operand (src, mode)
+ || MSA_SUPPORTED_MODE_P (mode))
+ {
+ mips_emit_move (dest, force_reg (mode, src));
+ return true;
+ }
}
/* We need to deal with constants that would be legitimate
@@ -3663,7 +3936,7 @@ mips_binary_cost (rtx x, int single_cost, int double_cost, bool speed)
/* Return the cost of floating-point multiplications of mode MODE. */
static int
-mips_fp_mult_cost (enum machine_mode mode)
+mips_fp_mult_cost (machine_mode mode)
{
return mode == DFmode ? mips_cost->fp_mult_df : mips_cost->fp_mult_sf;
}
@@ -3671,7 +3944,7 @@ mips_fp_mult_cost (enum machine_mode mode)
/* Return the cost of floating-point divisions of mode MODE. */
static int
-mips_fp_div_cost (enum machine_mode mode)
+mips_fp_div_cost (machine_mode mode)
{
return mode == DFmode ? mips_cost->fp_div_df : mips_cost->fp_div_sf;
}
@@ -3680,7 +3953,7 @@ mips_fp_div_cost (enum machine_mode mode)
cost of OP itself. */
static int
-mips_sign_extend_cost (enum machine_mode mode, rtx op)
+mips_sign_extend_cost (machine_mode mode, rtx op)
{
if (MEM_P (op))
/* Extended loads are as cheap as unextended ones. */
@@ -3702,7 +3975,7 @@ mips_sign_extend_cost (enum machine_mode mode, rtx op)
cost of OP itself. */
static int
-mips_zero_extend_cost (enum machine_mode mode, rtx op)
+mips_zero_extend_cost (machine_mode mode, rtx op)
{
if (MEM_P (op))
/* Extended loads are as cheap as unextended ones. */
@@ -3728,7 +4001,7 @@ mips_zero_extend_cost (enum machine_mode mode, rtx op)
assuming that the move will be in pieces of at most UNITS bytes. */
static int
-mips_set_reg_reg_piece_cost (enum machine_mode mode, unsigned int units)
+mips_set_reg_reg_piece_cost (machine_mode mode, unsigned int units)
{
return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
}
@@ -3736,7 +4009,7 @@ mips_set_reg_reg_piece_cost (enum machine_mode mode, unsigned int units)
/* Return the cost of moving between two registers of mode MODE. */
static int
-mips_set_reg_reg_cost (enum machine_mode mode)
+mips_set_reg_reg_cost (machine_mode mode)
{
switch (GET_MODE_CLASS (mode))
{
@@ -3761,7 +4034,7 @@ static bool
mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
int *total, bool speed)
{
- enum machine_mode mode = GET_MODE (x);
+ machine_mode mode = GET_MODE (x);
bool float_mode_p = FLOAT_MODE_P (mode);
int cost;
rtx addr;
@@ -3980,6 +4253,10 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
case NE:
case UNORDERED:
case LTGT:
+ case UNGE:
+ case UNGT:
+ case UNLE:
+ case UNLT:
/* Branch comparisons have VOIDmode, so use the first operand's
mode instead. */
mode = GET_MODE (XEXP (x, 0));
@@ -4036,6 +4313,22 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
return false;
}
+ /* If this is an add + mult (which is equivalent to a shift left) and
+ its immediate operand satisfies the const_immlsa_operand predicate. */
+ if (((ISA_HAS_LSA && mode == SImode)
+ || (ISA_HAS_DLSA && mode == DImode))
+ && GET_CODE (XEXP (x, 0)) == MULT)
+ {
+ rtx op2 = XEXP (XEXP (x, 0), 1);
+ if (const_immlsa_operand (op2, mode))
+ {
+ *total = (COSTS_N_INSNS (1)
+ + set_src_cost (XEXP (XEXP (x, 0), 0), speed)
+ + set_src_cost (XEXP (x, 1), speed));
+ return true;
+ }
+ }
+
/* Double-word operations require three single-word operations and
an SLTU. The MIPS16 version then needs to move the result of
the SLTU from $24 to a MIPS16 register. */
@@ -4071,6 +4364,11 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
*total = COSTS_N_INSNS (GET_MODE_SIZE (mode) > UNITS_PER_WORD ? 4 : 1);
return false;
+ case FMA:
+ if (ISA_HAS_FP_MADDF_MSUBF)
+ *total = mips_fp_mult_cost (mode);
+ return false;
+
case MULT:
if (float_mode_p)
*total = mips_fp_mult_cost (mode);
@@ -4081,7 +4379,7 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
? mips_cost->int_mult_si * 3 + 6
: COSTS_N_INSNS (ISA_HAS_MUL3 ? 7 : 9));
else if (!speed)
- *total = COSTS_N_INSNS (ISA_HAS_MUL3 ? 1 : 2) + 1;
+ *total = COSTS_N_INSNS ((ISA_HAS_MUL3 || ISA_HAS_R6MUL) ? 1 : 2) + 1;
else if (mode == DImode)
*total = mips_cost->int_mult_di;
else
@@ -4133,6 +4431,10 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
return true;
}
*total = COSTS_N_INSNS (mips_idiv_insns ());
+ if (MSA_SUPPORTED_MODE_P (mode))
+ *total = COSTS_N_INSNS (mips_msa_idiv_insns ());
+ else
+ *total = COSTS_N_INSNS (mips_idiv_insns ());
}
else if (mode == DImode)
*total = mips_cost->int_div_di;
@@ -4157,6 +4459,52 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
}
*total = mips_zero_extend_cost (mode, XEXP (x, 0));
return false;
+ case TRUNCATE:
+ /* Costings for highpart multiplies. Matching patterns of the form:
+
+ (lshiftrt:DI (mult:DI (sign_extend:DI (...))
+ (sign_extend:DI (...)))
+ (const_int 32))
+ */
+ if (ISA_HAS_R6MUL
+ && (GET_CODE (XEXP (x, 0)) == ASHIFTRT
+ || GET_CODE (XEXP (x, 0)) == LSHIFTRT)
+ && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+ && ((INTVAL (XEXP (XEXP (x, 0), 1)) == 32
+ && GET_MODE (XEXP (x, 0)) == DImode)
+ || (ISA_HAS_R6DMUL
+ && INTVAL (XEXP (XEXP (x, 0), 1)) == 64
+ && GET_MODE (XEXP (x, 0)) == TImode))
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+ && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
+ && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND)
+ || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
+ && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))
+ == ZERO_EXTEND))))
+ {
+ if (!speed)
+ *total = COSTS_N_INSNS (1) + 1;
+ else if (mode == DImode)
+ *total = mips_cost->int_mult_di;
+ else
+ *total = mips_cost->int_mult_si;
+
+ /* Sign extension is free; zero extension has a cost for DImode when
+ on a 64-bit core, i.e. when DMUL is present. */
+ for (int i = 0; i < 2; ++i)
+ {
+ rtx op = XEXP (XEXP (XEXP (x, 0), 0), i);
+ if (ISA_HAS_R6DMUL
+ && GET_CODE (op) == ZERO_EXTEND
+ && GET_MODE (op) == DImode)
+ *total += rtx_cost (op, MULT, i, speed);
+ else
+ *total += rtx_cost (XEXP (op, 0), GET_CODE (op), 0, speed);
+ }
+
+ return true;
+ }
+ return false;
case FLOAT:
case UNSIGNED_FLOAT:
@@ -4183,7 +4531,7 @@ mips_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
/* Implement TARGET_ADDRESS_COST. */
static int
-mips_address_cost (rtx addr, enum machine_mode mode,
+mips_address_cost (rtx addr, machine_mode mode,
addr_space_t as ATTRIBUTE_UNUSED,
bool speed ATTRIBUTE_UNUSED)
{
@@ -4318,7 +4666,7 @@ rtx
mips_subword (rtx op, bool high_p)
{
unsigned int byte, offset;
- enum machine_mode mode;
+ machine_mode mode;
mode = GET_MODE (op);
if (mode == VOIDmode)
@@ -4383,6 +4731,10 @@ mips_split_move_p (rtx dest, rtx src, enum mips_split_type split_type)
return false;
}
+ /* Check if MSA moves need splitting. */
+ if (MSA_SUPPORTED_MODE_P (GET_MODE (dest)))
+ return mips_split_128bit_move_p (dest, src);
+
/* Otherwise split all multiword moves. */
return size > UNITS_PER_WORD;
}
@@ -4396,7 +4748,9 @@ mips_split_move (rtx dest, rtx src, enum mips_split_type split_type)
rtx low_dest;
gcc_checking_assert (mips_split_move_p (dest, src, split_type));
- if (FP_REG_RTX_P (dest) || FP_REG_RTX_P (src))
+ if (MSA_SUPPORTED_MODE_P (GET_MODE (dest)))
+ mips_split_128bit_move (dest, src);
+ else if (FP_REG_RTX_P (dest) || FP_REG_RTX_P (src))
{
if (!TARGET_64BIT && GET_MODE (dest) == DImode)
emit_insn (gen_move_doubleword_fprdi (dest, src));
@@ -4469,6 +4823,195 @@ mips_insn_split_type (rtx insn)
return SPLIT_IF_NECESSARY;
}
+/* Return true if a 128-bit move from SRC to DEST should be split. */
+
+bool
+mips_split_128bit_move_p (rtx dest, rtx src)
+{
+ /* MSA-to-MSA moves can be done in a single instruction. */
+ if (FP_REG_RTX_P (src) && FP_REG_RTX_P (dest))
+ return false;
+
+ /* Check for MSA loads and stores. */
+ if (FP_REG_RTX_P (dest) && MEM_P (src))
+ return false;
+ if (FP_REG_RTX_P (src) && MEM_P (dest))
+ return false;
+
+ /* Check for an MSA set from an immediate const vector with a valid
+ replicated element. */
+ if (FP_REG_RTX_P (dest)
+ && mips_const_vector_same_int_p (src, GET_MODE (src), -512, 511))
+ return false;
+
+ return true;
+}
+
+/* Split a 128-bit move from SRC to DEST. */
+
+void
+mips_split_128bit_move (rtx dest, rtx src)
+{
+ int byte, index;
+ rtx low_dest, low_src, d, s;
+
+ if (FP_REG_RTX_P (dest))
+ {
+ gcc_assert (!MEM_P (src));
+
+ rtx new_dest = dest;
+ if (!TARGET_64BIT)
+ {
+ if (GET_MODE (dest) != V4SImode)
+ new_dest = simplify_gen_subreg (V4SImode, dest, GET_MODE (dest), 0);
+ }
+ else
+ {
+ if (GET_MODE (dest) != V2DImode)
+ new_dest = simplify_gen_subreg (V2DImode, dest, GET_MODE (dest), 0);
+ }
+
+ for (byte = 0, index = 0; byte < GET_MODE_SIZE (TImode);
+ byte += UNITS_PER_WORD, index++)
+ {
+ s = mips_subword_at_byte (src, byte);
+ if (!TARGET_64BIT)
+ emit_insn (gen_msa_insert_w (new_dest, new_dest, GEN_INT (index),
+ s));
+ else
+ emit_insn (gen_msa_insert_d (new_dest, new_dest, GEN_INT (index),
+ s));
+ }
+ }
+ else if (FP_REG_RTX_P (src))
+ {
+ gcc_assert (!MEM_P (dest));
+
+ rtx new_src = src;
+ if (!TARGET_64BIT)
+ {
+ if (GET_MODE (src) != V4SImode)
+ new_src = simplify_gen_subreg (V4SImode, src, GET_MODE (src), 0);
+ }
+ else
+ {
+ if (GET_MODE (src) != V2DImode)
+ new_src = simplify_gen_subreg (V2DImode, src, GET_MODE (src), 0);
+ }
+
+ for (byte = 0, index = 0; byte < GET_MODE_SIZE (TImode);
+ byte += UNITS_PER_WORD, index++)
+ {
+ d = mips_subword_at_byte (dest, byte);
+ if (!TARGET_64BIT)
+ emit_insn (gen_msa_copy_s_w (d, new_src, GEN_INT (index)));
+ else
+ emit_insn (gen_msa_copy_s_d (d, new_src, GEN_INT (index)));
+ }
+ }
+ else
+ {
+ low_dest = mips_subword_at_byte (dest, 0);
+ low_src = mips_subword_at_byte (src, 0);
+ gcc_assert (REG_P (low_dest) && REG_P (low_src));
+ /* Make sure the source register is not written before reading. */
+ if (REGNO (low_dest) <= REGNO (low_src))
+ {
+ for (byte = 0; byte < GET_MODE_SIZE (TImode);
+ byte += UNITS_PER_WORD)
+ {
+ d = mips_subword_at_byte (dest, byte);
+ s = mips_subword_at_byte (src, byte);
+ mips_emit_move (d, s);
+ }
+ }
+ else
+ {
+ for (byte = GET_MODE_SIZE (TImode) - UNITS_PER_WORD; byte >= 0;
+ byte -= UNITS_PER_WORD)
+ {
+ d = mips_subword_at_byte (dest, byte);
+ s = mips_subword_at_byte (src, byte);
+ mips_emit_move (d, s);
+ }
+ }
+ }
+}
+
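
The GPR-to-GPR fallback above orders the word copies so that no source word is clobbered before it is read: forwards when the destination starts at or below the source, backwards otherwise. A small standalone sketch of that ordering, using a plain array as a stand-in for the register file (an assumption for illustration):

/* Overlap-safe word-by-word copy, mirroring the ordering chosen above.  */
#include <stdio.h>

static void copy_words (unsigned *regs, int dest, int src, int nwords)
{
  if (dest <= src)
    for (int i = 0; i < nwords; i++)          /* copy forwards */
      regs[dest + i] = regs[src + i];
  else
    for (int i = nwords - 1; i >= 0; i--)     /* copy backwards */
      regs[dest + i] = regs[src + i];
}

int main (void)
{
  unsigned regs[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  copy_words (regs, 2, 1, 4);                 /* overlapping: done backwards */
  for (int i = 0; i < 8; i++)
    printf ("%u ", regs[i]);                  /* 0 1 1 2 3 4 6 7 */
  printf ("\n");
  return 0;
}
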
+/* Split a COPY_S.D with operands DEST, SRC and INDEX. GEN_FN generates
+ the word-sized COPY instructions used for each half. */
+
+void
+mips_split_msa_copy_d (rtx dest, rtx src, rtx index,
+ rtx (*gen_fn)(rtx, rtx, rtx))
+{
+ gcc_assert ((GET_MODE (src) == V2DImode && GET_MODE (dest) == DImode)
+ || (GET_MODE (src) == V2DFmode && GET_MODE (dest) == DFmode));
+
+ /* Note that low is always from the lower index, and high is always
+ from the higher index. */
+ rtx low = mips_subword (dest, false);
+ rtx high = mips_subword (dest, true);
+ rtx new_src = simplify_gen_subreg (V4SImode, src, GET_MODE (src), 0);
+
+ emit_insn (gen_fn (low, new_src, GEN_INT (INTVAL (index) * 2)));
+ emit_insn (gen_fn (high, new_src, GEN_INT (INTVAL (index) * 2 + 1)));
+}
+
+/* Split an INSERT.D with operands DEST, SRC1, INDEX and SRC2. */
+
+void
+mips_split_msa_insert_d (rtx dest, rtx src1, rtx index, rtx src2)
+{
+ int i;
+ gcc_assert (GET_MODE (dest) == GET_MODE (src1));
+ gcc_assert ((GET_MODE (dest) == V2DImode
+ && (GET_MODE (src2) == DImode || src2 == const0_rtx))
+ || (GET_MODE (dest) == V2DFmode && GET_MODE (src2) == DFmode));
+
+ /* Note that low is always from the lower index, and high is always
+ from the higher index. */
+ rtx low = mips_subword (src2, false);
+ rtx high = mips_subword (src2, true);
+ rtx new_dest = simplify_gen_subreg (V4SImode, dest, GET_MODE (dest), 0);
+ rtx new_src1 = simplify_gen_subreg (V4SImode, src1, GET_MODE (src1), 0);
+ i = exact_log2 (INTVAL (index));
+ gcc_assert (i != -1);
+
+ emit_insn (gen_msa_insert_w (new_dest, new_src1,
+ GEN_INT (i * 2), low));
+ emit_insn (gen_msa_insert_w (new_dest, new_dest,
+ GEN_INT (i * 2 + 1), high));
+}
+
+/* Split a FILL.D with operands DEST and SRC. */
+
+void
+mips_split_msa_fill_d (rtx dest, rtx src)
+{
+ gcc_assert ((GET_MODE (dest) == V2DImode
+ && (GET_MODE (src) == DImode || src == const0_rtx))
+ || (GET_MODE (dest) == V2DFmode && GET_MODE (src) == DFmode));
+
+ /* Note that low is always from the lower index, and high is always
+ from the higher index. */
+ rtx low, high;
+ if (src == const0_rtx)
+ {
+ low = src;
+ high = src;
+ }
+ else
+ {
+ low = mips_subword (src, false);
+ high = mips_subword (src, true);
+ }
+ rtx new_dest = simplify_gen_subreg (V4SImode, dest, GET_MODE (dest), 0);
+ emit_insn (gen_msa_fill_w (new_dest, low));
+ emit_insn (gen_msa_insert_w (new_dest, new_dest, const1_rtx, high));
+ emit_insn (gen_msa_insert_w (new_dest, new_dest, GEN_INT (3), high));
+}
+
/* Return true if a move from SRC to DEST in INSN should be split. */
bool
@@ -4492,19 +5035,25 @@ mips_split_move_insn (rtx dest, rtx src, rtx insn)
const char *
mips_output_move (rtx dest, rtx src)
{
- enum rtx_code dest_code, src_code;
- enum machine_mode mode;
+ enum rtx_code dest_code = GET_CODE (dest);
+ enum rtx_code src_code = GET_CODE (src);
+ machine_mode mode = GET_MODE (dest);
+ bool dbl_p = (GET_MODE_SIZE (mode) == 8);
+ bool msa_p = MSA_SUPPORTED_MODE_P (mode);
enum mips_symbol_type symbol_type;
- bool dbl_p;
-
- dest_code = GET_CODE (dest);
- src_code = GET_CODE (src);
- mode = GET_MODE (dest);
- dbl_p = (GET_MODE_SIZE (mode) == 8);
if (mips_split_move_p (dest, src, SPLIT_IF_NECESSARY))
return "#";
+ if (msa_p
+ && dest_code == REG && FP_REG_P (REGNO (dest))
+ && src_code == CONST_VECTOR
+ && CONST_INT_P (CONST_VECTOR_ELT (src, 0)))
+ {
+ gcc_assert (const_yi_operand (src, mode));
+ return "ldi.%v0\t%w0,%E1";
+ }
+
if ((src_code == REG && GP_REG_P (REGNO (src)))
|| (!TARGET_MIPS16 && src == CONST0_RTX (mode)))
{
@@ -4523,7 +5072,12 @@ mips_output_move (rtx dest, rtx src)
/* Moves to HI are handled by special .md insns. */
if (REGNO (dest) == LO_REGNUM)
- return "mtlo\t%z1";
+ {
+ if (ISA_HAS_MULT)
+ return "mtlo\t%z1";
+ else
+ return "mtlo\t%z1,$ac0";
+ }
if (DSP_ACC_REG_P (REGNO (dest)))
{
@@ -4535,7 +5089,15 @@ mips_output_move (rtx dest, rtx src)
}
if (FP_REG_P (REGNO (dest)))
- return dbl_p ? "dmtc1\t%z1,%0" : "mtc1\t%z1,%0";
+ {
+ if (msa_p)
+ {
+ gcc_assert (src == CONST0_RTX (GET_MODE (src)));
+ return "ldi.%v0\t%w0,0";
+ }
+
+ return dbl_p ? "dmtc1\t%z1,%0" : "mtc1\t%z1,%0";
+ }
if (ALL_COP_REG_P (REGNO (dest)))
{
@@ -4552,6 +5114,7 @@ mips_output_move (rtx dest, rtx src)
case 2: return "sh\t%z1,%0";
case 4: return "sw\t%z1,%0";
case 8: return "sd\t%z1,%0";
+ default: gcc_unreachable ();
}
}
if (dest_code == REG && GP_REG_P (REGNO (dest)))
@@ -4567,7 +5130,10 @@ mips_output_move (rtx dest, rtx src)
-mfix-vr4130. */
if (ISA_HAS_MACCHI)
return dbl_p ? "dmacc\t%0,%.,%." : "macc\t%0,%.,%.";
- return "mflo\t%0";
+ if (ISA_HAS_MULT)
+ return "mflo\t%0";
+ else
+ return "mflo\t%0,$ac0";
}
if (DSP_ACC_REG_P (REGNO (src)))
@@ -4580,7 +5146,10 @@ mips_output_move (rtx dest, rtx src)
}
if (FP_REG_P (REGNO (src)))
- return dbl_p ? "dmfc1\t%0,%1" : "mfc1\t%0,%1";
+ {
+ gcc_assert (!msa_p);
+ return dbl_p ? "dmfc1\t%0,%1" : "mfc1\t%0,%1";
+ }
if (ALL_COP_REG_P (REGNO (src)))
{
@@ -4598,6 +5167,7 @@ mips_output_move (rtx dest, rtx src)
case 2: return "lhu\t%0,%1";
case 4: return "lw\t%0,%1";
case 8: return "ld\t%0,%1";
+ default: gcc_unreachable ();
}
if (src_code == CONST_INT)
@@ -4644,17 +5214,29 @@ mips_output_move (rtx dest, rtx src)
{
if (GET_MODE (dest) == V2SFmode)
return "mov.ps\t%0,%1";
+ else if (msa_p)
+ return "move.v\t%w0,%w1";
else
return dbl_p ? "mov.d\t%0,%1" : "mov.s\t%0,%1";
}
if (dest_code == MEM)
- return dbl_p ? "sdc1\t%1,%0" : "swc1\t%1,%0";
+ {
+ if (msa_p)
+ return "st.%v1\t%w1,%0";
+
+ return dbl_p ? "sdc1\t%1,%0" : "swc1\t%1,%0";
+ }
}
if (dest_code == REG && FP_REG_P (REGNO (dest)))
{
if (src_code == MEM)
- return dbl_p ? "ldc1\t%0,%1" : "lwc1\t%0,%1";
+ {
+ if (msa_p)
+ return "ld.%v0\t%w0,%1";
+
+ return dbl_p ? "ldc1\t%0,%1" : "lwc1\t%0,%1";
+ }
}
if (dest_code == REG && ALL_COP_REG_P (REGNO (dest)) && src_code == MEM)
{
@@ -4714,7 +5296,7 @@ mips_int_order_operand_ok_p (enum rtx_code code, rtx cmp1)
static bool
mips_canonicalize_int_order_test (enum rtx_code *code, rtx *cmp1,
- enum machine_mode mode)
+ machine_mode mode)
{
HOST_WIDE_INT plus_one;
@@ -4759,7 +5341,7 @@ static void
mips_emit_int_order_test (enum rtx_code code, bool *invert_ptr,
rtx target, rtx cmp0, rtx cmp1)
{
- enum machine_mode mode;
+ machine_mode mode;
/* First see if there is a MIPS instruction that can do this operation.
If not, try doing the same for the inverse operation. If that also
@@ -4863,7 +5445,7 @@ mips_reversed_fp_cond (enum rtx_code *code)
a simple round-robin allocation scheme. */
static rtx
-mips_allocate_fcc (enum machine_mode mode)
+mips_allocate_fcc (machine_mode mode)
{
unsigned int regno, count;
@@ -4896,7 +5478,8 @@ mips_allocate_fcc (enum machine_mode mode)
conditions are:
- EQ or NE between two registers.
- - any comparison between a register and zero. */
+ - any comparison between a register and zero.
+ - on MIPS R6, any condition supported by the conditional branch instructions. */
static void
mips_emit_compare (enum rtx_code *code, rtx *op0, rtx *op1, bool need_eq_ne_p)
@@ -4918,6 +5501,44 @@ mips_emit_compare (enum rtx_code *code, rtx *op0, rtx *op1, bool need_eq_ne_p)
else
*op1 = force_reg (GET_MODE (cmp_op0), cmp_op1);
}
+ else if (!need_eq_ne_p && TARGET_CB_MAYBE)
+ {
+ bool swap = false;
+ switch (*code)
+ {
+ case LE:
+ swap = true;
+ *code = GE;
+ break;
+ case GT:
+ swap = true;
+ *code = LT;
+ break;
+ case LEU:
+ swap = true;
+ *code = GEU;
+ break;
+ case GTU:
+ swap = true;
+ *code = LTU;
+ break;
+ case GE:
+ case LT:
+ case GEU:
+ case LTU:
+ /* Do nothing. */
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ *op1 = force_reg (GET_MODE (cmp_op0), cmp_op1);
+ if (swap)
+ {
+ rtx tmp = *op1;
+ *op1 = *op0;
+ *op0 = tmp;
+ }
+ }
else
{
/* The comparison needs a separate scc instruction. Store the
@@ -4940,17 +5561,32 @@ mips_emit_compare (enum rtx_code *code, rtx *op0, rtx *op1, bool need_eq_ne_p)
{
enum rtx_code cmp_code;
- /* Floating-point tests use a separate C.cond.fmt comparison to
- set a condition code register. The branch or conditional move
- will then compare that register against zero.
+ /* Floating-point tests use a separate C.cond.fmt or CMP.cond.fmt
+ comparison to set a register. The branch or conditional move will
+ then compare that register against zero.
Set CMP_CODE to the code of the comparison instruction and
*CODE to the code that the branch or move should use. */
cmp_code = *code;
- *code = mips_reversed_fp_cond (&cmp_code) ? EQ : NE;
- *op0 = (ISA_HAS_8CC
- ? mips_allocate_fcc (CCmode)
- : gen_rtx_REG (CCmode, FPSW_REGNUM));
+ if (ISA_HAS_CCF)
+ {
+ /* All FP conditions can be implemented directly with CMP.cond.fmt
+ or by reversing the operands. */
+ *code = NE;
+ *op0 = gen_reg_rtx (CCFmode);
+ }
+ else
+ {
+ /* Three FP conditions cannot be implemented by reversing the
+ operands for C.cond.fmt; instead, a reversed condition code is
+ required together with a test for false. */
+ *code = mips_reversed_fp_cond (&cmp_code) ? EQ : NE;
+ if (ISA_HAS_8CC)
+ *op0 = mips_allocate_fcc (CCmode);
+ else
+ *op0 = gen_rtx_REG (CCmode, FPSW_REGNUM);
+ }
+
*op1 = const0_rtx;
mips_emit_binary (cmp_code, *op0, cmp_op0, cmp_op1);
}
@@ -5003,6 +5639,30 @@ mips_expand_conditional_branch (rtx *operands)
emit_jump_insn (gen_condjump (condition, operands[3]));
}
+/* Generate RTL to test OPERANDS[1] using the branch generated by GEN_FN,
+ then set OPERANDS[0] to 1 if the test is true and to 0 if it is false. */
+
+void
+mips_expand_msa_branch (rtx *operands, rtx (*gen_fn)(rtx, rtx, rtx))
+{
+ rtx labelT = gen_label_rtx ();
+ rtx labelE = gen_label_rtx ();
+ rtx tmp = gen_fn (labelT, operands[1], const0_rtx);
+
+ tmp = emit_jump_insn (tmp);
+ JUMP_LABEL (tmp) = labelT;
+ emit_move_insn (operands[0], const0_rtx);
+ tmp = emit_jump_insn (gen_jump (labelE));
+ emit_barrier ();
+ JUMP_LABEL (tmp) = labelE;
+ emit_label (labelT);
+ LABEL_NUSES (labelT) = 1;
+ emit_move_insn (operands[0], const1_rtx);
+ emit_label (labelE);
+ LABEL_NUSES (labelE) = 1;
+}
+
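
mips_expand_msa_branch materializes a 0/1 value from an MSA branch by emitting a conditional branch to a "true" label, a fall-through path that stores 0 and skips ahead, and a "true" path that stores 1. The equivalent control flow, with an ordinary C condition standing in for the MSA branch (an assumption for illustration), looks like this:

/* Control-flow sketch of the sequence emitted by mips_expand_msa_branch.  */
#include <stdio.h>

static int materialize (int condition)
{
  int result;
  if (condition)
    goto label_true;            /* the branch generated by GEN_FN */
  result = 0;                   /* fall-through: the test was false */
  goto label_end;
label_true:
  result = 1;                   /* branch target: the test was true */
label_end:
  return result;
}

int main (void)
{
  printf ("%d %d\n", materialize (42), materialize (0));
  return 0;
}
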
/* Implement:
(set temp (COND:CCV2 CMP_OP0 CMP_OP1))
@@ -5040,9 +5700,45 @@ mips_expand_conditional_move (rtx *operands)
mips_emit_compare (&code, &op0, &op1, true);
cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
- emit_insn (gen_rtx_SET (VOIDmode, operands[0],
- gen_rtx_IF_THEN_ELSE (GET_MODE (operands[0]), cond,
- operands[2], operands[3])));
+
+ /* There is no direct support for a general conditional GP move involving
+ two registers when using SEL; it can only select between a register and zero. */
+ if (ISA_HAS_SEL
+ && INTEGRAL_MODE_P (GET_MODE (operands[2]))
+ && register_operand (operands[2], VOIDmode)
+ && register_operand (operands[3], VOIDmode))
+ {
+ machine_mode mode = GET_MODE (operands[0]);
+ rtx temp = gen_reg_rtx (mode);
+ rtx temp2 = gen_reg_rtx (mode);
+
+ emit_insn (gen_rtx_SET (VOIDmode, temp,
+ gen_rtx_IF_THEN_ELSE (mode, cond,
+ operands[2], const0_rtx)));
+
+ /* Flip the test for the second operand. */
+ cond = gen_rtx_fmt_ee ((code == EQ) ? NE : EQ, GET_MODE (op0), op0, op1);
+
+ emit_insn (gen_rtx_SET (VOIDmode, temp2,
+ gen_rtx_IF_THEN_ELSE (mode, cond,
+ operands[3], const0_rtx)));
+
+ /* Merge the two results; at least one is guaranteed to be zero. */
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+ gen_rtx_IOR (mode, temp, temp2)));
+ }
+ else
+ {
+ if (FLOAT_MODE_P (GET_MODE (operands[2])) && !ISA_HAS_SEL)
+ {
+ operands[2] = force_reg (GET_MODE (operands[0]), operands[2]);
+ operands[3] = force_reg (GET_MODE (operands[0]), operands[3]);
+ }
+
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+ gen_rtx_IF_THEN_ELSE (GET_MODE (operands[0]), cond,
+ operands[2], operands[3])));
+ }
}
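
Because SEL can only choose between a register and zero, the code above builds a general two-register conditional move from two register-versus-zero selects under opposite conditions and merges them with an OR. A standalone sketch of that idea, with ordinary C selects standing in for the SEL instructions (an assumption for illustration):

/* Sketch of the SEL-based conditional move: at most one temporary is
   non-zero, so OR-ing them yields the selected value.  */
#include <stdio.h>

static unsigned cond_move (int cond, unsigned if_true, unsigned if_false)
{
  unsigned temp  = cond ? if_true : 0;    /* select if_true or zero */
  unsigned temp2 = cond ? 0 : if_false;   /* same select with the test flipped */
  return temp | temp2;                    /* merge; one side is always zero */
}

int main (void)
{
  printf ("%u %u\n", cond_move (1, 10, 20), cond_move (0, 10, 20));
  return 0;
}
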
/* Perform the comparison in COMPARISON, then trap if the condition holds. */
@@ -5051,7 +5747,7 @@ void
mips_expand_conditional_trap (rtx comparison)
{
rtx op0, op1;
- enum machine_mode mode;
+ machine_mode mode;
enum rtx_code code;
/* MIPS conditional trap instructions don't have GT or LE flavors,
@@ -5076,7 +5772,9 @@ mips_expand_conditional_trap (rtx comparison)
mode = GET_MODE (XEXP (comparison, 0));
op0 = force_reg (mode, op0);
- if (!arith_operand (op1, mode))
+ if (!(ISA_HAS_COND_TRAPI
+ ? arith_operand (op1, mode)
+ : reg_or_0_operand (op1, mode)))
op1 = force_reg (mode, op1);
emit_insn (gen_rtx_TRAP_IF (VOIDmode,
@@ -5101,7 +5799,7 @@ mips_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype)
static void
mips_get_arg_info (struct mips_arg_info *info, const CUMULATIVE_ARGS *cum,
- enum machine_mode mode, const_tree type, bool named)
+ machine_mode mode, const_tree type, bool named)
{
bool doubleword_aligned_p;
unsigned int num_bytes, num_words, max_regs;
@@ -5130,6 +5828,7 @@ mips_get_arg_info (struct mips_arg_info *info, const CUMULATIVE_ARGS *cum,
/* Only leading floating-point scalars are passed in
floating-point registers. We also handle vector floats the same
way, which is OK because they are not covered by the standard ABI. */
+ gcc_assert (TARGET_PAIRED_SINGLE_FLOAT || mode != V2SFmode);
info->fpr_p = (!cum->gp_reg_found
&& cum->arg_number < 2
&& (type == 0
@@ -5145,6 +5844,7 @@ mips_get_arg_info (struct mips_arg_info *info, const CUMULATIVE_ARGS *cum,
/* Scalar, complex and vector floating-point types are passed in
floating-point registers, as long as this is a named rather
than a variable argument. */
+ gcc_assert (TARGET_PAIRED_SINGLE_FLOAT || mode != V2SFmode);
info->fpr_p = (named
&& (type == 0 || FLOAT_TYPE_P (type))
&& (GET_MODE_CLASS (mode) == MODE_FLOAT
@@ -5238,7 +5938,7 @@ mips_strict_argument_naming (cumulative_args_t ca ATTRIBUTE_UNUSED)
/* Implement TARGET_FUNCTION_ARG. */
static rtx
-mips_function_arg (cumulative_args_t cum_v, enum machine_mode mode,
+mips_function_arg (cumulative_args_t cum_v, machine_mode mode,
const_tree type, bool named)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
@@ -5251,7 +5951,7 @@ mips_function_arg (cumulative_args_t cum_v, enum machine_mode mode,
if (mode == VOIDmode)
{
if (TARGET_MIPS16 && cum->fp_code != 0)
- return gen_rtx_REG ((enum machine_mode) cum->fp_code, 0);
+ return gen_rtx_REG ((machine_mode) cum->fp_code, 0);
else
return NULL;
}
@@ -5334,7 +6034,7 @@ mips_function_arg (cumulative_args_t cum_v, enum machine_mode mode,
&& GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
{
rtx real, imag;
- enum machine_mode inner;
+ machine_mode inner;
unsigned int regno;
inner = GET_MODE_INNER (mode);
@@ -5365,7 +6065,7 @@ mips_function_arg (cumulative_args_t cum_v, enum machine_mode mode,
/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
static void
-mips_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
+mips_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
const_tree type, bool named)
{
CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
@@ -5403,7 +6103,7 @@ mips_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
static int
mips_arg_partial_bytes (cumulative_args_t cum,
- enum machine_mode mode, tree type, bool named)
+ machine_mode mode, tree type, bool named)
{
struct mips_arg_info info;
@@ -5416,7 +6116,7 @@ mips_arg_partial_bytes (cumulative_args_t cum,
to STACK_BOUNDARY bits if the type requires it. */
static unsigned int
-mips_function_arg_boundary (enum machine_mode mode, const_tree type)
+mips_function_arg_boundary (machine_mode mode, const_tree type)
{
unsigned int alignment;
@@ -5428,13 +6128,23 @@ mips_function_arg_boundary (enum machine_mode mode, const_tree type)
return alignment;
}
+/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
+
+static machine_mode
+mips_get_reg_raw_mode (int regno)
+{
+ if (TARGET_FLOATXX && FP_REG_P (regno))
+ return DFmode;
+ return default_get_reg_raw_mode (regno);
+}
+
/* Return true if FUNCTION_ARG_PADDING (MODE, TYPE) should return
upward rather than downward. In other words, return true if the
first byte of the stack slot has useful data, false if the last
byte does. */
bool
-mips_pad_arg_upward (enum machine_mode mode, const_tree type)
+mips_pad_arg_upward (machine_mode mode, const_tree type)
{
/* On little-endian targets, the first byte of every stack argument
is passed in the first byte of the stack slot. */
@@ -5472,7 +6182,7 @@ mips_pad_arg_upward (enum machine_mode mode, const_tree type)
the opposite if the most significant byte does. */
bool
-mips_pad_reg_upward (enum machine_mode mode, tree type)
+mips_pad_reg_upward (machine_mode mode, tree type)
{
/* No shifting is required for floating-point arguments. */
if (type != 0 ? FLOAT_TYPE_P (type) : GET_MODE_CLASS (mode) == MODE_FLOAT)
@@ -5487,7 +6197,7 @@ mips_pad_reg_upward (enum machine_mode mode, tree type)
static bool
mips_pass_by_reference (cumulative_args_t cum ATTRIBUTE_UNUSED,
- enum machine_mode mode, const_tree type,
+ machine_mode mode, const_tree type,
bool named ATTRIBUTE_UNUSED)
{
if (mips_abi == ABI_EABI)
@@ -5514,7 +6224,7 @@ mips_pass_by_reference (cumulative_args_t cum ATTRIBUTE_UNUSED,
static bool
mips_callee_copies (cumulative_args_t cum ATTRIBUTE_UNUSED,
- enum machine_mode mode ATTRIBUTE_UNUSED,
+ machine_mode mode ATTRIBUTE_UNUSED,
const_tree type ATTRIBUTE_UNUSED, bool named)
{
return mips_abi == ABI_EABI && named;
@@ -5583,8 +6293,9 @@ mips_return_in_msb (const_tree valtype)
floating-point register. */
static bool
-mips_return_mode_in_fpr_p (enum machine_mode mode)
+mips_return_mode_in_fpr_p (machine_mode mode)
{
+ gcc_assert (TARGET_PAIRED_SINGLE_FLOAT || mode != V2SFmode);
return ((GET_MODE_CLASS (mode) == MODE_FLOAT
|| mode == V2SFmode
|| GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
@@ -5602,8 +6313,8 @@ mips_return_mode_in_fpr_p (enum machine_mode mode)
the structure itself has mode BLKmode. */
static rtx
-mips_return_fpr_single (enum machine_mode type_mode,
- enum machine_mode value_mode)
+mips_return_fpr_single (machine_mode type_mode,
+ machine_mode value_mode)
{
rtx x;
@@ -5625,13 +6336,13 @@ mips_return_fpr_single (enum machine_mode type_mode,
Otherwise the values are packed together as closely as possible. */
static rtx
-mips_return_fpr_pair (enum machine_mode mode,
- enum machine_mode mode1, HOST_WIDE_INT offset1,
- enum machine_mode mode2, HOST_WIDE_INT offset2)
+mips_return_fpr_pair (machine_mode mode,
+ machine_mode mode1, HOST_WIDE_INT offset1,
+ machine_mode mode2, HOST_WIDE_INT offset2)
{
int inc;
- inc = (TARGET_NEWABI ? 2 : MAX_FPRS_PER_FMT);
+ inc = (TARGET_NEWABI || mips_abi == ABI_32 ? 2 : MAX_FPRS_PER_FMT);
return gen_rtx_PARALLEL
(mode,
gen_rtvec (2,
@@ -5650,7 +6361,7 @@ mips_return_fpr_pair (enum machine_mode mode,
static rtx
mips_function_value_1 (const_tree valtype, const_tree fn_decl_or_type,
- enum machine_mode mode)
+ machine_mode mode)
{
if (valtype)
{
@@ -5740,26 +6451,38 @@ mips_function_value (const_tree valtype, const_tree fn_decl_or_type,
/* Implement TARGET_LIBCALL_VALUE. */
static rtx
-mips_libcall_value (enum machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED)
+mips_libcall_value (machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED)
{
return mips_function_value_1 (NULL_TREE, NULL_TREE, mode);
}
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
- On the MIPS, R2 R3 and F0 F2 are the only register thus used.
- Currently, R2 and F0 are only implemented here (C has no complex type). */
+ On the MIPS, R2 R3 and F0 F2 are the only register thus used. */
static bool
mips_function_value_regno_p (const unsigned int regno)
{
+ /* Most types only require one GPR or one FPR for return values, but with
+ hard-float two FPRs can be used for _Complex types (for all ABIs)
+ and for long doubles (for n64). */
if (regno == GP_RETURN
|| regno == FP_RETURN
- || (LONG_DOUBLE_TYPE_SIZE == 128
- && FP_RETURN != GP_RETURN
+ || (FP_RETURN != GP_RETURN
&& regno == FP_RETURN + 2))
return true;
+ /* For o32 FP32, _Complex double will be returned in four 32-bit registers.
+ This does not apply to o32 FPXX as floating-point function argument and
+ return registers are described as 64-bit even though floating-point
+ registers are primarily described as 32-bit internally.
+ See: mips_get_reg_raw_mode. */
+ if ((mips_abi == ABI_32 && TARGET_FLOAT32)
+ && FP_RETURN != GP_RETURN
+ && (regno == FP_RETURN + 1
+ || regno == FP_RETURN + 3))
+ return true;
+
return false;
}
@@ -5780,7 +6503,7 @@ mips_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
/* Implement TARGET_SETUP_INCOMING_VARARGS. */
static void
-mips_setup_incoming_varargs (cumulative_args_t cum, enum machine_mode mode,
+mips_setup_incoming_varargs (cumulative_args_t cum, machine_mode mode,
tree type, int *pretend_size ATTRIBUTE_UNUSED,
int no_rtl)
{
@@ -5819,7 +6542,7 @@ mips_setup_incoming_varargs (cumulative_args_t cum, enum machine_mode mode,
{
/* We can't use move_block_from_reg, because it will use
the wrong mode. */
- enum machine_mode mode;
+ machine_mode mode;
int off, i;
/* Set OFF to the offset from virtual_incoming_args_rtx of
@@ -6468,7 +7191,7 @@ static struct mips16_stub *mips16_stubs;
return mode MODE in the name of a MIPS16 function stub. */
static const char *
-mips16_call_stub_mode_suffix (enum machine_mode mode)
+mips16_call_stub_mode_suffix (machine_mode mode)
{
if (mode == SFmode)
return "sf";
@@ -6479,7 +7202,10 @@ mips16_call_stub_mode_suffix (enum machine_mode mode)
else if (mode == DCmode)
return "dc";
else if (mode == V2SFmode)
- return "df";
+ {
+ gcc_assert (TARGET_PAIRED_SINGLE_FLOAT);
+ return "df";
+ }
else
gcc_unreachable ();
}
@@ -6503,13 +7229,27 @@ mips_output_64bit_xfer (char direction, unsigned int gpreg, unsigned int fpreg)
if (TARGET_64BIT)
fprintf (asm_out_file, "\tdm%cc1\t%s,%s\n", direction,
reg_names[gpreg], reg_names[fpreg]);
- else if (TARGET_FLOAT64)
+ else if (ISA_HAS_MXHC1)
{
fprintf (asm_out_file, "\tm%cc1\t%s,%s\n", direction,
reg_names[gpreg + TARGET_BIG_ENDIAN], reg_names[fpreg]);
fprintf (asm_out_file, "\tm%chc1\t%s,%s\n", direction,
reg_names[gpreg + TARGET_LITTLE_ENDIAN], reg_names[fpreg]);
}
+ else if (TARGET_FLOATXX && direction == 't')
+ {
+ /* Use the argument save area to move via memory. */
+ fprintf (asm_out_file, "\tsw\t%s,0($sp)\n", reg_names[gpreg]);
+ fprintf (asm_out_file, "\tsw\t%s,4($sp)\n", reg_names[gpreg + 1]);
+ fprintf (asm_out_file, "\tldc1\t%s,0($sp)\n", reg_names[fpreg]);
+ }
+ else if (TARGET_FLOATXX && direction == 'f')
+ {
+ /* Use the argument save area to move via memory. */
+ fprintf (asm_out_file, "\tsdc1\t%s,0($sp)\n", reg_names[fpreg]);
+ fprintf (asm_out_file, "\tlw\t%s,0($sp)\n", reg_names[gpreg]);
+ fprintf (asm_out_file, "\tlw\t%s,4($sp)\n", reg_names[gpreg + 1]);
+ }
else
{
/* Move the least-significant word. */
@@ -6539,7 +7279,7 @@ mips_output_args_xfer (int fp_code, char direction)
for (f = (unsigned int) fp_code; f != 0; f >>= 2)
{
- enum machine_mode mode;
+ machine_mode mode;
struct mips_arg_info info;
if ((f & 3) == 1)
@@ -6664,7 +7404,7 @@ mips16_copy_fpr_return_value (void)
{
rtx fn, insn, retval;
tree return_type;
- enum machine_mode return_mode;
+ machine_mode return_mode;
const char *name;
return_type = DECL_RESULT (current_function_decl);
@@ -6905,7 +7645,7 @@ mips16_build_call_stub (rtx retval, rtx *fn_ptr, rtx args_size, int fp_code)
$18 is usually a call-saved register. */
fprintf (asm_out_file, "\tmove\t%s,%s\n",
reg_names[GP_REG_FIRST + 18], reg_names[RETURN_ADDR_REGNUM]);
- output_asm_insn (MIPS_CALL ("jal", &fn, 0, -1), &fn);
+ output_asm_insn (mips_output_jump (&fn, 0, -1, true), &fn);
fprintf (asm_out_file, "\t.cfi_register 31,18\n");
/* Move the result from floating-point registers to
@@ -6915,11 +7655,11 @@ mips16_build_call_stub (rtx retval, rtx *fn_ptr, rtx args_size, int fp_code)
case SCmode:
mips_output_32bit_xfer ('f', GP_RETURN + TARGET_BIG_ENDIAN,
TARGET_BIG_ENDIAN
- ? FP_REG_FIRST + MAX_FPRS_PER_FMT
+ ? FP_REG_FIRST + 2
: FP_REG_FIRST);
mips_output_32bit_xfer ('f', GP_RETURN + TARGET_LITTLE_ENDIAN,
TARGET_LITTLE_ENDIAN
- ? FP_REG_FIRST + MAX_FPRS_PER_FMT
+ ? FP_REG_FIRST + 2
: FP_REG_FIRST);
if (GET_MODE (retval) == SCmode && TARGET_64BIT)
{
@@ -6948,10 +7688,12 @@ mips16_build_call_stub (rtx retval, rtx *fn_ptr, rtx args_size, int fp_code)
case DCmode:
mips_output_64bit_xfer ('f', GP_RETURN + (8 / UNITS_PER_WORD),
- FP_REG_FIRST + MAX_FPRS_PER_FMT);
+ FP_REG_FIRST + 2);
/* Fall though. */
case DFmode:
case V2SFmode:
+ gcc_assert (TARGET_PAIRED_SINGLE_FLOAT
+ || GET_MODE (retval) != V2SFmode);
mips_output_64bit_xfer ('f', GP_RETURN, FP_REG_FIRST);
break;
@@ -7168,35 +7910,6 @@ mips_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
return true;
}
-/* Emit code to move general operand SRC into condition-code
- register DEST given that SCRATCH is a scratch TFmode FPR.
- The sequence is:
-
- FP1 = SRC
- FP2 = 0.0f
- DEST = FP2 < FP1
-
- where FP1 and FP2 are single-precision FPRs taken from SCRATCH. */
-
-void
-mips_expand_fcc_reload (rtx dest, rtx src, rtx scratch)
-{
- rtx fp1, fp2;
-
- /* Change the source to SFmode. */
- if (MEM_P (src))
- src = adjust_address (src, SFmode, 0);
- else if (REG_P (src) || GET_CODE (src) == SUBREG)
- src = gen_rtx_REG (SFmode, true_regnum (src));
-
- fp1 = gen_rtx_REG (SFmode, REGNO (scratch));
- fp2 = gen_rtx_REG (SFmode, REGNO (scratch) + MAX_FPRS_PER_FMT);
-
- mips_emit_move (copy_rtx (fp1), src);
- mips_emit_move (copy_rtx (fp2), CONST0_RTX (SFmode));
- emit_insn (gen_slt_sf (dest, fp2, fp1));
-}
-
/* Implement MOVE_BY_PIECES_P. */
bool
@@ -7245,7 +7958,7 @@ mips_store_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
LW/SWL/SWR sequence. This is often better than the 4 LIs and
4 SBs that we would generate when storing by pieces. */
if (align <= BITS_PER_UNIT)
- return size < 4;
+ return size < 4 || !ISA_HAS_LWL_LWR;
/* If the data is 2-byte aligned, then:
@@ -7280,7 +7993,9 @@ mips_store_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
(c4) A block move of 8 bytes can use two LW/SW sequences or a single
LD/SD sequence, and in these cases we've traditionally preferred
the memory copy over the more bulky constant moves. */
- return size < 8;
+ return (size < 8
+ || (align < 4 * BITS_PER_UNIT
+ && !ISA_HAS_LWL_LWR));
}
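/* Editor's illustration (not part of the patch): the effect of the
   ISA_HAS_LWL_LWR checks added above, modelled as a stand-alone predicate.
   Only the two return statements changed in this hunk are mirrored; the
   intermediate alignment cases of the real function are omitted.  On ISAs
   without LWL/LWR (e.g. R6) the SWL/SWR trick is unavailable, so storing
   by pieces wins in more cases.  has_lwl_lwr stands in for ISA_HAS_LWL_LWR;
   sizes are in bytes, alignment in bits.  */
#include <stdbool.h>
#include <stdio.h>

static bool
store_by_pieces_p (unsigned size, unsigned align_bits, bool has_lwl_lwr)
{
  if (align_bits <= 8)
    return size < 4 || !has_lwl_lwr;
  /* Word-or-better data: traditional 8-byte cut-off, relaxed when LWL/LWR
     are missing and the block is not word aligned.  */
  return size < 8 || (align_bits < 32 && !has_lwl_lwr);
}

int
main (void)
{
  printf ("%d\n", store_by_pieces_p (6, 8, true));   /* 0: use the SWL/SWR copy */
  printf ("%d\n", store_by_pieces_p (6, 8, false));  /* 1: store by pieces */
  return 0;
}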
/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
@@ -7292,7 +8007,7 @@ mips_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
HOST_WIDE_INT offset, delta;
unsigned HOST_WIDE_INT bits;
int i;
- enum machine_mode mode;
+ machine_mode mode;
rtx *regs;
/* Work out how many bits to move at a time. If both operands have
@@ -7422,6 +8137,11 @@ mips_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
bool
mips_expand_block_move (rtx dest, rtx src, rtx length)
{
+ if (!ISA_HAS_LWL_LWR
+ && (MEM_ALIGN (src) < BITS_PER_WORD
+ || MEM_ALIGN (dest) < BITS_PER_WORD))
+ return false;
+
if (CONST_INT_P (length))
{
if (INTVAL (length) <= MIPS_MAX_MOVE_BYTES_STRAIGHT)
@@ -7512,7 +8232,7 @@ mips_expand_atomic_qihi (union mips_gen_fn_ptrs generator,
rtx orig_addr, memsi_addr, memsi, shift, shiftsi, unshifted_mask;
rtx unshifted_mask_reg, mask, inverted_mask, si_op;
rtx res = NULL;
- enum machine_mode mode;
+ machine_mode mode;
mode = GET_MODE (mem);
@@ -7697,7 +8417,7 @@ mips_expand_ins_as_unaligned_store (rtx dest, rtx src, HOST_WIDE_INT width,
HOST_WIDE_INT bitpos)
{
rtx left, right;
- enum machine_mode mode;
+ machine_mode mode;
if (!mips_get_unaligned_mem (dest, width, bitpos, &left, &right))
return false;
@@ -7720,7 +8440,7 @@ mips_expand_ins_as_unaligned_store (rtx dest, rtx src, HOST_WIDE_INT width,
/* Return true if X is a MEM with the same size as MODE. */
bool
-mips_mem_fits_mode_p (enum machine_mode mode, rtx x)
+mips_mem_fits_mode_p (machine_mode mode, rtx x)
{
return (MEM_P (x)
&& MEM_SIZE_KNOWN_P (x)
@@ -7762,7 +8482,7 @@ mips_use_ins_ext_p (rtx op, HOST_WIDE_INT width, HOST_WIDE_INT bitpos)
mask_low_and_shift_len for the actual definition. */
bool
-mask_low_and_shift_p (enum machine_mode mode, rtx mask, rtx shift, int maxlen)
+mask_low_and_shift_p (machine_mode mode, rtx mask, rtx shift, int maxlen)
{
return IN_RANGE (mask_low_and_shift_len (mode, mask, shift), 1, maxlen);
}
@@ -7772,7 +8492,7 @@ mask_low_and_shift_p (enum machine_mode mode, rtx mask, rtx shift, int maxlen)
see the table in the comment before the pattern. */
bool
-and_operands_ok (enum machine_mode mode, rtx op1, rtx op2)
+and_operands_ok (machine_mode mode, rtx op1, rtx op2)
{
return (memory_operand (op1, mode)
? and_load_operand (op2, mode)
@@ -7786,7 +8506,7 @@ and_operands_ok (enum machine_mode mode, rtx op1, rtx op2)
return the length of the mask, otherwise return -1. */
int
-mask_low_and_shift_len (enum machine_mode mode, rtx mask, rtx shift)
+mask_low_and_shift_len (machine_mode mode, rtx mask, rtx shift)
{
HOST_WIDE_INT shval;
@@ -8031,7 +8751,7 @@ mips_pop_asm_switch (struct mips_asm_switch *asm_switch)
'!' Print "s" to use the short version if the delay slot contains a
16-bit instruction.
- See also mips_init_print_operand_pucnt. */
+ See also mips_init_print_operand_punct. */
static void
mips_print_operand_punctuation (FILE *file, int ch)
@@ -8115,7 +8835,8 @@ mips_print_operand_punctuation (FILE *file, int ch)
case ':':
/* When final_sequence is 0, the delay slot will be a nop. We can
- use the compact version for microMIPS. */
+ use the compact version where available. The %: formatter will
+ only be present if a compact form of the branch is available. */
if (final_sequence == 0)
putc ('c', file);
break;
@@ -8123,8 +8844,9 @@ mips_print_operand_punctuation (FILE *file, int ch)
case '!':
/* If the delay slot instruction is short, then use the
compact version. */
- if (final_sequence == 0
- || get_attr_length (XVECEXP (final_sequence, 0, 1)) == 2)
+ if (TARGET_MICROMIPS && !TARGET_INTERLINK_COMPRESSED && mips_isa_rev <= 5
+ && (final_sequence == 0
+ || get_attr_length (XVECEXP (final_sequence, 0, 1)) == 2))
putc ('s', file);
break;
@@ -8183,11 +8905,17 @@ mips_print_float_branch_condition (FILE *file, enum rtx_code code, int letter)
switch (code)
{
case EQ:
- fputs ("c1f", file);
+ if (ISA_HAS_CCF)
+ fputs ("c1eqz", file);
+ else
+ fputs ("c1f", file);
break;
case NE:
- fputs ("c1t", file);
+ if (ISA_HAS_CCF)
+ fputs ("c1nez", file);
+ else
+ fputs ("c1t", file);
break;
default:
@@ -8206,10 +8934,13 @@ mips_print_operand_punct_valid_p (unsigned char code)
/* Implement TARGET_PRINT_OPERAND. The MIPS-specific operand codes are:
+   'E'  Print CONST_INT OP element 0 of a replicated CONST_VECTOR in decimal.
'X' Print CONST_INT OP in hexadecimal format.
'x' Print the low 16 bits of CONST_INT OP in hexadecimal format.
'd' Print CONST_INT OP in decimal.
+ 'B' Print CONST_INT as an unsigned byte [0..255].
'm' Print one less than CONST_INT OP in decimal.
+ 'y' Print exact log2 of CONST_INT OP in decimal.
'h' Print the high-part relocation associated with OP, after stripping
any outermost HIGH.
'R' Print the low-part relocation associated with OP.
@@ -8217,6 +8948,7 @@ mips_print_operand_punct_valid_p (unsigned char code)
'N' Print the inverse of the integer branch condition for comparison OP.
'F' Print the FPU branch condition for comparison OP.
'W' Print the inverse of the FPU branch condition for comparison OP.
+ 'w' Print a MSA register.
'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...),
'z' for (eq:?I ...), 'n' for (ne:?I ...).
't' Like 'T', but with the EQ/NE cases reversed
@@ -8227,7 +8959,9 @@ mips_print_operand_punct_valid_p (unsigned char code)
'L' Print the low-order register in a double-word register operand.
'M' Print high-order register in a double-word register operand.
'z' Print $0 if OP is zero, otherwise print OP normally.
- 'b' Print the address of a memory operand, without offset. */
+ 'b' Print the address of a memory operand, without offset.
+   'v'  Print the insn size suffix b, h, w or d for the vector modes V16QI,
+        V8HI, V4SI and V2DI, and w or d for the vector modes V4SF and V2DF. */
static void
mips_print_operand (FILE *file, rtx op, int letter)
@@ -8245,6 +8979,18 @@ mips_print_operand (FILE *file, rtx op, int letter)
switch (letter)
{
+ case 'E':
+ if (GET_CODE (op) == CONST_VECTOR)
+ {
+ gcc_assert (mips_const_vector_same_val_p (op, GET_MODE (op)));
+ op = CONST_VECTOR_ELT (op, 0);
+ gcc_assert (CONST_INT_P (op));
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (op));
+ }
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+
case 'X':
if (CONST_INT_P (op))
fprintf (file, HOST_WIDE_INT_PRINT_HEX, INTVAL (op));
@@ -8266,6 +9012,46 @@ mips_print_operand (FILE *file, rtx op, int letter)
output_operand_lossage ("invalid use of '%%%c'", letter);
break;
+ case 'B':
+ if (CONST_INT_P (op))
+ {
+ HOST_WIDE_INT val = INTVAL (op);
+ if (val < 0)
+ {
+ gcc_assert (val >= -128);
+ val += 256;
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
+ }
+ else
+ {
+ gcc_assert (val <= 255);
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
+ }
+ }
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+
+ case 'K':
+ if (CONST_INT_P (op))
+ {
+ int val = INTVAL (op);
+ int i;
+ for (i = 0; i < 16; i++)
+ {
+ if ((val & (1 << i)) == val)
+ {
+ fprintf (file, "%d", i);
+ break;
+ }
+ }
+ if (i == 16)
+ output_operand_lossage ("invalid use of '%%%c' - Mask inappropriate", letter);
+ }
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+
case 'm':
if (CONST_INT_P (op))
fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (op) - 1);
@@ -8273,6 +9059,19 @@ mips_print_operand (FILE *file, rtx op, int letter)
output_operand_lossage ("invalid use of '%%%c'", letter);
break;
+ case 'y':
+ if (CONST_INT_P (op))
+ {
+ int val = exact_log2 (INTVAL (op));
+ if (val != -1)
+ fprintf (file, "%d", val);
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ }
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+
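/* Editor's illustration (not part of the patch): what the new 'B', 'K' and
   'y' operand formats above print for a few constants, using plain C
   stand-ins for the INTVAL-based logic.  */
#include <stdio.h>

/* 'B': print a CONST_INT as an unsigned byte 0..255.  */
static int as_byte (long val) { return val < 0 ? (int) (val + 256) : (int) val; }

/* 'K': position of the single set bit in a 16-bit mask, -1 otherwise.  */
static int mask_pos (int val)
{
  for (int i = 0; i < 16; i++)
    if ((val & (1 << i)) == val)
      return i;
  return -1;
}

/* 'y': exact log2, -1 if VAL is not a power of two.  */
static int exact_log2_ (long val)
{
  int n = 0;
  if (val <= 0 || (val & (val - 1)) != 0)
    return -1;
  while (val >>= 1)
    n++;
  return n;
}

int
main (void)
{
  printf ("%%B -3    -> %d\n", as_byte (-3));        /* 253 */
  printf ("%%K 0x40  -> %d\n", mask_pos (0x40));     /* 6 */
  printf ("%%y 1024  -> %d\n", exact_log2_ (1024));  /* 10 */
  return 0;
}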
case 'h':
if (code == HIGH)
op = XEXP (op, 0);
@@ -8317,7 +9116,7 @@ mips_print_operand (FILE *file, rtx op, int letter)
break;
case 'Z':
- if (ISA_HAS_8CC)
+ if (ISA_HAS_8CC || ISA_HAS_CCF)
{
mips_print_operand (file, op, 0);
fputc (',', file);
@@ -8333,6 +9132,39 @@ mips_print_operand (FILE *file, rtx op, int letter)
output_operand_lossage ("invalid use of '%%%c'", letter);
break;
+ case 'w':
+ if (code == REG && MSA_REG_P (REGNO (op)))
+ fprintf (file, "$w%s", &reg_names[REGNO (op)][2]);
+ else
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ break;
+
+ case 'v':
+ switch (GET_MODE (op))
+ {
+ case V16QImode:
+ fprintf (file, "b");
+ break;
+ case V8HImode:
+ fprintf (file, "h");
+ break;
+ case V4SImode:
+ fprintf (file, "w");
+ break;
+ case V2DImode:
+ fprintf (file, "d");
+ break;
+ case V4SFmode:
+ fprintf (file, "w");
+ break;
+ case V2DFmode:
+ fprintf (file, "d");
+ break;
+ default:
+ output_operand_lossage ("invalid use of '%%%c'", letter);
+ }
+ break;
+
default:
switch (code)
{
@@ -8436,7 +9268,7 @@ mips_encode_section_info (tree decl, rtx rtl, int first)
/* Implement TARGET_SELECT_RTX_SECTION. */
static section *
-mips_select_rtx_section (enum machine_mode mode, rtx x,
+mips_select_rtx_section (machine_mode mode, rtx x,
unsigned HOST_WIDE_INT align)
{
/* ??? Consider using mergeable small data sections. */
@@ -8684,17 +9516,31 @@ static rtx
mips_dwarf_register_span (rtx reg)
{
rtx high, low;
- enum machine_mode mode;
-
- /* By default, GCC maps increasing register numbers to increasing
- memory locations, but paired FPRs are always little-endian,
- regardless of the prevailing endianness. */
+ machine_mode mode;
+
+ /* TARGET_FLOATXX is implemented as 32-bit floating-point registers but
+ ensures that double-precision registers are treated as if they were
+ 64-bit physical registers. The code will run correctly with 32-bit or
+ 64-bit registers which means that dwarf information cannot be precise
+ for all scenarios. We choose to state that the 64-bit values are stored
+ in a single 64-bit 'piece'. This slightly unusual construct can then be
+ interpreted as either a pair of registers if the registers are 32-bit or
+ a single 64-bit register depending on hardware. */
mode = GET_MODE (reg);
if (FP_REG_P (REGNO (reg))
- && TARGET_BIG_ENDIAN
- && MAX_FPRS_PER_FMT > 1
+ && TARGET_FLOATXX
&& GET_MODE_SIZE (mode) > UNITS_PER_FPREG)
{
+ return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, reg));
+ }
+ /* By default, GCC maps increasing register numbers to increasing
+ memory locations, but paired FPRs are always little-endian,
+ regardless of the prevailing endianness. */
+ else if (FP_REG_P (REGNO (reg))
+ && TARGET_BIG_ENDIAN
+ && MAX_FPRS_PER_FMT > 1
+ && GET_MODE_SIZE (mode) > UNITS_PER_FPREG)
+ {
gcc_assert (GET_MODE_SIZE (mode) == UNITS_PER_HWFPVALUE);
high = mips_subword (reg, true);
low = mips_subword (reg, false);
@@ -8704,6 +9550,19 @@ mips_dwarf_register_span (rtx reg)
return NULL_RTX;
}
+/* Implement TARGET_DWARF_FRAME_REG_MODE. */
+
+static machine_mode
+mips_dwarf_frame_reg_mode (int regno)
+{
+ machine_mode mode = default_dwarf_frame_reg_mode (regno);
+
+ if (FP_REG_P (regno) && mips_abi == ABI_32 && TARGET_FLOAT64)
+ mode = SImode;
+
+ return mode;
+}
+
/* DSP ALU can bypass data with no delays for the following pairs. */
enum insn_code dspalu_bypass_table[][2] =
{
@@ -8983,6 +9842,31 @@ mips_file_start (void)
fprintf (asm_out_file, "\t.nan\t%s\n",
mips_nan == MIPS_IEEE_754_2008 ? "2008" : "legacy");
+#ifdef HAVE_AS_DOT_MODULE
+ /* Record the FP ABI. See below for comments. */
+ if (TARGET_NO_FLOAT)
+#ifdef HAVE_AS_GNU_ATTRIBUTE
+ fputs ("\t.gnu_attribute 4, 0\n", asm_out_file);
+#else
+ ;
+#endif
+ else if (!TARGET_HARD_FLOAT_ABI)
+ fputs ("\t.module\tsoftfloat\n", asm_out_file);
+ else if (!TARGET_DOUBLE_FLOAT)
+ fputs ("\t.module\tsinglefloat\n", asm_out_file);
+ else if (TARGET_FLOATXX)
+ fputs ("\t.module\tfp=xx\n", asm_out_file);
+ else if (TARGET_FLOAT64)
+ fputs ("\t.module\tfp=64\n", asm_out_file);
+ else
+ fputs ("\t.module\tfp=32\n", asm_out_file);
+
+ if (TARGET_ODD_SPREG)
+ fputs ("\t.module\toddspreg\n", asm_out_file);
+ else
+ fputs ("\t.module\tnooddspreg\n", asm_out_file);
+
+#else
#ifdef HAVE_AS_GNU_ATTRIBUTE
{
int attr;
@@ -8996,16 +9880,31 @@ mips_file_start (void)
/* Single-float code, -msingle-float. */
else if (!TARGET_DOUBLE_FLOAT)
attr = 2;
- /* 64-bit FP registers on a 32-bit target, -mips32r2 -mfp64. */
- else if (!TARGET_64BIT && TARGET_FLOAT64)
- attr = 4;
+ /* 64-bit FP registers on a 32-bit target, -mips32r2 -mfp64.
+ Reserved attr=4.
+ This case used 12 callee-saved double-precision registers
+ and is deprecated. */
+ /* 64-bit or 32-bit FP registers on a 32-bit target, -mfpxx. */
+ else if (TARGET_FLOATXX)
+ attr = 5;
+ /* 64-bit FP registers on a 32-bit target, -mfp64 -modd-spreg. */
+ else if (mips_abi == ABI_32 && TARGET_FLOAT64 && TARGET_ODD_SPREG)
+ attr = 6;
+ /* 64-bit FP registers on a 32-bit target, -mfp64 -mno-odd-spreg. */
+ else if (mips_abi == ABI_32 && TARGET_FLOAT64)
+ attr = 7;
/* Regular FP code, FP regs same size as GP regs, -mdouble-float. */
else
attr = 1;
fprintf (asm_out_file, "\t.gnu_attribute 4, %d\n", attr);
+
+ /* 128-bit MSA. */
+ if (TARGET_MSA)
+ fprintf (asm_out_file, "\t.gnu_attribute 8, 1\n");
}
#endif
+#endif
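/* Editor's illustration (not part of the patch): the Tag_GNU_MIPS_ABI_FP
   (.gnu_attribute 4) value chosen above, as a stand-alone mapping.  Only
   the cases visible in this hunk are modelled; the flag fields are
   simplified stand-ins for the TARGET_* macros used in mips_file_start.  */
#include <stdio.h>

struct fp_flags
{
  int no_float, single_float, floatxx, o32_fp64, odd_spreg;
};

static int
gnu_attr_fp (struct fp_flags f)
{
  if (f.no_float)                return 0;  /* -mno-float */
  if (f.single_float)            return 2;  /* -msingle-float */
  if (f.floatxx)                 return 5;  /* -mfpxx */
  if (f.o32_fp64 && f.odd_spreg) return 6;  /* o32 -mfp64 */
  if (f.o32_fp64)                return 7;  /* o32 -mfp64 -mno-odd-spreg */
  return 1;                                 /* plain -mdouble-float */
}

int
main (void)
{
  struct fp_flags fpxx = { 0, 0, 1, 0, 1 };
  printf (".gnu_attribute 4, %d\n", gnu_attr_fp (fpxx)); /* 5 */
  return 0;
}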
/* If TARGET_ABICALLS, tell GAS to generate -KPIC code. */
if (TARGET_ABICALLS)
@@ -9805,7 +10704,8 @@ mips_must_initialize_gp_p (void)
static bool
mips_interrupt_extra_call_saved_reg_p (unsigned int regno)
{
- if (MD_REG_P (regno))
+ if ((ISA_HAS_HILO || TARGET_DSP)
+ && MD_REG_P (regno))
return true;
if (TARGET_DSP && DSP_ACC_REG_P (regno))
@@ -10016,10 +10916,8 @@ mips_compute_frame_info (void)
/* Set this function's interrupt properties. */
if (mips_interrupt_type_p (TREE_TYPE (current_function_decl)))
{
- if (!ISA_MIPS32R2)
- error ("the %<interrupt%> attribute requires a MIPS32r2 processor");
- else if (TARGET_HARD_FLOAT)
- error ("the %<interrupt%> attribute requires %<-msoft-float%>");
+ if (mips_isa_rev < 2)
+ error ("the %<interrupt%> attribute requires a MIPS32r2 processor or greater");
else if (TARGET_MIPS16)
error ("interrupt handlers cannot be MIPS16 functions");
else
@@ -10451,7 +11349,7 @@ typedef void (*mips_save_restore_fn) (rtx, rtx);
stack pointer. */
static void
-mips_save_restore_reg (enum machine_mode mode, int regno,
+mips_save_restore_reg (machine_mode mode, int regno,
HOST_WIDE_INT offset, mips_save_restore_fn fn)
{
rtx mem;
@@ -10494,7 +11392,9 @@ mips_for_each_saved_acc (HOST_WIDE_INT sp_offset, mips_save_restore_fn fn)
static void
mips_save_reg (rtx reg, rtx mem)
{
- if (GET_MODE (reg) == DFmode && !TARGET_FLOAT64)
+ if (GET_MODE (reg) == DFmode
+ && (!TARGET_FLOAT64
+ || mips_abi == ABI_32))
{
rtx x1, x2;
@@ -10617,7 +11517,7 @@ static void
mips_for_each_saved_gpr_and_fpr (HOST_WIDE_INT sp_offset,
mips_save_restore_fn fn)
{
- enum machine_mode fpr_mode;
+ machine_mode fpr_mode;
int regno;
const struct mips_frame_info *frame = &cfun->machine->frame;
HOST_WIDE_INT offset;
@@ -10652,7 +11552,16 @@ mips_for_each_saved_gpr_and_fpr (HOST_WIDE_INT sp_offset,
regno -= MAX_FPRS_PER_FMT)
if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
{
- mips_save_restore_reg (fpr_mode, regno, offset, fn);
+ if (!TARGET_FLOAT64 && TARGET_DOUBLE_FLOAT
+ && (fixed_regs[regno] || fixed_regs[regno + 1]))
+ {
+ if (fixed_regs[regno])
+ mips_save_restore_reg (SFmode, regno + 1, offset, fn);
+ else
+ mips_save_restore_reg (SFmode, regno, offset, fn);
+ }
+ else
+ mips_save_restore_reg (fpr_mode, regno, offset, fn);
offset -= GET_MODE_SIZE (fpr_mode);
}
}
@@ -11247,6 +12156,14 @@ mips_expand_prologue (void)
GEN_INT (5),
GEN_INT (SR_IE),
gen_rtx_REG (SImode, GP_REG_FIRST)));
+
+ if (TARGET_HARD_FLOAT)
+ /* Disable COP1 for hard-float. This will lead to an exception
+ if floating-point code is executed in an ISR. */
+ emit_insn (gen_insvsi (gen_rtx_REG (SImode, K1_REG_NUM),
+ GEN_INT (1),
+ GEN_INT (SR_COP1),
+ gen_rtx_REG (SImode, GP_REG_FIRST)));
}
else
{
@@ -11419,7 +12336,9 @@ mips_restore_reg (rtx reg, rtx mem)
$7 instead and adjust the return insn appropriately. */
if (TARGET_MIPS16 && REGNO (reg) == RETURN_ADDR_REGNUM)
reg = gen_rtx_REG (GET_MODE (reg), GP_REG_FIRST + 7);
- else if (GET_MODE (reg) == DFmode && !TARGET_FLOAT64)
+ else if (GET_MODE (reg) == DFmode
+ && (!TARGET_FLOAT64
+ || mips_abi == ABI_32))
{
mips_add_cfa_restore (mips_subword (reg, true));
mips_add_cfa_restore (mips_subword (reg, false));
@@ -11740,7 +12659,7 @@ mips_can_use_return_insn (void)
The result of this function is cached in mips_hard_regno_mode_ok. */
static bool
-mips_hard_regno_mode_ok_p (unsigned int regno, enum machine_mode mode)
+mips_hard_regno_mode_ok_p (unsigned int regno, machine_mode mode)
{
unsigned int size;
enum mode_class mclass;
@@ -11761,13 +12680,31 @@ mips_hard_regno_mode_ok_p (unsigned int regno, enum machine_mode mode)
size = GET_MODE_SIZE (mode);
mclass = GET_MODE_CLASS (mode);
- if (GP_REG_P (regno))
+ if (GP_REG_P (regno) && mode != CCFmode && !MSA_SUPPORTED_MODE_P (mode))
return ((regno - GP_REG_FIRST) & 1) == 0 || size <= UNITS_PER_WORD;
+ /* For MSA, allow TImode and 128-bit vector modes in all FPR. */
+ if (FP_REG_P (regno) && MSA_SUPPORTED_MODE_P (mode))
+ return true;
+
if (FP_REG_P (regno)
&& (((regno - FP_REG_FIRST) % MAX_FPRS_PER_FMT) == 0
|| (MIN_FPRS_PER_FMT == 1 && size <= UNITS_PER_FPREG)))
{
+ /* Deny use of odd-numbered registers for 32-bit data for
+ the o32 FP64A ABI. */
+ if (TARGET_O32_FP64A_ABI && size <= 4 && (regno & 1) != 0)
+ return false;
+
+ /* Prevent the use of odd-numbered registers for CCFmode with the
+ o32 FPXX ABI, otherwise allow them.
+ The FPXX ABI does not permit double-precision data to be placed
+ in odd-numbered registers and double-precision compares write
+ them as 64-bit values. Without this restriction the R6 FPXX
+ ABI would not be able to execute in FR=1 FRE=1 mode. */
+ if (mode == CCFmode && ISA_HAS_CCF)
+ return !(TARGET_FLOATXX && (regno & 1) != 0);
+
/* Allow 64-bit vector modes for Loongson-2E/2F. */
if (TARGET_LOONGSON_VECTORS
&& (mode == V2SImode
@@ -11789,7 +12726,9 @@ mips_hard_regno_mode_ok_p (unsigned int regno, enum machine_mode mode)
return size >= MIN_UNITS_PER_WORD && size <= UNITS_PER_FPREG;
}
+ /* Don't allow MSA vector modes in accumulators. */
if (ACC_REG_P (regno)
+ && !MSA_SUPPORTED_MODE_P (mode)
&& (INTEGRAL_MODE_P (mode) || ALL_FIXED_POINT_MODE_P (mode)))
{
if (MD_REG_P (regno))
@@ -11830,7 +12769,7 @@ mips_hard_regno_mode_ok_p (unsigned int regno, enum machine_mode mode)
/* Implement HARD_REGNO_NREGS. */
unsigned int
-mips_hard_regno_nregs (int regno, enum machine_mode mode)
+mips_hard_regno_nregs (int regno, machine_mode mode)
{
if (ST_REG_P (regno))
/* The size of FP status registers is always 4, because they only hold
@@ -11838,7 +12777,12 @@ mips_hard_regno_nregs (int regno, enum machine_mode mode)
return (GET_MODE_SIZE (mode) + 3) / 4;
if (FP_REG_P (regno))
- return (GET_MODE_SIZE (mode) + UNITS_PER_FPREG - 1) / UNITS_PER_FPREG;
+ {
+ if (MSA_SUPPORTED_MODE_P (mode))
+ return 1;
+
+ return (GET_MODE_SIZE (mode) + UNITS_PER_FPREG - 1) / UNITS_PER_FPREG;
+ }
/* All other registers are word-sized. */
return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
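/* Editor's illustration (not part of the patch): register counts implied by
   the change above, assuming 64-bit FPRs (UNITS_PER_FPREG == 8) and 128-bit
   MSA registers that overlay the FPRs, so any MSA mode fits in one register.  */
#include <stdio.h>

#define UNITS_PER_FPREG 8   /* assumed -mfp64 configuration */

static unsigned
fpr_nregs (unsigned mode_size, int msa_mode_p)
{
  if (msa_mode_p)
    return 1;  /* one 128-bit MSA register, regardless of mode size */
  return (mode_size + UNITS_PER_FPREG - 1) / UNITS_PER_FPREG;
}

int
main (void)
{
  printf ("DFmode   (8 bytes):  %u FPR(s)\n", fpr_nregs (8, 0));   /* 1 */
  printf ("V4SImode (16 bytes): %u FPR(s)\n", fpr_nregs (16, 1));  /* 1 */
  printf ("TFmode   (16 bytes): %u FPR(s)\n", fpr_nregs (16, 0));  /* 2 */
  return 0;
}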
@@ -11848,7 +12792,7 @@ mips_hard_regno_nregs (int regno, enum machine_mode mode)
in mips_hard_regno_nregs. */
int
-mips_class_max_nregs (enum reg_class rclass, enum machine_mode mode)
+mips_class_max_nregs (enum reg_class rclass, machine_mode mode)
{
int size;
HARD_REG_SET left;
@@ -11858,13 +12802,25 @@ mips_class_max_nregs (enum reg_class rclass, enum machine_mode mode)
if (hard_reg_set_intersect_p (left, reg_class_contents[(int) ST_REGS]))
{
if (HARD_REGNO_MODE_OK (ST_REG_FIRST, mode))
- size = MIN (size, 4);
+ {
+ if (MSA_SUPPORTED_MODE_P (mode))
+ size = MIN (size, UNITS_PER_MSA_REG);
+ else
+ size = MIN (size, UNITS_PER_FPREG);
+ }
+
AND_COMPL_HARD_REG_SET (left, reg_class_contents[(int) ST_REGS]);
}
if (hard_reg_set_intersect_p (left, reg_class_contents[(int) FP_REGS]))
{
if (HARD_REGNO_MODE_OK (FP_REG_FIRST, mode))
- size = MIN (size, UNITS_PER_FPREG);
+ {
+ if (MSA_SUPPORTED_MODE_P (mode))
+ size = MIN (size, UNITS_PER_MSA_REG);
+ else
+ size = MIN (size, UNITS_PER_FPREG);
+ }
+
AND_COMPL_HARD_REG_SET (left, reg_class_contents[(int) FP_REGS]);
}
if (!hard_reg_set_empty_p (left))
@@ -11875,8 +12831,8 @@ mips_class_max_nregs (enum reg_class rclass, enum machine_mode mode)
/* Implement CANNOT_CHANGE_MODE_CLASS. */
bool
-mips_cannot_change_mode_class (enum machine_mode from,
- enum machine_mode to,
+mips_cannot_change_mode_class (machine_mode from,
+ machine_mode to,
enum reg_class rclass)
{
/* Allow conversions between different Loongson integer vectors,
@@ -11885,6 +12841,10 @@ mips_cannot_change_mode_class (enum machine_mode from,
&& INTEGRAL_MODE_P (from) && INTEGRAL_MODE_P (to))
return false;
+ /* Allow conversions between different MSA vector modes and TImode. */
+ if (MSA_SUPPORTED_MODE_P (from) && MSA_SUPPORTED_MODE_P (to))
+ return false;
+
/* Otherwise, there are several problems with changing the modes of
values in floating-point registers:
@@ -11914,19 +12874,21 @@ mips_cannot_change_mode_class (enum machine_mode from,
/* Implement target hook small_register_classes_for_mode_p. */
static bool
-mips_small_register_classes_for_mode_p (enum machine_mode mode
+mips_small_register_classes_for_mode_p (machine_mode mode
ATTRIBUTE_UNUSED)
{
return TARGET_MIPS16;
}
-/* Return true if moves in mode MODE can use the FPU's mov.fmt instruction. */
+/* Return true if moves in mode MODE can use the FPU's mov.fmt instruction,
+ or use the MSA's move.v instruction. */
static bool
-mips_mode_ok_for_mov_fmt_p (enum machine_mode mode)
+mips_mode_ok_for_mov_fmt_p (machine_mode mode)
{
switch (mode)
{
+ case CCFmode:
case SFmode:
return TARGET_HARD_FLOAT;
@@ -11937,14 +12899,14 @@ mips_mode_ok_for_mov_fmt_p (enum machine_mode mode)
return TARGET_HARD_FLOAT && TARGET_PAIRED_SINGLE_FLOAT;
default:
- return false;
+ return MSA_SUPPORTED_MODE_P (mode);
}
}
/* Implement MODES_TIEABLE_P. */
bool
-mips_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
+mips_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
/* FPRs allow no mode punning, so it's not worth tying modes if we'd
prefer to put one of them in FPRs. */
@@ -11994,13 +12956,12 @@ mips_canonicalize_move_class (reg_class_t rclass)
return rclass;
}
-/* Return the cost of moving a value of mode MODE from a register of
- class FROM to a GPR. Return 0 for classes that are unions of other
- classes handled by this function. */
+/* Return the cost of moving a value from a register of class FROM to a GPR.
+ Return 0 for classes that are unions of other classes handled by this
+ function. */
static int
-mips_move_to_gpr_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
- reg_class_t from)
+mips_move_to_gpr_cost (machine_mode mode, reg_class_t from)
{
switch (from)
{
@@ -12017,10 +12978,6 @@ mips_move_to_gpr_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
/* MFC1, etc. */
return 4;
- case ST_REGS:
- /* LUI followed by MOVF. */
- return 4;
-
case COP0_REGS:
case COP2_REGS:
case COP3_REGS:
@@ -12032,12 +12989,12 @@ mips_move_to_gpr_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
}
}
-/* Return the cost of moving a value of mode MODE from a GPR to a
- register of class TO. Return 0 for classes that are unions of
- other classes handled by this function. */
+/* Return the cost of moving a value from a GPR to a register of class TO.
+ Return 0 for classes that are unions of other classes handled by this
+ function. */
static int
-mips_move_from_gpr_cost (enum machine_mode mode, reg_class_t to)
+mips_move_from_gpr_cost (machine_mode mode, reg_class_t to)
{
switch (to)
{
@@ -12054,11 +13011,6 @@ mips_move_from_gpr_cost (enum machine_mode mode, reg_class_t to)
/* MTC1, etc. */
return 4;
- case ST_REGS:
- /* A secondary reload through an FPR scratch. */
- return (mips_register_move_cost (mode, GENERAL_REGS, FP_REGS)
- + mips_register_move_cost (mode, FP_REGS, ST_REGS));
-
case COP0_REGS:
case COP2_REGS:
case COP3_REGS:
@@ -12075,7 +13027,7 @@ mips_move_from_gpr_cost (enum machine_mode mode, reg_class_t to)
the maximum for us. */
static int
-mips_register_move_cost (enum machine_mode mode,
+mips_register_move_cost (machine_mode mode,
reg_class_t from, reg_class_t to)
{
reg_class_t dregs;
@@ -12090,9 +13042,6 @@ mips_register_move_cost (enum machine_mode mode,
if (to == FP_REGS && mips_mode_ok_for_mov_fmt_p (mode))
/* MOV.FMT. */
return 4;
- if (to == ST_REGS)
- /* The sequence generated by mips_expand_fcc_reload. */
- return 8;
}
/* Handle cases in which only one class deviates from the ideal. */
@@ -12114,14 +13063,45 @@ mips_register_move_cost (enum machine_mode mode,
return 0;
}
+/* Implement TARGET_REGISTER_PRIORITY. */
+
+static int
+mips_register_priority (int hard_regno)
+{
+ /* Treat MIPS16 registers with higher priority than other regs. */
+ if (TARGET_MIPS16
+ && TEST_HARD_REG_BIT (reg_class_contents[M16_REGS], hard_regno))
+ return 1;
+ return 0;
+}
+
/* Implement TARGET_MEMORY_MOVE_COST. */
static int
-mips_memory_move_cost (enum machine_mode mode, reg_class_t rclass, bool in)
+mips_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in)
{
return (mips_cost->memory_latency
+ memory_move_secondary_cost (mode, rclass, in));
-}
+}
+
+/* Implement SECONDARY_MEMORY_NEEDED. */
+
+bool
+mips_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
+ machine_mode mode)
+{
+ /* Ignore spilled pseudos. */
+ if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
+ return false;
+
+ if (((class1 == FP_REGS) != (class2 == FP_REGS))
+ && ((TARGET_FLOATXX && !ISA_HAS_MXHC1)
+ || TARGET_O32_FP64A_ABI)
+ && GET_MODE_SIZE (mode) >= 8)
+ return true;
+
+ return false;
+}
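/* Editor's illustration (not part of the patch): when the new hook above
   forces a 64-bit FPR<->GPR move through memory.  The flags are stand-ins
   for TARGET_FLOATXX, ISA_HAS_MXHC1 and TARGET_O32_FP64A_ABI.  */
#include <stdbool.h>
#include <stdio.h>

static bool
needs_secondary_memory (bool crosses_fp_gpr, unsigned mode_size,
                        bool floatxx, bool has_mxhc1, bool o32_fp64a)
{
  return crosses_fp_gpr
         && ((floatxx && !has_mxhc1) || o32_fp64a)
         && mode_size >= 8;
}

int
main (void)
{
  /* DFmode FPR<->GPR copy under FPXX on a core without mxhc1.  */
  printf ("%d\n", needs_secondary_memory (true, 8, true, false, false)); /* 1 */
  /* The same copy when mxhc1 is available: direct moves suffice.  */
  printf ("%d\n", needs_secondary_memory (true, 8, true, true, false));  /* 0 */
  return 0;
}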
/* Return the register class required for a secondary register when
copying between one of the registers in RCLASS and value X, which
@@ -12131,7 +13111,7 @@ mips_memory_move_cost (enum machine_mode mode, reg_class_t rclass, bool in)
enum reg_class
mips_secondary_reload_class (enum reg_class rclass,
- enum machine_mode mode, rtx x, bool in_p)
+ machine_mode mode, rtx x, bool)
{
int regno;
@@ -12157,31 +13137,19 @@ mips_secondary_reload_class (enum reg_class rclass,
if (ACC_REG_P (regno))
return reg_class_subset_p (rclass, GR_REGS) ? NO_REGS : GR_REGS;
- /* We can only copy a value to a condition code register from a
- floating-point register, and even then we require a scratch
- floating-point register. We can only copy a value out of a
- condition-code register into a general register. */
- if (reg_class_subset_p (rclass, ST_REGS))
- {
- if (in_p)
- return FP_REGS;
- return GP_REG_P (regno) ? NO_REGS : GR_REGS;
- }
- if (ST_REG_P (regno))
- {
- if (!in_p)
- return FP_REGS;
- return reg_class_subset_p (rclass, GR_REGS) ? NO_REGS : GR_REGS;
- }
-
if (reg_class_subset_p (rclass, FP_REGS))
{
- if (MEM_P (x)
- && (GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8))
+ if (regno < 0
+ || (MEM_P (x)
+ && (GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)))
/* In this case we can use lwc1, swc1, ldc1 or sdc1. We'll use
pairs of lwc1s and swc1s if ldc1 and sdc1 are not supported. */
return NO_REGS;
+ if (MEM_P (x) && MSA_SUPPORTED_MODE_P (mode))
+ /* In this case we can use MSA LD.* and ST.*. */
+ return NO_REGS;
+
if (GP_REG_P (regno) || x == CONST0_RTX (mode))
/* In this case we can use mtc1, mfc1, dmtc1 or dmfc1. */
return NO_REGS;
@@ -12208,7 +13176,7 @@ mips_secondary_reload_class (enum reg_class rclass,
/* Implement TARGET_MODE_REP_EXTENDED. */
static int
-mips_mode_rep_extended (enum machine_mode mode, enum machine_mode mode_rep)
+mips_mode_rep_extended (machine_mode mode, machine_mode mode_rep)
{
/* On 64-bit targets, SImode register values are sign-extended to DImode. */
if (TARGET_64BIT && mode == SImode && mode_rep == DImode)
@@ -12220,7 +13188,7 @@ mips_mode_rep_extended (enum machine_mode mode, enum machine_mode mode_rep)
/* Implement TARGET_VALID_POINTER_MODE. */
static bool
-mips_valid_pointer_mode (enum machine_mode mode)
+mips_valid_pointer_mode (machine_mode mode)
{
return mode == SImode || (TARGET_64BIT && mode == DImode);
}
@@ -12228,7 +13196,7 @@ mips_valid_pointer_mode (enum machine_mode mode)
/* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
static bool
-mips_vector_mode_supported_p (enum machine_mode mode)
+mips_vector_mode_supported_p (machine_mode mode)
{
switch (mode)
{
@@ -12251,14 +13219,14 @@ mips_vector_mode_supported_p (enum machine_mode mode)
return TARGET_LOONGSON_VECTORS;
default:
- return false;
+ return MSA_SUPPORTED_MODE_P (mode);
}
}
/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */
static bool
-mips_scalar_mode_supported_p (enum machine_mode mode)
+mips_scalar_mode_supported_p (machine_mode mode)
{
if (ALL_FIXED_POINT_MODE_P (mode)
&& GET_MODE_PRECISION (mode) <= 2 * BITS_PER_WORD)
@@ -12269,15 +13237,45 @@ mips_scalar_mode_supported_p (enum machine_mode mode)
/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE. */
-static enum machine_mode
-mips_preferred_simd_mode (enum machine_mode mode ATTRIBUTE_UNUSED)
+static machine_mode
+mips_preferred_simd_mode (machine_mode mode)
{
if (TARGET_PAIRED_SINGLE_FLOAT
&& mode == SFmode)
return V2SFmode;
+
+ if (! TARGET_MSA)
+ return word_mode;
+
+ switch (mode)
+ {
+ case QImode:
+ return V16QImode;
+ case HImode:
+ return V8HImode;
+ case SImode:
+ return V4SImode;
+ case DImode:
+ return V2DImode;
+
+ case SFmode:
+ return V4SFmode;
+
+ case DFmode:
+ return V2DFmode;
+
+ default:
+ break;
+ }
return word_mode;
}
+static unsigned int
+mips_autovectorize_vector_sizes (void)
+{
+ return TARGET_MSA ? 16 : 0;
+}
+
/* Implement TARGET_INIT_LIBFUNCS. */
static void
@@ -12474,6 +13472,7 @@ mips_adjust_insn_length (rtx insn, int length)
break;
case HAZARD_DELAY:
+ case HAZARD_FORBIDDEN_SLOT:
length += NOP_INSN_LENGTH;
break;
@@ -12485,6 +13484,78 @@ mips_adjust_insn_length (rtx insn, int length)
return length;
}
+/* Return the asm template for a call. OPERANDS are the operands, TARGET_OPNO
+ is the operand number of the target. SIZE_OPNO is the operand number of
+ the argument size operand that can optionally hold the call attributes. If
+ SIZE_OPNO is not -1 and the call is indirect, use the function symbol from
+ the call attributes to attach a R_MIPS_JALR relocation to the call.
+
+ When generating GOT code without explicit relocation operators, all calls
+ should use assembly macros. Otherwise, all indirect calls should use "jr"
+ or "jalr"; we will arrange to restore $gp afterwards if necessary. Finally,
+ we can only generate direct calls for -mabicalls by temporarily switching
+ to non-PIC mode.
+
+ For microMIPS jal(r), we try to generate jal(r)s when a 16-bit
+ instruction is in the delay slot of jal(r).
+
+ Where compact branches are available, we try to use them if the delay slot
+ has a NOP (or equivalently delay slots were not enabled for the instruction
+ anyway). */
+
+const char *
+mips_output_jump (rtx *operands, int target_opno, int size_opno, bool link_p)
+{
+ static char buffer[300];
+ char *s = buffer;
+ bool reg_p = REG_P (operands[target_opno]);
+
+ const char *and_link = link_p ? "al" : "";
+ const char *reg = reg_p ? "r" : "";
+ const char *compact = "";
+ const char *nop = "%/";
+ const char *short_delay = link_p ? "%!" : "";
+ const char *insn_name = TARGET_CB_NEVER || reg_p ? "j" : "b";
+
+ /* Compact branches can only be described when the ISA has support for them
+ as both the compact formatter '%:' and the delay slot NOP formatter '%/'
+ work as a mutually exclusive pair. I.e. a NOP is never required if a
+ compact form is available. */
+ if (!final_sequence
+ && (TARGET_CB_MAYBE
+ || (ISA_HAS_JRC && !link_p && reg_p)))
+ {
+ compact = "c";
+ nop = "";
+ }
+
+
+ if (TARGET_USE_GOT && !TARGET_EXPLICIT_RELOCS)
+ sprintf (s, "%%*%s%s\t%%%d%%/", insn_name, link_p ? "al" : "", target_opno);
+ else
+ {
+ if (!reg_p && TARGET_ABICALLS_PIC2)
+ s += sprintf (s, ".option\tpic0\n\t");
+
+ if (reg_p && mips_get_pic_call_symbol (operands, size_opno))
+ {
+ s += sprintf (s, "%%*.reloc\t1f,R_MIPS_JALR,%%%d\n1:\t", size_opno);
+	  /* It is not clear why a short delay slot should not be permitted
+	     here, but it was never allowed before, so keep disallowing it.  */
+ short_delay = "";
+ }
+ else
+ s += sprintf (s, "%%*");
+
+ s += sprintf (s, "%s%s%s%s%s\t%%%d%s", insn_name, and_link, reg, compact, short_delay,
+ target_opno, nop);
+
+ if (!reg_p && TARGET_ABICALLS_PIC2)
+ s += sprintf (s, "\n\t.option\tpic2");
+ }
+ return buffer;
+}
+
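/* Editor's illustration (not part of the patch): roughly what
   mips_output_jump assembles on the non-GOT, non-JALR-reloc path.  The
   compact_p flag is a stand-in for "a compact form is used", i.e.
   TARGET_CB_MAYBE (or jrc) with an empty delay slot.  */
#include <stdio.h>

static void
show_template (int link_p, int reg_p, int compact_p)
{
  const char *insn = (!compact_p || reg_p) ? "j" : "b";
  const char *al   = link_p ? "al" : "";
  const char *r    = reg_p ? "r" : "";
  const char *c    = compact_p ? "c" : "";
  const char *nop  = compact_p ? "" : "%/";  /* delay-slot NOP only without a compact form */
  printf ("%%*%s%s%s%s\t%%0%s\n", insn, al, r, c, nop);
}

int
main (void)
{
  show_template (1, 0, 0);  /* %*jal   %0%/  classic direct call */
  show_template (1, 1, 0);  /* %*jalr  %0%/  classic indirect call */
  show_template (1, 0, 1);  /* %*balc  %0    R6 compact direct call */
  show_template (1, 1, 1);  /* %*jalrc %0    R6 compact indirect call */
  return 0;
}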
/* Return the assembly code for INSN, which has the operands given by
OPERANDS, and which branches to OPERANDS[0] if some condition is true.
BRANCH_IF_TRUE is the asm template that should be used if OPERANDS[0]
@@ -12505,7 +13576,7 @@ mips_output_conditional_branch (rtx insn, rtx *operands,
if (length <= 8)
{
/* Just a simple conditional branch. */
- mips_branch_likely = (final_sequence && INSN_ANNULLED_BRANCH_P (insn));
+ mips_branch_likely = final_sequence && INSN_ANNULLED_BRANCH_P (insn);
return branch_if_true;
}
@@ -12538,12 +13609,25 @@ mips_output_conditional_branch (rtx insn, rtx *operands,
}
/* Output the unconditional branch to TAKEN. */
- if (TARGET_ABSOLUTE_JUMPS)
+ if (TARGET_ABSOLUTE_JUMPS && TARGET_CB_MAYBE)
+ {
+ /* Add a hazard nop. */
+ if (!final_sequence)
+ {
+ output_asm_insn ("nop\t\t# hazard nop", 0);
+ fprintf (asm_out_file, "\n");
+ }
+ output_asm_insn (MIPS_ABSOLUTE_JUMP ("bc\t%0"), &taken);
+ }
+ else if (TARGET_ABSOLUTE_JUMPS)
output_asm_insn (MIPS_ABSOLUTE_JUMP ("j\t%0%/"), &taken);
else
{
mips_output_load_label (taken);
- output_asm_insn ("jr\t%@%]%/", 0);
+ if (TARGET_CB_MAYBE)
+ output_asm_insn ("jrc\t%@%]", 0);
+ else
+ output_asm_insn ("jr\t%@%]%/", 0);
}
/* Now deal with its delay slot; see above. */
@@ -12557,7 +13641,7 @@ mips_output_conditional_branch (rtx insn, rtx *operands,
asm_out_file, optimize, 1, NULL);
INSN_DELETED_P (XVECEXP (final_sequence, 0, 1)) = 1;
}
- else
+ else if (TARGET_CB_NEVER)
output_asm_insn ("nop", 0);
fprintf (asm_out_file, "\n");
}
@@ -12568,6 +13652,58 @@ mips_output_conditional_branch (rtx insn, rtx *operands,
return "";
}
+const char *
+mips_output_equal_conditional_branch (rtx insn, rtx *operands, bool inverted_p)
+{
+ const char *branch[2];
+ /* For a simple BNEZ or BEQZ microMIPSr3 branch. */
+ if (TARGET_MICROMIPS
+ && mips_isa_rev <= 5
+ && operands[3] == const0_rtx
+ && get_attr_length (insn) <= 8)
+ {
+ if (mips_cb == MIPS_CB_OPTIMAL)
+ {
+ branch[!inverted_p] = "%*b%C1z%:\t%2,%0";
+ branch[inverted_p] = "%*b%N1z%:\t%2,%0";
+ }
+ else
+ {
+ branch[!inverted_p] = "%*b%C1z\t%2,%0%/";
+ branch[inverted_p] = "%*b%N1z\t%2,%0%/";
+ }
+ }
+ else if (TARGET_CB_MAYBE)
+ {
+ if (operands[3] == const0_rtx)
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("b%C1z", "%2,%0");
+ branch[inverted_p] = MIPS_BRANCH_C ("b%N1z", "%2,%0");
+ }
+ else if (REGNO (operands[2]) != REGNO (operands[3]))
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("b%C1", "%2,%3,%0");
+ branch[inverted_p] = MIPS_BRANCH_C ("b%N1", "%2,%3,%0");
+ }
+ else
+ {
+ /* This case is stupid. Fix me. */
+ if (GET_CODE (operands[1]) == NE)
+ inverted_p = !inverted_p;
+
+ branch[!inverted_p] = MIPS_BRANCH_C ("b", "%0");
+ branch[inverted_p] = "%*\t\t# branch never";
+ }
+ }
+ else
+ {
+ branch[!inverted_p] = MIPS_BRANCH ("b%C1", "%2,%z3,%0");
+ branch[inverted_p] = MIPS_BRANCH ("b%N1", "%2,%z3,%0");
+ }
+
+ return mips_output_conditional_branch (insn, operands, branch[1], branch[0]);
+}
+
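/* Editor's illustration (not part of the patch): which template shape the
   routine above selects for an EQ/NE branch.  The microMIPSr3 short form is
   omitted; the strings paraphrase the MIPS_BRANCH/MIPS_BRANCH_C wrappers.  */
#include <stdio.h>

static const char *
equal_branch_template (int cb_maybe, int rhs_is_zero, int same_regs)
{
  if (!cb_maybe)
    return "b%C1\t%2,%z3,%0   (delay-slot form)";
  if (rhs_is_zero)
    return "b%C1z\t%2,%0      (compact, compare against zero)";
  if (!same_regs)
    return "b%C1\t%2,%3,%0    (compact, two-register compare)";
  return "b\t%0 or branch never (degenerate x==x compare)";
}

int
main (void)
{
  printf ("%s\n", equal_branch_template (0, 1, 0));
  printf ("%s\n", equal_branch_template (1, 0, 0));
  return 0;
}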
/* Return the assembly code for INSN, which branches to OPERANDS[0]
if some ordering condition is true. The condition is given by
OPERANDS[1] if !INVERTED_P, otherwise it is the inverse of
@@ -12579,32 +13715,84 @@ mips_output_order_conditional_branch (rtx insn, rtx *operands, bool inverted_p)
{
const char *branch[2];
- /* Make BRANCH[1] branch to OPERANDS[0] when the condition is true.
- Make BRANCH[0] branch on the inverse condition. */
- switch (GET_CODE (operands[1]))
+ if (operands[3] != const0_rtx)
{
- /* These cases are equivalent to comparisons against zero. */
- case LEU:
- inverted_p = !inverted_p;
- /* Fall through. */
- case GTU:
- branch[!inverted_p] = MIPS_BRANCH ("bne", "%2,%.,%0");
- branch[inverted_p] = MIPS_BRANCH ("beq", "%2,%.,%0");
- break;
+ if (REGNO (operands[2]) == REGNO (operands[3]))
+ {
+ switch (GET_CODE (operands[1]))
+ {
+ case LT:
+ case LTU:
+ inverted_p = !inverted_p;
+ /* Fall through. */
+ case GE:
+ case GEU:
+ branch[!inverted_p] = MIPS_BRANCH_C ("b", "%0");
+ branch[inverted_p] = "%*\t\t# branch never";
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ else
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("b%C1", "%2,%3,%0");
+ branch[inverted_p] = MIPS_BRANCH_C ("b%N1", "%2,%3,%0");
+ }
+ }
+ else
+ {
+ /* Make BRANCH[1] branch to OPERANDS[0] when the condition is true.
+ Make BRANCH[0] branch on the inverse condition. */
+ switch (GET_CODE (operands[1]))
+ {
+ /* These cases are equivalent to comparisons against zero. */
+ case LEU:
+ inverted_p = !inverted_p;
+ /* Fall through. */
+ case GTU:
+ if (TARGET_CB_MAYBE)
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("bnez", "%2,%0");
+ branch[inverted_p] = MIPS_BRANCH_C ("beqz", "%2,%0");
+ }
+ else
+ {
+ branch[!inverted_p] = MIPS_BRANCH ("bne", "%2,%.,%0");
+ branch[inverted_p] = MIPS_BRANCH ("beq", "%2,%.,%0");
+ }
+ break;
- /* These cases are always true or always false. */
- case LTU:
- inverted_p = !inverted_p;
- /* Fall through. */
- case GEU:
- branch[!inverted_p] = MIPS_BRANCH ("beq", "%.,%.,%0");
- branch[inverted_p] = MIPS_BRANCH ("bne", "%.,%.,%0");
- break;
+ /* These cases are always true or always false. */
+ case LTU:
+ inverted_p = !inverted_p;
+ /* Fall through. */
+ case GEU:
+ if (TARGET_CB_MAYBE)
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("b", "%0");
+ branch[inverted_p] = "%*\t\t# branch never";
+ }
+ else
+ {
+ branch[!inverted_p] = MIPS_BRANCH ("beq", "%.,%.,%0");
+ branch[inverted_p] = MIPS_BRANCH ("bne", "%.,%.,%0");
+ }
+ break;
- default:
- branch[!inverted_p] = MIPS_BRANCH ("b%C1z", "%2,%0");
- branch[inverted_p] = MIPS_BRANCH ("b%N1z", "%2,%0");
- break;
+ default:
+ if (TARGET_CB_MAYBE)
+ {
+ branch[!inverted_p] = MIPS_BRANCH_C ("b%C1z", "%2,%0");
+ branch[inverted_p] = MIPS_BRANCH_C ("b%N1z", "%2,%0");
+ }
+ else
+ {
+ branch[!inverted_p] = MIPS_BRANCH ("b%C1z", "%2,%0");
+ branch[inverted_p] = MIPS_BRANCH ("b%N1z", "%2,%0");
+ }
+ break;
+ }
}
return mips_output_conditional_branch (insn, operands, branch[1], branch[0]);
}
@@ -12731,7 +13919,7 @@ mips_process_sync_loop (rtx insn, rtx *operands)
is specified. */
#define READ_OPERAND(WHAT, DEFAULT) \
WHAT = mips_get_sync_operand (operands, (int) get_attr_sync_##WHAT (insn), \
- DEFAULT)
+ DEFAULT)
/* Read the memory. */
READ_OPERAND (mem, 0);
@@ -12807,11 +13995,18 @@ mips_process_sync_loop (rtx insn, rtx *operands)
at, oldval, inclusive_mask, NULL);
tmp1 = at;
}
- mips_multi_add_insn ("bne\t%0,%z1,2f", tmp1, required_oldval, NULL);
+ if (TARGET_CB_NEVER)
+ mips_multi_add_insn ("bne\t%0,%z1,2f", tmp1, required_oldval, NULL);
/* CMP = 0 [delay slot]. */
if (cmp)
mips_multi_add_insn ("li\t%0,0", cmp, NULL);
+
+ if (TARGET_CB_MAYBE && required_oldval == const0_rtx)
+ mips_multi_add_insn ("bnezc\t%0,2f", tmp1, NULL);
+ else if (TARGET_CB_MAYBE)
+ mips_multi_add_insn ("bnec\t%0,%1,2f", tmp1, required_oldval, NULL);
+
}
/* $TMP1 = OLDVAL & EXCLUSIVE_MASK. */
@@ -12867,7 +14062,17 @@ mips_process_sync_loop (rtx insn, rtx *operands)
This will sometimes be a delayed branch; see the write code below
for details. */
mips_multi_add_insn (is_64bit_p ? "scd\t%0,%1" : "sc\t%0,%1", at, mem, NULL);
- mips_multi_add_insn ("beq%?\t%0,%.,1b", at, NULL);
+
+ /* When using branch likely (-mfix-r10000), the delay slot instruction
+ will be annulled on false. The normal delay slot instructions
+ calculate the overall result of the atomic operation and must not
+ be annulled. To ensure this behaviour unconditionally use a NOP
+ in the delay slot for the branch likely case. */
+
+ if (TARGET_CB_MAYBE)
+ mips_multi_add_insn ("beqzc\t%0,1b", at, NULL);
+ else
+ mips_multi_add_insn ("beq%?\t%0,%.,1b%~", at, NULL);
/* if (INSN1 != MOVE && INSN1 != LI) NEWVAL = $TMP3 [delay slot]. */
if (insn1 != SYNC_INSN1_MOVE && insn1 != SYNC_INSN1_LI && tmp3 != newval)
@@ -12875,7 +14080,7 @@ mips_process_sync_loop (rtx insn, rtx *operands)
mips_multi_copy_insn (tmp3_insn);
mips_multi_set_operand (mips_multi_last_index (), 0, newval);
}
- else if (!(required_oldval && cmp))
+ else if (!(required_oldval && cmp) && !mips_branch_likely)
mips_multi_add_insn ("nop", NULL);
/* CMP = 1 -- either standalone or in a delay slot. */
@@ -12899,12 +14104,12 @@ mips_process_sync_loop (rtx insn, rtx *operands)
const char *
mips_output_sync_loop (rtx insn, rtx *operands)
{
- mips_process_sync_loop (insn, operands);
-
/* Use branch-likely instructions to work around the LL/SC R10000
errata. */
mips_branch_likely = TARGET_FIX_R10000;
+ mips_process_sync_loop (insn, operands);
+
mips_push_asm_switch (&mips_noreorder);
mips_push_asm_switch (&mips_nomacro);
mips_push_asm_switch (&mips_noat);
@@ -12926,6 +14131,9 @@ mips_output_sync_loop (rtx insn, rtx *operands)
unsigned int
mips_sync_loop_insns (rtx insn, rtx *operands)
{
+ /* Use branch-likely instructions to work around the LL/SC R10000
+ errata. */
+ mips_branch_likely = TARGET_FIX_R10000;
mips_process_sync_loop (insn, operands);
return mips_multi_num_insns;
}
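/* Editor's illustration (not part of the patch): roughly the behaviour that
   the LL/SC compare-and-swap loops emitted here implement, expressed with
   C11 atomics.  */
#include <stdatomic.h>
#include <stdio.h>

int
main (void)
{
  _Atomic int v = 5;
  int expected = 5;

  /* Succeeds: v becomes 9, like the ll/compare/sc loop with a matching
     required_oldval.  */
  int ok = atomic_compare_exchange_strong (&v, &expected, 9);
  printf ("ok=%d v=%d\n", ok, atomic_load (&v));  /* ok=1 v=9 */

  expected = 5;
  /* Fails: the loop branches out at the compare, v is unchanged and
     expected is updated to the observed value.  */
  ok = atomic_compare_exchange_strong (&v, &expected, 7);
  printf ("ok=%d v=%d expected=%d\n", ok, atomic_load (&v), expected); /* ok=0 v=9 expected=9 */
  return 0;
}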
@@ -13058,7 +14266,47 @@ mips_output_division (const char *division, rtx *operands)
}
return s;
}
+
+const char *
+mips_msa_output_division (const char *division, rtx *operands)
+{
+ const char *s;
+
+ s = division;
+ if (TARGET_CHECK_ZERO_DIV)
+ {
+ output_asm_insn ("%(bnz.%v0\t%w2,1f", operands);
+ output_asm_insn (s, operands);
+ s = "break\t7%)\n1:";
+ }
+ return s;
+}
+/* Return true if destination of IN_INSN is used as add source in
+ OUT_INSN. Both IN_INSN and OUT_INSN are of type fmadd. Example:
+ madd.s dst, x, y, z
+ madd.s a, dst, b, c */
+
+bool
+mips_fmadd_bypass (rtx out_insn, rtx in_insn)
+{
+ int dst_reg, src_reg;
+
+ gcc_assert (get_attr_type (in_insn) == TYPE_FMADD);
+ gcc_assert (get_attr_type (out_insn) == TYPE_FMADD);
+
+ extract_insn (in_insn);
+ dst_reg = REG_P (recog_data.operand[0]);
+
+ extract_insn (out_insn);
+ src_reg = REG_P (recog_data.operand[1]);
+
+ if (dst_reg == src_reg)
+ return true;
+
+ return false;
+}
+
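/* Editor's illustration (not part of the patch): the bypass condition that
   the comment above describes, namely the destination register of the first
   madd.s feeding operand 1 of the second, written over plain register
   numbers instead of rtl.  */
#include <stdbool.h>
#include <stdio.h>

struct fmadd { int dest; int src1; };   /* madd.s dest, src1, ...  */

static bool
fmadd_bypass_p (struct fmadd producer, struct fmadd consumer)
{
  return producer.dest == consumer.src1;
}

int
main (void)
{
  struct fmadd a = { /*dest=*/4, /*src1=*/1 };   /* madd.s $f4,$f1,...  */
  struct fmadd b = { /*dest=*/6, /*src1=*/4 };   /* madd.s $f6,$f4,...  */
  printf ("%d\n", fmadd_bypass_p (a, b));  /* 1: $f4 forwards into the add */
  return 0;
}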
/* Return true if IN_INSN is a multiply-add or multiply-subtract
instruction and if OUT_INSN assigns to the accumulator operand. */
@@ -13181,6 +14429,8 @@ mips_issue_rate (void)
case PROCESSOR_R9000:
case PROCESSOR_OCTEON:
case PROCESSOR_OCTEON2:
+ case PROCESSOR_OCTEON3:
+ case PROCESSOR_I6400:
return 2;
case PROCESSOR_SB1:
@@ -13194,6 +14444,7 @@ mips_issue_rate (void)
case PROCESSOR_LOONGSON_2E:
case PROCESSOR_LOONGSON_2F:
case PROCESSOR_LOONGSON_3A:
+ case PROCESSOR_P5600:
return 4;
case PROCESSOR_XLP:
@@ -13329,6 +14580,9 @@ mips_multipass_dfa_lookahead (void)
if (TUNE_OCTEON)
return 2;
+ if (TUNE_P5600 || TUNE_I6400)
+ return 4;
+
return 0;
}
@@ -13579,6 +14833,219 @@ mips_74k_agen_reorder (rtx *ready, int nready)
break;
}
}
+
+/* These functions are called when -msched-weight is set. */
+
+/* Find register born in given X if any. */
+
+static int
+find_reg_born (rtx x)
+{
+ if (GET_CODE (x) == CLOBBER)
+ return 1;
+
+ if (GET_CODE (x) == SET)
+ {
+ if (REG_P (SET_DEST (x)) && reg_mentioned_p (SET_DEST (x), SET_SRC (x)))
+ return 0;
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate register weight for given INSN. */
+
+static int
+get_weight (rtx insn)
+{
+ int weight = 0;
+ rtx x;
+
+ /* Increment weight for each register born here. */
+ x = PATTERN (insn);
+ weight = find_reg_born (x);
+
+ if (GET_CODE (x) == PARALLEL)
+ {
+ int i;
+ for (i = XVECLEN (x, 0) - 1; i >= 0; i--)
+ {
+ x = XVECEXP (PATTERN (insn), 0, i);
+ weight += find_reg_born (x);
+ }
+ }
+
+ /* Decrement weight for each register that dies here. */
+ for (x = REG_NOTES (insn); x; x = XEXP (x, 1))
+ {
+ if (REG_NOTE_KIND (x) == REG_DEAD || REG_NOTE_KIND (x) == REG_UNUSED)
+ {
+ rtx note = XEXP (x, 0);
+ if (REG_P (note))
+ weight--;
+ }
+ }
+ return weight;
+}
+
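/* Editor's illustration (not part of the patch): the register-pressure
   weight computed above is simply births minus deaths for one insn.  A copy
   "r3 = r1 + r2" that kills both sources therefore has weight 1 - 2 = -1,
   while a load of a new value into a fresh register has weight +1.  */
#include <stdio.h>

static int
insn_weight (int regs_born, int regs_dying)
{
  return regs_born - regs_dying;
}

int
main (void)
{
  printf ("r3 = r1 + r2 (r1,r2 die): %d\n", insn_weight (1, 2)); /* -1 */
  printf ("r4 = [mem]              : %d\n", insn_weight (1, 0)); /* +1 */
  return 0;
}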
+/* TARGET_SCHED_WEIGHT helper function.
+ Allocate and initialize global data. */
+
+static void
+mips_weight_init_global (int old_max_uid)
+{
+ level = (int *) xcalloc (old_max_uid, sizeof (int));
+ consumer_luid = (int *) xcalloc (old_max_uid, sizeof (int));
+}
+
+/* Implement TARGET_SCHED_INIT_GLOBAL. */
+
+static void
+mips_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
+ int verbose ATTRIBUTE_UNUSED,
+ int old_max_uid)
+{
+ if (!reload_completed && TARGET_SCHED_WEIGHT)
+ mips_weight_init_global (old_max_uid);
+}
+
+/* TARGET_SCHED_WEIGHT helper function. Called for each basic block
+ with dependency chain information in HEAD and TAIL.
+ Calculates LEVEL for each INSN from its forward dependencies
+ and finds out UID of first consumer instruction (CONSUMER_LUID) of INSN. */
+
+static void
+mips_weight_evaluation (rtx head, rtx tail)
+{
+ sd_iterator_def sd_it;
+ dep_t dep;
+ rtx prev_head, insn, x;
+ prev_head = PREV_INSN (head);
+
+ for (insn = tail; insn != prev_head; insn = PREV_INSN (insn))
+ if (INSN_P (insn))
+ {
+ FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+ {
+ x = DEP_CON (dep);
+ if (! DEBUG_INSN_P (x))
+ {
+ if (LEVEL (x) > LEVEL (insn))
+ LEVEL (insn) = LEVEL (x);
+ CONSUMER_LUID (insn) = INSN_LUID (x);
+ }
+ }
+ LEVEL (insn)++;
+ }
+}
+
+/* Implement TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK. */
+
+static void
+mips_evaluation_hook (rtx head, rtx tail)
+{
+ if (!reload_completed && TARGET_SCHED_WEIGHT)
+ mips_weight_evaluation (head, tail);
+}
+
+/* Implement TARGET_SCHED_SET_SCHED_FLAGS.
+ Enables DONT_BREAK_DEPENDENCIES for the first scheduling pass.
+ It prevents breaking of dependencies on mem/inc pair in the first pass
+ which would otherwise increase stalls. */
+
+static void
+mips_set_sched_flags (spec_info_t spec_info ATTRIBUTE_UNUSED)
+{
+ if (!reload_completed && TARGET_SCHED_WEIGHT)
+ {
+ unsigned int *flags = &(current_sched_info->flags);
+ *flags |= DONT_BREAK_DEPENDENCIES;
+ }
+}
+
+static void
+mips_weight_finish_global ()
+{
+ if (level != NULL)
+ free (level);
+
+ if (consumer_luid != NULL)
+ free (consumer_luid);
+}
+
+/* Implement TARGET_SCHED_FINISH_GLOBAL. */
+
+static void
+mips_sched_finish_global (FILE *dump ATTRIBUTE_UNUSED,
+ int verbose ATTRIBUTE_UNUSED)
+{
+ if (!reload_completed && TARGET_SCHED_WEIGHT)
+ mips_weight_finish_global ();
+}
+
+
+/* This is a TARGET_SCHED_WEIGHT (option -msched-weight) helper function
+ which is called during reordering of instructions in the first pass
+ of the scheduler. The function swaps the instruction at (NREADY - 1)
+ of the READY list with another instruction in READY list as per
+ the following algorithm. The scheduler then picks the instruction
+ at READY[NREADY - 1] and schedules it.
+
+ Every instruction is assigned with a value LEVEL.
+ [See: mips_weight_evaluation().]
+
+ 1. INSN with highest LEVEL is chosen to be scheduled next, ties broken by
+ 1a. Choosing INSN that is used early in the flow or
+ 1b. Choosing INSN with greater INSN_TICK.
+
+ 2. Choose INSN having less LEVEL number iff,
+ 2a. It is used early and
+ 2b. Has greater INSN_TICK and
+ 2c. Contributes less to the register pressure. */
+
+static void
+mips_sched_weight (rtx *ready, int nready)
+{
+ int max_level = LEVEL (ready[nready-1]), toswap = nready-1;
+ int i;
+#define INSN_TICK(INSN) (HID (INSN)->tick)
+
+ for (i = nready - 2; i >= 0; i--)
+ {
+ rtx insn = ready[i];
+ if (LEVEL (insn) == max_level)
+ {
+ if (INSN_PRIORITY (insn) >= INSN_PRIORITY (ready[toswap]))
+ {
+ if (CONSUMER_LUID (insn) < CONSUMER_LUID (ready[toswap]))
+ toswap = i;
+ }
+ else if (INSN_TICK (insn) > INSN_TICK(ready[toswap]))
+ toswap = i;
+ }
+ if (LEVEL (insn) > max_level)
+ {
+ max_level = LEVEL (insn);
+ toswap = i;
+ }
+ if (LEVEL (insn) < max_level)
+ {
+ if (CONSUMER_LUID (insn) < CONSUMER_LUID (ready[toswap])
+ && INSN_TICK (insn) > INSN_TICK(ready[toswap])
+ && get_weight (insn) < get_weight (ready[toswap]))
+ toswap = i;
+ }
+ }
+
+ if (toswap != (nready-1))
+ {
+ rtx temp = ready[nready-1];
+ ready[nready-1] = ready[toswap];
+ ready[toswap] = temp;
+ }
+#undef INSN_TICK
+}
+
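/* Editor's illustration (not part of the patch): how the -msched-weight
   reorder above picks the next instruction from a toy ready list.  The
   selection is simplified: only the LEVEL and first-consumer tie-break are
   modelled, the INSN_TICK and weight tie-breaks of the real code are
   omitted.  */
#include <stdio.h>

struct toy { const char *name; int level; int consumer_luid; int priority; };

static int
pick_next (struct toy *ready, int nready)
{
  int toswap = nready - 1, max_level = ready[toswap].level;
  for (int i = nready - 2; i >= 0; i--)
    {
      if (ready[i].level == max_level
          && ready[i].priority >= ready[toswap].priority
          && ready[i].consumer_luid < ready[toswap].consumer_luid)
        toswap = i;
      else if (ready[i].level > max_level)
        {
          max_level = ready[i].level;
          toswap = i;
        }
    }
  return toswap;
}

int
main (void)
{
  struct toy ready[] = {
    { "load A", 3, 10, 5 },
    { "add B",  1, 12, 5 },
    { "load C", 2, 11, 5 },   /* current READY[nready-1] */
  };
  printf ("schedule next: %s\n", ready[pick_next (ready, 3)].name); /* load A */
  return 0;
}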
/* Implement TARGET_SCHED_INIT. */
@@ -13616,6 +15083,11 @@ mips_sched_reorder_1 (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
if (TUNE_74K)
mips_74k_agen_reorder (ready, *nreadyp);
+
+ if (! reload_completed
+ && TARGET_SCHED_WEIGHT
+ && *nreadyp > 1)
+ mips_sched_weight (ready, *nreadyp);
}
/* Implement TARGET_SCHED_REORDER. */
@@ -13780,6 +15252,7 @@ AVAIL_NON_MIPS16 (dsp_64, TARGET_64BIT && TARGET_DSP)
AVAIL_NON_MIPS16 (dspr2_32, !TARGET_64BIT && TARGET_DSPR2)
AVAIL_NON_MIPS16 (loongson, TARGET_LOONGSON_VECTORS)
AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
+AVAIL_NON_MIPS16 (msa, TARGET_MSA)
/* Construct a mips_builtin_description from the given arguments.
@@ -13896,6 +15369,22 @@ AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
#define LOONGSON_BUILTIN_SUFFIX(INSN, SUFFIX, FUNCTION_TYPE) \
LOONGSON_BUILTIN_ALIAS (INSN, INSN ## _ ## SUFFIX, FUNCTION_TYPE)
+/* Define a MSA MIPS_BUILTIN_DIRECT function __builtin_msa_<INSN>
+ for instruction CODE_FOR_msa_<INSN>. FUNCTION_TYPE is a
+ builtin_description field. */
+#define MSA_BUILTIN(INSN, FUNCTION_TYPE) \
+ { CODE_FOR_msa_ ## INSN, MIPS_FP_COND_f, \
+ "__builtin_msa_" #INSN, MIPS_BUILTIN_DIRECT, \
+ FUNCTION_TYPE, mips_builtin_avail_msa }
+
+/* Define a MSA MIPS_BUILTIN_DIRECT_NO_TARGET function __builtin_msa_<INSN>
+ for instruction CODE_FOR_msa_<INSN>. FUNCTION_TYPE is a
+ builtin_description field. */
+#define MSA_NO_TARGET_BUILTIN(INSN, FUNCTION_TYPE) \
+ { CODE_FOR_msa_ ## INSN, MIPS_FP_COND_f, \
+ "__builtin_msa_" #INSN, MIPS_BUILTIN_DIRECT_NO_TARGET, \
+ FUNCTION_TYPE, mips_builtin_avail_msa }
+
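/* Editor's illustration (not part of the patch): a cut-down, compilable
   model of one MSA_BUILTIN table entry.  The enum values and the
   function-type name are simplified stand-ins for the real mips.c types
   (the fp-condition and availability fields are dropped).  */
#include <stdio.h>

enum insn_code { CODE_FOR_msa_addv_b };
enum builtin_kind { MIPS_BUILTIN_DIRECT };
enum ftype { MIPS_V16QI_FTYPE_V16QI_V16QI };

struct builtin_description
{
  enum insn_code icode;
  const char *name;
  enum builtin_kind kind;
  enum ftype prototype;
};

#define MSA_BUILTIN(INSN, FUNCTION_TYPE)                \
  { CODE_FOR_msa_ ## INSN, "__builtin_msa_" #INSN,      \
    MIPS_BUILTIN_DIRECT, FUNCTION_TYPE }

static const struct builtin_description msa_builtins[] = {
  MSA_BUILTIN (addv_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
};

int
main (void)
{
  printf ("%s\n", msa_builtins[0].name);  /* __builtin_msa_addv_b */
  return 0;
}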
#define CODE_FOR_mips_sqrt_ps CODE_FOR_sqrtv2sf2
#define CODE_FOR_mips_addq_ph CODE_FOR_addv2hi3
#define CODE_FOR_mips_addu_qb CODE_FOR_addv4qi3
@@ -13936,6 +15425,138 @@ AVAIL_NON_MIPS16 (cache, TARGET_CACHE_BUILTIN)
#define CODE_FOR_loongson_psubush CODE_FOR_ussubv4hi3
#define CODE_FOR_loongson_psubusb CODE_FOR_ussubv8qi3
+#define CODE_FOR_msa_adds_s_b CODE_FOR_ssaddv16qi3
+#define CODE_FOR_msa_adds_s_h CODE_FOR_ssaddv8hi3
+#define CODE_FOR_msa_adds_s_w CODE_FOR_ssaddv4si3
+#define CODE_FOR_msa_adds_s_d CODE_FOR_ssaddv2di3
+#define CODE_FOR_msa_adds_u_b CODE_FOR_usaddv16qi3
+#define CODE_FOR_msa_adds_u_h CODE_FOR_usaddv8hi3
+#define CODE_FOR_msa_adds_u_w CODE_FOR_usaddv4si3
+#define CODE_FOR_msa_adds_u_d CODE_FOR_usaddv2di3
+#define CODE_FOR_msa_addv_b CODE_FOR_addv16qi3
+#define CODE_FOR_msa_addv_h CODE_FOR_addv8hi3
+#define CODE_FOR_msa_addv_w CODE_FOR_addv4si3
+#define CODE_FOR_msa_addv_d CODE_FOR_addv2di3
+#define CODE_FOR_msa_and_v CODE_FOR_andv16qi3
+#define CODE_FOR_msa_bmnz_v CODE_FOR_msa_bmnz_v_b
+#define CODE_FOR_msa_bmz_v CODE_FOR_msa_bmz_v_b
+#define CODE_FOR_msa_bnz_v CODE_FOR_msa_bnz_v_b
+#define CODE_FOR_msa_bz_v CODE_FOR_msa_bz_v_b
+#define CODE_FOR_msa_bsel_v CODE_FOR_msa_bsel_v_b
+#define CODE_FOR_msa_div_s_b CODE_FOR_divv16qi3
+#define CODE_FOR_msa_div_s_h CODE_FOR_divv8hi3
+#define CODE_FOR_msa_div_s_w CODE_FOR_divv4si3
+#define CODE_FOR_msa_div_s_d CODE_FOR_divv2di3
+#define CODE_FOR_msa_div_u_b CODE_FOR_udivv16qi3
+#define CODE_FOR_msa_div_u_h CODE_FOR_udivv8hi3
+#define CODE_FOR_msa_div_u_w CODE_FOR_udivv4si3
+#define CODE_FOR_msa_div_u_d CODE_FOR_udivv2di3
+#define CODE_FOR_msa_fadd_w CODE_FOR_addv4sf3
+#define CODE_FOR_msa_fadd_d CODE_FOR_addv2df3
+#define CODE_FOR_msa_ffint_s_w CODE_FOR_floatv4siv4sf2
+#define CODE_FOR_msa_ffint_s_d CODE_FOR_floatv2div2df2
+#define CODE_FOR_msa_ffint_u_w CODE_FOR_floatunsv4siv4sf2
+#define CODE_FOR_msa_ffint_u_d CODE_FOR_floatunsv2div2df2
+#define CODE_FOR_msa_fsub_w CODE_FOR_subv4sf3
+#define CODE_FOR_msa_fsub_d CODE_FOR_subv2df3
+#define CODE_FOR_msa_fmul_w CODE_FOR_mulv4sf3
+#define CODE_FOR_msa_fmul_d CODE_FOR_mulv2df3
+#define CODE_FOR_msa_fdiv_w CODE_FOR_divv4sf3
+#define CODE_FOR_msa_fdiv_d CODE_FOR_divv2df3
+#define CODE_FOR_msa_fmax_w CODE_FOR_smaxv4sf3
+#define CODE_FOR_msa_fmax_d CODE_FOR_smaxv2df3
+#define CODE_FOR_msa_fmax_a_w CODE_FOR_umaxv4sf3
+#define CODE_FOR_msa_fmax_a_d CODE_FOR_umaxv2df3
+#define CODE_FOR_msa_fmin_w CODE_FOR_sminv4sf3
+#define CODE_FOR_msa_fmin_d CODE_FOR_sminv2df3
+#define CODE_FOR_msa_fmin_a_w CODE_FOR_uminv4sf3
+#define CODE_FOR_msa_fmin_a_d CODE_FOR_uminv2df3
+#define CODE_FOR_msa_fsqrt_w CODE_FOR_sqrtv4sf2
+#define CODE_FOR_msa_fsqrt_d CODE_FOR_sqrtv2df2
+#define CODE_FOR_msa_max_s_b CODE_FOR_smaxv16qi3
+#define CODE_FOR_msa_max_s_h CODE_FOR_smaxv8hi3
+#define CODE_FOR_msa_max_s_w CODE_FOR_smaxv4si3
+#define CODE_FOR_msa_max_s_d CODE_FOR_smaxv2di3
+#define CODE_FOR_msa_max_u_b CODE_FOR_umaxv16qi3
+#define CODE_FOR_msa_max_u_h CODE_FOR_umaxv8hi3
+#define CODE_FOR_msa_max_u_w CODE_FOR_umaxv4si3
+#define CODE_FOR_msa_max_u_d CODE_FOR_umaxv2di3
+#define CODE_FOR_msa_min_s_b CODE_FOR_sminv16qi3
+#define CODE_FOR_msa_min_s_h CODE_FOR_sminv8hi3
+#define CODE_FOR_msa_min_s_w CODE_FOR_sminv4si3
+#define CODE_FOR_msa_min_s_d CODE_FOR_sminv2di3
+#define CODE_FOR_msa_min_u_b CODE_FOR_uminv16qi3
+#define CODE_FOR_msa_min_u_h CODE_FOR_uminv8hi3
+#define CODE_FOR_msa_min_u_w CODE_FOR_uminv4si3
+#define CODE_FOR_msa_min_u_d CODE_FOR_uminv2di3
+#define CODE_FOR_msa_mod_s_b CODE_FOR_modv16qi3
+#define CODE_FOR_msa_mod_s_h CODE_FOR_modv8hi3
+#define CODE_FOR_msa_mod_s_w CODE_FOR_modv4si3
+#define CODE_FOR_msa_mod_s_d CODE_FOR_modv2di3
+#define CODE_FOR_msa_mod_u_b CODE_FOR_umodv16qi3
+#define CODE_FOR_msa_mod_u_h CODE_FOR_umodv8hi3
+#define CODE_FOR_msa_mod_u_w CODE_FOR_umodv4si3
+#define CODE_FOR_msa_mod_u_d CODE_FOR_umodv2di3
+#define CODE_FOR_msa_mulv_b CODE_FOR_mulv16qi3
+#define CODE_FOR_msa_mulv_h CODE_FOR_mulv8hi3
+#define CODE_FOR_msa_mulv_w CODE_FOR_mulv4si3
+#define CODE_FOR_msa_mulv_d CODE_FOR_mulv2di3
+#define CODE_FOR_msa_nlzc_b CODE_FOR_clzv16qi2
+#define CODE_FOR_msa_nlzc_h CODE_FOR_clzv8hi2
+#define CODE_FOR_msa_nlzc_w CODE_FOR_clzv4si2
+#define CODE_FOR_msa_nlzc_d CODE_FOR_clzv2di2
+#define CODE_FOR_msa_nor_v CODE_FOR_msa_nor_v_b
+#define CODE_FOR_msa_or_v CODE_FOR_iorv16qi3
+#define CODE_FOR_msa_pcnt_b CODE_FOR_popcountv16qi2
+#define CODE_FOR_msa_pcnt_h CODE_FOR_popcountv8hi2
+#define CODE_FOR_msa_pcnt_w CODE_FOR_popcountv4si2
+#define CODE_FOR_msa_pcnt_d CODE_FOR_popcountv2di2
+#define CODE_FOR_msa_xor_v CODE_FOR_xorv16qi3
+#define CODE_FOR_msa_sll_b CODE_FOR_vashlv16qi3
+#define CODE_FOR_msa_sll_h CODE_FOR_vashlv8hi3
+#define CODE_FOR_msa_sll_w CODE_FOR_vashlv4si3
+#define CODE_FOR_msa_sll_d CODE_FOR_vashlv2di3
+#define CODE_FOR_msa_sra_b CODE_FOR_vashrv16qi3
+#define CODE_FOR_msa_sra_h CODE_FOR_vashrv8hi3
+#define CODE_FOR_msa_sra_w CODE_FOR_vashrv4si3
+#define CODE_FOR_msa_sra_d CODE_FOR_vashrv2di3
+#define CODE_FOR_msa_srl_b CODE_FOR_vlshrv16qi3
+#define CODE_FOR_msa_srl_h CODE_FOR_vlshrv8hi3
+#define CODE_FOR_msa_srl_w CODE_FOR_vlshrv4si3
+#define CODE_FOR_msa_srl_d CODE_FOR_vlshrv2di3
+#define CODE_FOR_msa_subv_b CODE_FOR_subv16qi3
+#define CODE_FOR_msa_subv_h CODE_FOR_subv8hi3
+#define CODE_FOR_msa_subv_w CODE_FOR_subv4si3
+#define CODE_FOR_msa_subv_d CODE_FOR_subv2di3
+
+#define CODE_FOR_msa_move_v CODE_FOR_movv16qi
+
+#define CODE_FOR_msa_vshf_b CODE_FOR_msa_vshfv16qi
+#define CODE_FOR_msa_vshf_h CODE_FOR_msa_vshfv8hi
+#define CODE_FOR_msa_vshf_w CODE_FOR_msa_vshfv4si
+#define CODE_FOR_msa_vshf_d CODE_FOR_msa_vshfv2di
+
+#define CODE_FOR_msa_ilvod_d CODE_FOR_msa_ilvl_d
+#define CODE_FOR_msa_ilvev_d CODE_FOR_msa_ilvr_d
+
+#define CODE_FOR_msa_ldi_b CODE_FOR_msa_ldiv16qi
+#define CODE_FOR_msa_ldi_h CODE_FOR_msa_ldiv8hi
+#define CODE_FOR_msa_ldi_w CODE_FOR_msa_ldiv4si
+#define CODE_FOR_msa_ldi_d CODE_FOR_msa_ldiv2di
+
+#define CODE_FOR_msa_cast_to_vector_float CODE_FOR_msa_cast_to_vector_w_f
+#define CODE_FOR_msa_cast_to_vector_double CODE_FOR_msa_cast_to_vector_d_f
+#define CODE_FOR_msa_cast_to_scalar_float CODE_FOR_msa_cast_to_scalar_w_f
+#define CODE_FOR_msa_cast_to_scalar_double CODE_FOR_msa_cast_to_scalar_d_f
+
static const struct mips_builtin_description mips_builtins[] = {
#define MIPS_GET_FCSR 0
DIRECT_BUILTIN (get_fcsr, MIPS_USI_FTYPE_VOID, hard_float),
@@ -14224,18 +15845,557 @@ static const struct mips_builtin_description mips_builtins[] = {
LOONGSON_BUILTIN_SUFFIX (punpcklwd, s, MIPS_V2SI_FTYPE_V2SI_V2SI),
/* Sundry other built-in functions. */
- DIRECT_NO_TARGET_BUILTIN (cache, MIPS_VOID_FTYPE_SI_CVPOINTER, cache)
+ DIRECT_NO_TARGET_BUILTIN (cache, MIPS_VOID_FTYPE_SI_CVPOINTER, cache),
+
+ /* Built-in functions for MSA. */
+ MSA_BUILTIN (sll_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (sll_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (sll_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (sll_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (slli_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (slli_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (slli_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (slli_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (sra_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (sra_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (sra_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (sra_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (srai_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (srai_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (srai_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (srai_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (srar_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (srar_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (srar_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (srar_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (srari_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (srari_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (srari_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (srari_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (srl_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (srl_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (srl_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (srl_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (srli_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (srli_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (srli_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (srli_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (srlr_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (srlr_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (srlr_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (srlr_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (srlri_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (srlri_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (srlri_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (srlri_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (bclr_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (bclr_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (bclr_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (bclr_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (bclri_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (bclri_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (bclri_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (bclri_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (bset_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (bset_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (bset_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (bset_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (bseti_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (bseti_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (bseti_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (bseti_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (bneg_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (bneg_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (bneg_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (bneg_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (bnegi_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (bnegi_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (bnegi_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (bnegi_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (binsl_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UV16QI),
+ MSA_BUILTIN (binsl_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI_UV8HI),
+ MSA_BUILTIN (binsl_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI_UV4SI),
+ MSA_BUILTIN (binsl_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI_UV2DI),
+ MSA_BUILTIN (binsli_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UQI),
+ MSA_BUILTIN (binsli_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI_UQI),
+ MSA_BUILTIN (binsli_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI_UQI),
+ MSA_BUILTIN (binsli_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI_UQI),
+ MSA_BUILTIN (binsr_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UV16QI),
+ MSA_BUILTIN (binsr_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI_UV8HI),
+ MSA_BUILTIN (binsr_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI_UV4SI),
+ MSA_BUILTIN (binsr_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI_UV2DI),
+ MSA_BUILTIN (binsri_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UQI),
+ MSA_BUILTIN (binsri_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI_UQI),
+ MSA_BUILTIN (binsri_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI_UQI),
+ MSA_BUILTIN (binsri_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI_UQI),
+ MSA_BUILTIN (addv_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (addv_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (addv_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (addv_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (addvi_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (addvi_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (addvi_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (addvi_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (subv_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (subv_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (subv_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (subv_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (subvi_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (subvi_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (subvi_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (subvi_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (max_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (max_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (max_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (max_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (maxi_s_b, MIPS_V16QI_FTYPE_V16QI_QI),
+ MSA_BUILTIN (maxi_s_h, MIPS_V8HI_FTYPE_V8HI_QI),
+ MSA_BUILTIN (maxi_s_w, MIPS_V4SI_FTYPE_V4SI_QI),
+ MSA_BUILTIN (maxi_s_d, MIPS_V2DI_FTYPE_V2DI_QI),
+ MSA_BUILTIN (max_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (max_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (max_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (max_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (maxi_u_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (maxi_u_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (maxi_u_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (maxi_u_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (min_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (min_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (min_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (min_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (mini_s_b, MIPS_V16QI_FTYPE_V16QI_QI),
+ MSA_BUILTIN (mini_s_h, MIPS_V8HI_FTYPE_V8HI_QI),
+ MSA_BUILTIN (mini_s_w, MIPS_V4SI_FTYPE_V4SI_QI),
+ MSA_BUILTIN (mini_s_d, MIPS_V2DI_FTYPE_V2DI_QI),
+ MSA_BUILTIN (min_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (min_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (min_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (min_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (mini_u_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (mini_u_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (mini_u_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (mini_u_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (max_a_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (max_a_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (max_a_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (max_a_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (min_a_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (min_a_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (min_a_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (min_a_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ceq_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ceq_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ceq_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ceq_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ceqi_b, MIPS_V16QI_FTYPE_V16QI_QI),
+ MSA_BUILTIN (ceqi_h, MIPS_V8HI_FTYPE_V8HI_QI),
+ MSA_BUILTIN (ceqi_w, MIPS_V4SI_FTYPE_V4SI_QI),
+ MSA_BUILTIN (ceqi_d, MIPS_V2DI_FTYPE_V2DI_QI),
+ MSA_BUILTIN (clt_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (clt_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (clt_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (clt_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (clti_s_b, MIPS_V16QI_FTYPE_V16QI_QI),
+ MSA_BUILTIN (clti_s_h, MIPS_V8HI_FTYPE_V8HI_QI),
+ MSA_BUILTIN (clti_s_w, MIPS_V4SI_FTYPE_V4SI_QI),
+ MSA_BUILTIN (clti_s_d, MIPS_V2DI_FTYPE_V2DI_QI),
+ MSA_BUILTIN (clt_u_b, MIPS_V16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (clt_u_h, MIPS_V8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (clt_u_w, MIPS_V4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (clt_u_d, MIPS_V2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (clti_u_b, MIPS_V16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (clti_u_h, MIPS_V8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (clti_u_w, MIPS_V4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (clti_u_d, MIPS_V2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (cle_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (cle_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (cle_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (cle_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (clei_s_b, MIPS_V16QI_FTYPE_V16QI_QI),
+ MSA_BUILTIN (clei_s_h, MIPS_V8HI_FTYPE_V8HI_QI),
+ MSA_BUILTIN (clei_s_w, MIPS_V4SI_FTYPE_V4SI_QI),
+ MSA_BUILTIN (clei_s_d, MIPS_V2DI_FTYPE_V2DI_QI),
+ MSA_BUILTIN (cle_u_b, MIPS_V16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (cle_u_h, MIPS_V8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (cle_u_w, MIPS_V4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (cle_u_d, MIPS_V2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (clei_u_b, MIPS_V16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (clei_u_h, MIPS_V8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (clei_u_w, MIPS_V4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (clei_u_d, MIPS_V2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (ld_b, MIPS_V16QI_FTYPE_POINTER_SI),
+ MSA_BUILTIN (ld_h, MIPS_V8HI_FTYPE_POINTER_SI),
+ MSA_BUILTIN (ld_w, MIPS_V4SI_FTYPE_POINTER_SI),
+ MSA_BUILTIN (ld_d, MIPS_V2DI_FTYPE_POINTER_SI),
+ MSA_NO_TARGET_BUILTIN (st_b, MIPS_VOID_FTYPE_V16QI_POINTER_SI),
+ MSA_NO_TARGET_BUILTIN (st_h, MIPS_VOID_FTYPE_V8HI_POINTER_SI),
+ MSA_NO_TARGET_BUILTIN (st_w, MIPS_VOID_FTYPE_V4SI_POINTER_SI),
+ MSA_NO_TARGET_BUILTIN (st_d, MIPS_VOID_FTYPE_V2DI_POINTER_SI),
+ MSA_BUILTIN (sat_s_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (sat_s_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (sat_s_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (sat_s_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (sat_u_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (sat_u_h, MIPS_UV8HI_FTYPE_UV8HI_UQI),
+ MSA_BUILTIN (sat_u_w, MIPS_UV4SI_FTYPE_UV4SI_UQI),
+ MSA_BUILTIN (sat_u_d, MIPS_UV2DI_FTYPE_UV2DI_UQI),
+ MSA_BUILTIN (add_a_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (add_a_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (add_a_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (add_a_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (adds_a_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (adds_a_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (adds_a_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (adds_a_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (adds_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (adds_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (adds_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (adds_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (adds_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (adds_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (adds_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (adds_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (ave_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ave_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ave_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ave_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ave_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (ave_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (ave_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (ave_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (aver_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (aver_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (aver_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (aver_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (aver_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (aver_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (aver_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (aver_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (subs_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (subs_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (subs_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (subs_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (subs_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (subs_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (subs_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (subs_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (subsuu_s_b, MIPS_V16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (subsuu_s_h, MIPS_V8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (subsuu_s_w, MIPS_V4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (subsuu_s_d, MIPS_V2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (subsus_u_b, MIPS_UV16QI_FTYPE_UV16QI_V16QI),
+ MSA_BUILTIN (subsus_u_h, MIPS_UV8HI_FTYPE_UV8HI_V8HI),
+ MSA_BUILTIN (subsus_u_w, MIPS_UV4SI_FTYPE_UV4SI_V4SI),
+ MSA_BUILTIN (subsus_u_d, MIPS_UV2DI_FTYPE_UV2DI_V2DI),
+ MSA_BUILTIN (asub_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (asub_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (asub_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (asub_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (asub_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (asub_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (asub_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (asub_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (mulv_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (mulv_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (mulv_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (mulv_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (maddv_b, MIPS_V16QI_FTYPE_V16QI_V16QI_V16QI),
+ MSA_BUILTIN (maddv_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (maddv_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (maddv_d, MIPS_V2DI_FTYPE_V2DI_V2DI_V2DI),
+ MSA_BUILTIN (msubv_b, MIPS_V16QI_FTYPE_V16QI_V16QI_V16QI),
+ MSA_BUILTIN (msubv_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (msubv_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (msubv_d, MIPS_V2DI_FTYPE_V2DI_V2DI_V2DI),
+ MSA_BUILTIN (div_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (div_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (div_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (div_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (div_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (div_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (div_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (div_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (hadd_s_h, MIPS_V8HI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (hadd_s_w, MIPS_V4SI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (hadd_s_d, MIPS_V2DI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (hadd_u_h, MIPS_UV8HI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (hadd_u_w, MIPS_UV4SI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (hadd_u_d, MIPS_UV2DI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (hsub_s_h, MIPS_V8HI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (hsub_s_w, MIPS_V4SI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (hsub_s_d, MIPS_V2DI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (hsub_u_h, MIPS_V8HI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (hsub_u_w, MIPS_V4SI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (hsub_u_d, MIPS_V2DI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (mod_s_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (mod_s_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (mod_s_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (mod_s_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (mod_u_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (mod_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (mod_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (mod_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV2DI),
+ MSA_BUILTIN (dotp_s_h, MIPS_V8HI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (dotp_s_w, MIPS_V4SI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (dotp_s_d, MIPS_V2DI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (dotp_u_h, MIPS_UV8HI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (dotp_u_w, MIPS_UV4SI_FTYPE_UV8HI_UV8HI),
+ MSA_BUILTIN (dotp_u_d, MIPS_UV2DI_FTYPE_UV4SI_UV4SI),
+ MSA_BUILTIN (dpadd_s_h, MIPS_V8HI_FTYPE_V8HI_V16QI_V16QI),
+ MSA_BUILTIN (dpadd_s_w, MIPS_V4SI_FTYPE_V4SI_V8HI_V8HI),
+ MSA_BUILTIN (dpadd_s_d, MIPS_V2DI_FTYPE_V2DI_V4SI_V4SI),
+ MSA_BUILTIN (dpadd_u_h, MIPS_UV8HI_FTYPE_UV8HI_UV16QI_UV16QI),
+ MSA_BUILTIN (dpadd_u_w, MIPS_UV4SI_FTYPE_UV4SI_UV8HI_UV8HI),
+ MSA_BUILTIN (dpadd_u_d, MIPS_UV2DI_FTYPE_UV2DI_UV4SI_UV4SI),
+ MSA_BUILTIN (dpsub_s_h, MIPS_V8HI_FTYPE_V8HI_V16QI_V16QI),
+ MSA_BUILTIN (dpsub_s_w, MIPS_V4SI_FTYPE_V4SI_V8HI_V8HI),
+ MSA_BUILTIN (dpsub_s_d, MIPS_V2DI_FTYPE_V2DI_V4SI_V4SI),
+ MSA_BUILTIN (dpsub_u_h, MIPS_V8HI_FTYPE_V8HI_UV16QI_UV16QI),
+ MSA_BUILTIN (dpsub_u_w, MIPS_V4SI_FTYPE_V4SI_UV8HI_UV8HI),
+ MSA_BUILTIN (dpsub_u_d, MIPS_V2DI_FTYPE_V2DI_UV4SI_UV4SI),
+ MSA_BUILTIN (sld_b, MIPS_V16QI_FTYPE_V16QI_V16QI_SI),
+ MSA_BUILTIN (sld_h, MIPS_V8HI_FTYPE_V8HI_V8HI_SI),
+ MSA_BUILTIN (sld_w, MIPS_V4SI_FTYPE_V4SI_V4SI_SI),
+ MSA_BUILTIN (sld_d, MIPS_V2DI_FTYPE_V2DI_V2DI_SI),
+ MSA_BUILTIN (sldi_b, MIPS_V16QI_FTYPE_V16QI_V16QI_UQI),
+ MSA_BUILTIN (sldi_h, MIPS_V8HI_FTYPE_V8HI_V8HI_UQI),
+ MSA_BUILTIN (sldi_w, MIPS_V4SI_FTYPE_V4SI_V4SI_UQI),
+ MSA_BUILTIN (sldi_d, MIPS_V2DI_FTYPE_V2DI_V2DI_UQI),
+ MSA_BUILTIN (splat_b, MIPS_V16QI_FTYPE_V16QI_SI),
+ MSA_BUILTIN (splat_h, MIPS_V8HI_FTYPE_V8HI_SI),
+ MSA_BUILTIN (splat_w, MIPS_V4SI_FTYPE_V4SI_SI),
+ MSA_BUILTIN (splat_d, MIPS_V2DI_FTYPE_V2DI_SI),
+ MSA_BUILTIN (splati_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (splati_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (splati_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (splati_d, MIPS_V2DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (pckev_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (pckev_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (pckev_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (pckev_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (pckod_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (pckod_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (pckod_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (pckod_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ilvl_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ilvl_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ilvl_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ilvl_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ilvr_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ilvr_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ilvr_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ilvr_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ilvev_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ilvev_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ilvev_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ilvev_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (ilvod_b, MIPS_V16QI_FTYPE_V16QI_V16QI),
+ MSA_BUILTIN (ilvod_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (ilvod_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (ilvod_d, MIPS_V2DI_FTYPE_V2DI_V2DI),
+ MSA_BUILTIN (vshf_b, MIPS_V16QI_FTYPE_V16QI_V16QI_V16QI),
+ MSA_BUILTIN (vshf_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (vshf_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (vshf_d, MIPS_V2DI_FTYPE_V2DI_V2DI_V2DI),
+ MSA_BUILTIN (and_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (andi_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (or_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (ori_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (nor_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (nori_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (xor_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI),
+ MSA_BUILTIN (xori_b, MIPS_UV16QI_FTYPE_UV16QI_UQI),
+ MSA_BUILTIN (bmnz_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UV16QI),
+ MSA_BUILTIN (bmnzi_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UQI),
+ MSA_BUILTIN (bmz_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UV16QI),
+ MSA_BUILTIN (bmzi_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UQI),
+ MSA_BUILTIN (bsel_v, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UV16QI),
+ MSA_BUILTIN (bseli_b, MIPS_UV16QI_FTYPE_UV16QI_UV16QI_UQI),
+ MSA_BUILTIN (shf_b, MIPS_V16QI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (shf_h, MIPS_V8HI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (shf_w, MIPS_V4SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (bnz_v, MIPS_SI_FTYPE_UV16QI),
+ MSA_BUILTIN (bz_v, MIPS_SI_FTYPE_UV16QI),
+ MSA_BUILTIN (fill_b, MIPS_V16QI_FTYPE_SI),
+ MSA_BUILTIN (fill_h, MIPS_V8HI_FTYPE_SI),
+ MSA_BUILTIN (fill_w, MIPS_V4SI_FTYPE_SI),
+ MSA_BUILTIN (fill_d, MIPS_V2DI_FTYPE_DI),
+ MSA_BUILTIN (pcnt_b, MIPS_V16QI_FTYPE_V16QI),
+ MSA_BUILTIN (pcnt_h, MIPS_V8HI_FTYPE_V8HI),
+ MSA_BUILTIN (pcnt_w, MIPS_V4SI_FTYPE_V4SI),
+ MSA_BUILTIN (pcnt_d, MIPS_V2DI_FTYPE_V2DI),
+ MSA_BUILTIN (nloc_b, MIPS_V16QI_FTYPE_V16QI),
+ MSA_BUILTIN (nloc_h, MIPS_V8HI_FTYPE_V8HI),
+ MSA_BUILTIN (nloc_w, MIPS_V4SI_FTYPE_V4SI),
+ MSA_BUILTIN (nloc_d, MIPS_V2DI_FTYPE_V2DI),
+ MSA_BUILTIN (nlzc_b, MIPS_V16QI_FTYPE_V16QI),
+ MSA_BUILTIN (nlzc_h, MIPS_V8HI_FTYPE_V8HI),
+ MSA_BUILTIN (nlzc_w, MIPS_V4SI_FTYPE_V4SI),
+ MSA_BUILTIN (nlzc_d, MIPS_V2DI_FTYPE_V2DI),
+ MSA_BUILTIN (copy_s_b, MIPS_SI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (copy_s_h, MIPS_SI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (copy_s_w, MIPS_SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (copy_s_d, MIPS_DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (copy_u_b, MIPS_SI_FTYPE_V16QI_UQI),
+ MSA_BUILTIN (copy_u_h, MIPS_SI_FTYPE_V8HI_UQI),
+ MSA_BUILTIN (copy_u_w, MIPS_SI_FTYPE_V4SI_UQI),
+ MSA_BUILTIN (copy_u_d, MIPS_DI_FTYPE_V2DI_UQI),
+ MSA_BUILTIN (insert_b, MIPS_V16QI_FTYPE_V16QI_UQI_SI),
+ MSA_BUILTIN (insert_h, MIPS_V8HI_FTYPE_V8HI_UQI_SI),
+ MSA_BUILTIN (insert_w, MIPS_V4SI_FTYPE_V4SI_UQI_SI),
+ MSA_BUILTIN (insert_d, MIPS_V2DI_FTYPE_V2DI_UQI_DI),
+ MSA_BUILTIN (insve_b, MIPS_V16QI_FTYPE_V16QI_UQI_V16QI),
+ MSA_BUILTIN (insve_h, MIPS_V8HI_FTYPE_V8HI_UQI_V8HI),
+ MSA_BUILTIN (insve_w, MIPS_V4SI_FTYPE_V4SI_UQI_V4SI),
+ MSA_BUILTIN (insve_d, MIPS_V2DI_FTYPE_V2DI_UQI_V2DI),
+ MSA_BUILTIN (bnz_b, MIPS_SI_FTYPE_UV16QI),
+ MSA_BUILTIN (bnz_h, MIPS_SI_FTYPE_UV8HI),
+ MSA_BUILTIN (bnz_w, MIPS_SI_FTYPE_UV4SI),
+ MSA_BUILTIN (bnz_d, MIPS_SI_FTYPE_UV2DI),
+ MSA_BUILTIN (bz_b, MIPS_SI_FTYPE_UV16QI),
+ MSA_BUILTIN (bz_h, MIPS_SI_FTYPE_UV8HI),
+ MSA_BUILTIN (bz_w, MIPS_SI_FTYPE_UV4SI),
+ MSA_BUILTIN (bz_d, MIPS_SI_FTYPE_UV2DI),
+ MSA_BUILTIN (ldi_b, MIPS_V16QI_FTYPE_HI),
+ MSA_BUILTIN (ldi_h, MIPS_V8HI_FTYPE_HI),
+ MSA_BUILTIN (ldi_w, MIPS_V4SI_FTYPE_HI),
+ MSA_BUILTIN (ldi_d, MIPS_V2DI_FTYPE_HI),
+ MSA_BUILTIN (fcaf_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcaf_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcor_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcor_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcun_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcun_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcune_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcune_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcueq_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcueq_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fceq_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fceq_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcne_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcne_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fclt_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fclt_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcult_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcult_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcle_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcle_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fcule_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fcule_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsaf_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsaf_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsor_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsor_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsun_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsun_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsune_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsune_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsueq_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsueq_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fseq_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fseq_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsne_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsne_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fslt_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fslt_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsult_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsult_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsle_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsle_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsule_w, MIPS_V4SI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsule_d, MIPS_V2DI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fadd_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fadd_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fsub_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fsub_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmul_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fmul_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fdiv_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fdiv_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmadd_w, MIPS_V4SF_FTYPE_V4SF_V4SF_V4SF),
+ MSA_BUILTIN (fmadd_d, MIPS_V2DF_FTYPE_V2DF_V2DF_V2DF),
+ MSA_BUILTIN (fmsub_w, MIPS_V4SF_FTYPE_V4SF_V4SF_V4SF),
+ MSA_BUILTIN (fmsub_d, MIPS_V2DF_FTYPE_V2DF_V2DF_V2DF),
+ MSA_BUILTIN (fexp2_w, MIPS_V4SF_FTYPE_V4SF_V4SI),
+ MSA_BUILTIN (fexp2_d, MIPS_V2DF_FTYPE_V2DF_V2DI),
+ MSA_BUILTIN (fexdo_h, MIPS_V8HI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fexdo_w, MIPS_V4SF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (ftq_h, MIPS_V8HI_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (ftq_w, MIPS_V4SI_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmin_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fmin_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmin_a_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fmin_a_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmax_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fmax_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (fmax_a_w, MIPS_V4SF_FTYPE_V4SF_V4SF),
+ MSA_BUILTIN (fmax_a_d, MIPS_V2DF_FTYPE_V2DF_V2DF),
+ MSA_BUILTIN (mul_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (mul_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (mulr_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI),
+ MSA_BUILTIN (mulr_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI),
+ MSA_BUILTIN (madd_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (madd_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (maddr_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (maddr_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (msub_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (msub_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (msubr_q_h, MIPS_V8HI_FTYPE_V8HI_V8HI_V8HI),
+ MSA_BUILTIN (msubr_q_w, MIPS_V4SI_FTYPE_V4SI_V4SI_V4SI),
+ MSA_BUILTIN (fclass_w, MIPS_V4SI_FTYPE_V4SF),
+ MSA_BUILTIN (fclass_d, MIPS_V2DI_FTYPE_V2DF),
+ MSA_BUILTIN (fsqrt_w, MIPS_V4SF_FTYPE_V4SF),
+ MSA_BUILTIN (fsqrt_d, MIPS_V2DF_FTYPE_V2DF),
+ MSA_BUILTIN (frcp_w, MIPS_V4SF_FTYPE_V4SF),
+ MSA_BUILTIN (frcp_d, MIPS_V2DF_FTYPE_V2DF),
+ MSA_BUILTIN (frint_w, MIPS_V4SF_FTYPE_V4SF),
+ MSA_BUILTIN (frint_d, MIPS_V2DF_FTYPE_V2DF),
+ MSA_BUILTIN (frsqrt_w, MIPS_V4SF_FTYPE_V4SF),
+ MSA_BUILTIN (frsqrt_d, MIPS_V2DF_FTYPE_V2DF),
+ MSA_BUILTIN (flog2_w, MIPS_V4SF_FTYPE_V4SF),
+ MSA_BUILTIN (flog2_d, MIPS_V2DF_FTYPE_V2DF),
+ MSA_BUILTIN (fexupl_w, MIPS_V4SF_FTYPE_V8HI),
+ MSA_BUILTIN (fexupl_d, MIPS_V2DF_FTYPE_V4SF),
+ MSA_BUILTIN (fexupr_w, MIPS_V4SF_FTYPE_V8HI),
+ MSA_BUILTIN (fexupr_d, MIPS_V2DF_FTYPE_V4SF),
+ MSA_BUILTIN (ffql_w, MIPS_V4SF_FTYPE_V8HI),
+ MSA_BUILTIN (ffql_d, MIPS_V2DF_FTYPE_V4SI),
+ MSA_BUILTIN (ffqr_w, MIPS_V4SF_FTYPE_V8HI),
+ MSA_BUILTIN (ffqr_d, MIPS_V2DF_FTYPE_V4SI),
+ MSA_BUILTIN (ftint_s_w, MIPS_V4SI_FTYPE_V4SF),
+ MSA_BUILTIN (ftint_s_d, MIPS_V2DI_FTYPE_V2DF),
+ MSA_BUILTIN (ftint_u_w, MIPS_UV4SI_FTYPE_V4SF),
+ MSA_BUILTIN (ftint_u_d, MIPS_UV2DI_FTYPE_V2DF),
+ MSA_BUILTIN (ftrunc_s_w, MIPS_V4SI_FTYPE_V4SF),
+ MSA_BUILTIN (ftrunc_s_d, MIPS_V2DI_FTYPE_V2DF),
+ MSA_BUILTIN (ftrunc_u_w, MIPS_UV4SI_FTYPE_V4SF),
+ MSA_BUILTIN (ftrunc_u_d, MIPS_UV2DI_FTYPE_V2DF),
+ MSA_BUILTIN (ffint_s_w, MIPS_V4SF_FTYPE_V4SI),
+ MSA_BUILTIN (ffint_s_d, MIPS_V2DF_FTYPE_V2DI),
+ MSA_BUILTIN (ffint_u_w, MIPS_V4SF_FTYPE_UV4SI),
+ MSA_BUILTIN (ffint_u_d, MIPS_V2DF_FTYPE_UV2DI),
+ MSA_NO_TARGET_BUILTIN (ctcmsa, MIPS_VOID_FTYPE_UQI_SI),
+ MSA_BUILTIN (cfcmsa, MIPS_SI_FTYPE_UQI),
+ MSA_BUILTIN (move_v, MIPS_V16QI_FTYPE_V16QI),
+ MSA_BUILTIN (cast_to_vector_float, MIPS_V4SF_FTYPE_SF),
+ MSA_BUILTIN (cast_to_vector_double, MIPS_V2DF_FTYPE_DF),
+ MSA_BUILTIN (cast_to_scalar_float, MIPS_SF_FTYPE_V4SF),
+ MSA_BUILTIN (cast_to_scalar_double, MIPS_DF_FTYPE_V2DF)
};
/* Index I is the function declaration for mips_builtins[I], or null if the
function isn't defined on this target. */
static GTY(()) tree mips_builtin_decls[ARRAY_SIZE (mips_builtins)];
+/* Maps an instruction code to the index I of the corresponding entry in
+ mips_builtins[] and mips_builtin_decls[], or zero if the built-in is not
+ defined for this target. */
+static GTY(()) int mips_get_builtin_decl_index[LAST_INSN_CODE];
/* MODE is a vector mode whose elements have type TYPE. Return the type
of the vector itself. */
static tree
-mips_builtin_vector_type (tree type, enum machine_mode mode)
+mips_builtin_vector_type (tree type, machine_mode mode)
{
static tree types[2 * (int) MAX_MACHINE_MODE];
int mode_index;
@@ -14271,7 +16431,9 @@ mips_build_cvpointer_type (void)
#define MIPS_ATYPE_CVPOINTER mips_build_cvpointer_type ()
/* Standard mode-based argument types. */
+#define MIPS_ATYPE_QI intQI_type_node
#define MIPS_ATYPE_UQI unsigned_intQI_type_node
+#define MIPS_ATYPE_HI intHI_type_node
#define MIPS_ATYPE_SI intSI_type_node
#define MIPS_ATYPE_USI unsigned_intSI_type_node
#define MIPS_ATYPE_DI intDI_type_node
@@ -14286,6 +16448,18 @@ mips_build_cvpointer_type (void)
#define MIPS_ATYPE_V4QI mips_builtin_vector_type (intQI_type_node, V4QImode)
#define MIPS_ATYPE_V4HI mips_builtin_vector_type (intHI_type_node, V4HImode)
#define MIPS_ATYPE_V8QI mips_builtin_vector_type (intQI_type_node, V8QImode)
+#define MIPS_ATYPE_V2DI mips_builtin_vector_type (intDI_type_node, V2DImode)
+#define MIPS_ATYPE_V4SI mips_builtin_vector_type (intSI_type_node, V4SImode)
+#define MIPS_ATYPE_V8HI mips_builtin_vector_type (intHI_type_node, V8HImode)
+#define MIPS_ATYPE_V16QI mips_builtin_vector_type (intQI_type_node, V16QImode)
+#define MIPS_ATYPE_V2DF mips_builtin_vector_type (double_type_node, V2DFmode)
+#define MIPS_ATYPE_V4SF mips_builtin_vector_type (float_type_node, V4SFmode)
+
+#define MIPS_ATYPE_UV2DI \
+ mips_builtin_vector_type (unsigned_intDI_type_node, V2DImode)
+#define MIPS_ATYPE_UV4SI \
+ mips_builtin_vector_type (unsigned_intSI_type_node, V4SImode)
+#define MIPS_ATYPE_UV8HI \
+ mips_builtin_vector_type (unsigned_intHI_type_node, V8HImode)
+#define MIPS_ATYPE_UV16QI \
+ mips_builtin_vector_type (unsigned_intQI_type_node, V16QImode)
+
#define MIPS_ATYPE_UV2SI \
mips_builtin_vector_type (unsigned_intSI_type_node, V2SImode)
#define MIPS_ATYPE_UV4HI \
@@ -14347,10 +16521,13 @@ mips_init_builtins (void)
{
d = &mips_builtins[i];
if (d->avail ())
- mips_builtin_decls[i]
- = add_builtin_function (d->name,
- mips_build_function_type (d->function_type),
- i, BUILT_IN_MD, NULL, NULL);
+ {
+ mips_builtin_decls[i]
+ = add_builtin_function (d->name,
+ mips_build_function_type (d->function_type),
+ i, BUILT_IN_MD, NULL, NULL);
+ mips_get_builtin_decl_index[d->icode] = i;
+ }
}
}
@@ -14364,6 +16541,51 @@ mips_builtin_decl (unsigned int code, bool initialize_p ATTRIBUTE_UNUSED)
return mips_builtin_decls[code];
}
+/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION. */
+
+static tree
+mips_builtin_vectorized_function (tree fndecl, tree type_out,
+ tree type_in)
+{
+ machine_mode in_mode, out_mode;
+ enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
+ int in_n, out_n;
+
+ if (TREE_CODE (type_out) != VECTOR_TYPE
+ || TREE_CODE (type_in) != VECTOR_TYPE
+ || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL
+ || !ISA_HAS_MSA)
+ return NULL_TREE;
+
+ out_mode = TYPE_MODE (TREE_TYPE (type_out));
+ out_n = TYPE_VECTOR_SUBPARTS (type_out);
+ in_mode = TYPE_MODE (TREE_TYPE (type_in));
+ in_n = TYPE_VECTOR_SUBPARTS (type_in);
+
+ /* INSN is the name of the associated instruction pattern, without
+ the leading CODE_FOR_. */
+#define MIPS_GET_BUILTIN(INSN) \
+ mips_builtin_decls[mips_get_builtin_decl_index[CODE_FOR_##INSN]]
+
+ switch (fn)
+ {
+ case BUILT_IN_SQRT:
+ if (out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return MIPS_GET_BUILTIN (msa_fsqrt_d);
+ break;
+ case BUILT_IN_SQRTF:
+ if (out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return MIPS_GET_BUILTIN (msa_fsqrt_w);
+ break;
+ default:
+ break;
+ }
+
+ return NULL_TREE;
+}
+
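+/* As a hypothetical illustration of the hook above: when MSA is enabled and
+ the loop vectorizer picks two-element double vectors, a loop such as
+
+ for (i = 0; i < n; i++)
+ x[i] = __builtin_sqrt (y[i]);
+
+ can have its BUILT_IN_SQRT calls replaced by the __builtin_msa_fsqrt_d
+ declaration returned by MIPS_GET_BUILTIN (msa_fsqrt_d). */
+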
/* Take argument ARGNO from EXP's argument list and convert it into
an expand operand. Store the operand in *OP. */
@@ -14579,7 +16801,7 @@ mips_expand_builtin_bposge (enum mips_builtin_type builtin_type, rtx target)
static rtx
mips_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
- enum machine_mode mode, int ignore)
+ machine_mode mode, int ignore)
{
tree fndecl;
unsigned int fcode, avail;
@@ -14630,7 +16852,7 @@ struct mips16_constant {
struct mips16_constant *next;
rtx value;
rtx label;
- enum machine_mode mode;
+ machine_mode mode;
};
/* Information about an incomplete MIPS16 constant pool. FIRST is the
@@ -14648,7 +16870,7 @@ struct mips16_constant_pool {
static rtx
mips16_add_constant (struct mips16_constant_pool *pool,
- rtx value, enum machine_mode mode)
+ rtx value, machine_mode mode)
{
struct mips16_constant **p, *c;
bool first_of_size_p;
@@ -14704,7 +16926,7 @@ mips16_add_constant (struct mips16_constant_pool *pool,
instruction emitted. MODE is the mode of the constant. */
static rtx
-mips16_emit_constants_1 (enum machine_mode mode, rtx value, rtx insn)
+mips16_emit_constants_1 (machine_mode mode, rtx value, rtx insn)
{
if (SCALAR_INT_MODE_P (mode) || ALL_SCALAR_FIXED_POINT_MODE_P (mode))
{
@@ -15074,7 +17296,7 @@ r10k_safe_mem_expr_p (tree expr, unsigned HOST_WIDE_INT offset)
{
HOST_WIDE_INT bitoffset, bitsize;
tree inner, var_offset;
- enum machine_mode mode;
+ machine_mode mode;
int unsigned_p, volatile_p;
inner = get_inner_reference (expr, &bitsize, &bitoffset, &var_offset, &mode,
@@ -15742,7 +17964,7 @@ mips_mult_zero_zero_cost (struct mips_sim *state, bool setting)
mips_tuning_info.fast_mult_zero_zero_p = setting;
start_sequence ();
- enum machine_mode dword_mode = TARGET_64BIT ? TImode : DImode;
+ machine_mode dword_mode = TARGET_64BIT ? TImode : DImode;
rtx hilo = gen_rtx_REG (dword_mode, MD_REG_FIRST);
mips_emit_move_or_split (hilo, const0_rtx, SPLIT_FOR_SPEED);
@@ -15766,8 +17988,10 @@ mips_mult_zero_zero_cost (struct mips_sim *state, bool setting)
static void
mips_set_fast_mult_zero_zero_p (struct mips_sim *state)
{
- if (TARGET_MIPS16)
- /* No MTLO or MTHI available. */
+ if (TARGET_MIPS16 || (!ISA_HAS_HILO && !TARGET_DSP))
+ /* No MTLO or MTHI is available for MIPS16. Likewise, when there are no
+ HI or LO registers there is nothing to zero, so arbitrarily assume
+ that "MULT $0,$0" would be faster. */
mips_tuning_info.fast_mult_zero_zero_p = true;
else
{
@@ -16124,7 +18348,7 @@ mips_orphaned_high_part_p (mips_offset_table htab, rtx insn)
static void
mips_avoid_hazard (rtx after, rtx insn, int *hilo_delay,
- rtx *delayed_reg, rtx lo_reg)
+ rtx *delayed_reg, rtx lo_reg, bool *fs_delay)
{
rtx pattern, set;
int nops, ninsns;
@@ -16150,6 +18374,15 @@ mips_avoid_hazard (rtx after, rtx insn, int *hilo_delay,
nops = 2 - *hilo_delay;
else if (*delayed_reg != 0 && reg_referenced_p (*delayed_reg, pattern))
nops = 1;
+ /* If a forbidden-slot hazard is pending, a NOP is required when the
+ branch instruction was not in a sequence (a sequence would imply it
+ is not actually a compact branch), the current insn is not an inline
+ asm, and it cannot be placed in a delay slot. */
+ else if (*fs_delay && get_attr_can_delay (insn) == CAN_DELAY_NO
+ && GET_CODE (PATTERN (after)) != SEQUENCE
+ && GET_CODE (pattern) != ASM_INPUT
+ && asm_noperands (pattern) < 0)
+ nops = 1;
else
nops = 0;
@@ -16162,12 +18395,18 @@ mips_avoid_hazard (rtx after, rtx insn, int *hilo_delay,
/* Set up the state for the next instruction. */
*hilo_delay += ninsns;
*delayed_reg = 0;
+ *fs_delay = false;
if (INSN_CODE (insn) >= 0)
switch (get_attr_hazard (insn))
{
case HAZARD_NONE:
break;
+ case HAZARD_FORBIDDEN_SLOT:
+ if (TARGET_CB_MAYBE)
+ *fs_delay = true;
+ break;
+
case HAZARD_HILO:
*hilo_delay = 0;
break;
@@ -16191,6 +18430,7 @@ mips_reorg_process_insns (void)
rtx insn, last_insn, subinsn, next_insn, lo_reg, delayed_reg;
int hilo_delay;
mips_offset_table htab;
+ bool fs_delay;
/* Force all instructions to be split into their final form. */
split_all_insns_noflow ();
@@ -16259,6 +18499,7 @@ mips_reorg_process_insns (void)
hilo_delay = 2;
delayed_reg = 0;
lo_reg = gen_rtx_REG (SImode, LO_REGNUM);
+ fs_delay = false;
/* Make a second pass over the instructions. Delete orphaned
high-part relocations or turn them into NOPs. Avoid hazards
@@ -16282,7 +18523,7 @@ mips_reorg_process_insns (void)
INSN_CODE (subinsn) = CODE_FOR_nop;
}
mips_avoid_hazard (last_insn, subinsn, &hilo_delay,
- &delayed_reg, lo_reg);
+ &delayed_reg, lo_reg, &fs_delay);
}
last_insn = insn;
}
@@ -16303,7 +18544,7 @@ mips_reorg_process_insns (void)
else
{
mips_avoid_hazard (last_insn, insn, &hilo_delay,
- &delayed_reg, lo_reg);
+ &delayed_reg, lo_reg, &fs_delay);
last_insn = insn;
}
}
@@ -16768,6 +19009,9 @@ mips_set_compression_mode (unsigned int compression_mode)
if (TARGET_HARD_FLOAT_ABI && !TARGET_OLDABI)
sorry ("hard-float MIPS16 code for ABIs other than o32 and o64");
+
+ if (TARGET_MSA)
+ sorry ("MSA MIPS16 code");
}
else
{
@@ -16900,6 +19144,10 @@ mips_set_architecture (const struct mips_cpu_info *info)
mips_arch_info = info;
mips_arch = info->cpu;
mips_isa = info->isa;
+ if (mips_isa < 32)
+ mips_isa_rev = 0;
+ else
+ mips_isa_rev = (mips_isa & 31) + 1;
}
}
@@ -17009,7 +19257,10 @@ mips_option_override (void)
if ((target_flags_explicit & MASK_FLOAT64) != 0)
{
- if (TARGET_SINGLE_FLOAT && TARGET_FLOAT64)
+ if (mips_isa_rev >= 6 && !TARGET_FLOAT64)
+ error ("the %qs architecture does not support %<-mfp32%>",
+ mips_arch_info->name);
+ else if (TARGET_SINGLE_FLOAT && TARGET_FLOAT64)
error ("unsupported combination: %s", "-mfp64 -msingle-float");
else if (TARGET_64BIT && TARGET_DOUBLE_FLOAT && !TARGET_FLOAT64)
error ("unsupported combination: %s", "-mgp64 -mfp32 -mdouble-float");
@@ -17025,14 +19276,30 @@ mips_option_override (void)
}
else
{
- /* -msingle-float selects 32-bit float registers. Otherwise the
- float registers should be the same size as the integer ones. */
- if (TARGET_64BIT && TARGET_DOUBLE_FLOAT)
+ /* -msingle-float selects 32-bit float registers. On r6 and later,
+ -mdouble-float selects 64-bit float registers, since the old paired
+ register model is not supported. -mmsa selects 64-bit registers for
+ O32. In other cases the float registers should be the same size as
+ the integer ones. */
+ if (mips_isa_rev >= 6 && TARGET_DOUBLE_FLOAT && !TARGET_FLOATXX)
+ target_flags |= MASK_FLOAT64;
+ else if (TARGET_64BIT && TARGET_DOUBLE_FLOAT)
+ target_flags |= MASK_FLOAT64;
+ else if (mips_abi == ABI_32 && TARGET_MSA && !TARGET_FLOATXX)
target_flags |= MASK_FLOAT64;
else
target_flags &= ~MASK_FLOAT64;
}
+ if (mips_abi != ABI_32 && TARGET_FLOATXX)
+ error ("%<-mfpxx%> can only be used with the o32 ABI");
+ else if (TARGET_FLOAT64 && TARGET_FLOATXX)
+ error ("unsupported combination: %s", "-mfp64 -mfpxx");
+ else if (ISA_MIPS1 && !TARGET_FLOAT32)
+ error ("%<-march=%s%> requires %<-mfp32%>", mips_arch_info->name);
+ else if (TARGET_FLOATXX && !mips_lra_flag)
+ error ("%<-mfpxx%> requires %<-mlra%>");
+
/* End of code shared with GAS. */
/* The R5900 FPU only supports single precision. */
@@ -17120,6 +19387,49 @@ mips_option_override (void)
warning (0, "the %qs architecture does not support madd or msub"
" instructions", mips_arch_info->name);
+ /* If neither -modd-spreg nor -mno-odd-spreg was given on the command
+ line, set MASK_ODD_SPREG based on the ISA and ABI. */
+ if ((target_flags_explicit & MASK_ODD_SPREG) == 0)
+ {
+ /* Disable TARGET_ODD_SPREG when using the o32 FPXX ABI. */
+ if (!ISA_HAS_ODD_SPREG || TARGET_FLOATXX)
+ target_flags &= ~MASK_ODD_SPREG;
+ else
+ target_flags |= MASK_ODD_SPREG;
+ }
+ else if (TARGET_ODD_SPREG && !ISA_HAS_ODD_SPREG)
+ warning (0, "the %qs architecture does not support odd single-precision"
+ " registers", mips_arch_info->name);
+
+ if (!TARGET_ODD_SPREG && TARGET_64BIT)
+ {
+ error ("unsupported combination: %s", "-mgp64 -mno-odd-spreg");
+ /* Allow compilation to continue further even though invalid output
+ will be produced. */
+ target_flags |= MASK_ODD_SPREG;
+ }
+
+ if (!ISA_HAS_COMPACT_BRANCHES && mips_cb == MIPS_CB_ALWAYS)
+ {
+ error ("unsupported combination: %qs%s %s",
+ mips_arch_info->name, TARGET_MICROMIPS ? " -mmicromips" : "",
+ "-mcompact-branches=always");
+ }
+ else if (!ISA_HAS_DELAY_SLOTS && mips_cb == MIPS_CB_NEVER)
+ {
+ error ("unsupported combination: %qs%s %s",
+ mips_arch_info->name, TARGET_MICROMIPS ? " -mmicromips" : "",
+ "-mcompact-branches=never");
+ }
+
+ /* Require explicit relocs for MIPS R6 onwards. This enables simplification
+ of the compact branch and jump support through the backend. */
+ if (!TARGET_EXPLICIT_RELOCS && mips_isa_rev >= 6)
+ {
+ error ("unsupported combination: %qs %s",
+ mips_arch_info->name, "-mno-explicit-relocs");
+ }
+
/* The effect of -mabicalls isn't defined for the EABI. */
if (mips_abi == ABI_EABI && TARGET_ABICALLS)
{
@@ -17183,6 +19493,27 @@ mips_option_override (void)
}
}
+ /* Set NaN and ABS defaults. */
+ if (mips_nan == MIPS_IEEE_754_DEFAULT && !ISA_HAS_IEEE_754_LEGACY)
+ mips_nan = MIPS_IEEE_754_2008;
+ if (mips_abs == MIPS_IEEE_754_DEFAULT && !ISA_HAS_IEEE_754_LEGACY)
+ mips_abs = MIPS_IEEE_754_2008;
+
+ /* Check for IEEE 754 legacy/2008 support. */
+ if ((mips_nan == MIPS_IEEE_754_LEGACY
+ || mips_abs == MIPS_IEEE_754_LEGACY)
+ && !ISA_HAS_IEEE_754_LEGACY)
+ warning (0, "the %qs architecture does not support %<-m%s=legacy%>",
+ mips_arch_info->name,
+ mips_nan == MIPS_IEEE_754_LEGACY ? "nan" : "abs");
+
+ if ((mips_nan == MIPS_IEEE_754_2008
+ || mips_abs == MIPS_IEEE_754_2008)
+ && !ISA_HAS_IEEE_754_2008)
+ warning (0, "the %qs architecture does not support %<-m%s=2008%>",
+ mips_arch_info->name,
+ mips_nan == MIPS_IEEE_754_2008 ? "nan" : "abs");
+
/* Pre-IEEE 754-2008 MIPS hardware has a quirky almost-IEEE format
for all its floating point. */
if (mips_nan != MIPS_IEEE_754_2008)
@@ -17214,6 +19545,11 @@ mips_option_override (void)
TARGET_MIPS3D = 0;
}
+ /* Make sure that when TARGET_MSA is true, TARGET_FLOAT64 and
+ TARGET_HARD_FLOAT_ABI are both true. */
+ if (TARGET_MSA && !(TARGET_FLOAT64 && TARGET_HARD_FLOAT_ABI))
+ error ("%<-mmsa%> must be used with %<-mfp64%> and %<-mhard-float%>");
+
/* Make sure that -mpaired-single is only used on ISAs that support it.
We must disable it otherwise since it relies on other ISA properties
like ISA_HAS_8CC having their normal values. */
@@ -17237,6 +19573,14 @@ mips_option_override (void)
if (TARGET_DSPR2)
TARGET_DSP = true;
+ if (TARGET_DSPR3)
+ {
+ TARGET_DSP = true;
+ TARGET_DSPR2 = true;
+ }
+
/* .eh_frame addresses should be the same width as a C pointer.
Most MIPS ABIs support only one pointer size, so the assembler
will usually know exactly how big an .eh_frame address is.
@@ -17300,7 +19644,7 @@ mips_option_override (void)
for (mode = 0; mode < MAX_MACHINE_MODE; mode++)
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
mips_hard_regno_mode_ok[mode][regno]
- = mips_hard_regno_mode_ok_p (regno, (enum machine_mode) mode);
+ = mips_hard_regno_mode_ok_p (regno, (machine_mode) mode);
/* Function to allocate machine-dependent function status. */
init_machine_status = &mips_init_machine_status;
@@ -17417,6 +19761,10 @@ mips_conditional_register_usage (void)
AND_COMPL_HARD_REG_SET (accessible_reg_set,
reg_class_contents[(int) DSP_ACC_REGS]);
+ if (!ISA_HAS_HILO && !ISA_HAS_DSP)
+ AND_COMPL_HARD_REG_SET (accessible_reg_set,
+ reg_class_contents[(int) MD_REGS]);
+
if (!TARGET_HARD_FLOAT)
{
AND_COMPL_HARD_REG_SET (accessible_reg_set,
@@ -17431,7 +19779,8 @@ mips_conditional_register_usage (void)
RTL that refers directly to ST_REG_FIRST. */
AND_COMPL_HARD_REG_SET (accessible_reg_set,
reg_class_contents[(int) ST_REGS]);
- SET_HARD_REG_BIT (accessible_reg_set, FPSW_REGNUM);
+ if (!ISA_HAS_CCF)
+ SET_HARD_REG_BIT (accessible_reg_set, FPSW_REGNUM);
fixed_regs[FPSW_REGNUM] = call_used_regs[FPSW_REGNUM] = 1;
}
if (TARGET_MIPS16)
@@ -17482,8 +19831,10 @@ mips_conditional_register_usage (void)
call_really_used_regs[regno] = call_used_regs[regno] = 1;
}
/* Odd registers in the range $f21-$f31 (inclusive) are call-clobbered
- for n32. */
- if (mips_abi == ABI_N32)
+ for n32 and o32 FP64. */
+ if (mips_abi == ABI_N32
+ || (mips_abi == ABI_32
+ && TARGET_FLOAT64))
{
int regno;
for (regno = FP_REG_FIRST + 21; regno <= FP_REG_FIRST + 31; regno+=2)
@@ -17501,28 +19852,6 @@ mips_conditional_register_usage (void)
}
}
-/* When generating MIPS16 code, we want to allocate $24 (T_REG) before
- other registers for instructions for which it is possible. This
- encourages the compiler to use CMP in cases where an XOR would
- require some register shuffling. */
-
-void
-mips_order_regs_for_local_alloc (void)
-{
- int i;
-
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- reg_alloc_order[i] = i;
-
- if (TARGET_MIPS16)
- {
- /* It really doesn't matter where we put register 0, since it is
- a fixed register anyhow. */
- reg_alloc_order[0] = 24;
- reg_alloc_order[24] = 0;
- }
-}
-
/* Implement EH_USES. */
bool
@@ -17632,6 +19961,8 @@ mips_mulsidi3_gen_fn (enum rtx_code ext_code)
the extension is not needed for signed multiplication. In order to
ensure that we always remove the redundant sign-extension in this
case we still expand mulsidi3 for DMUL. */
+ if (ISA_HAS_R6DMUL)
+ return signed_p ? gen_mulsidi3_64bit_r6dmul : NULL;
if (ISA_HAS_DMUL3)
return signed_p ? gen_mulsidi3_64bit_dmul : NULL;
if (TARGET_MIPS16)
@@ -17644,6 +19975,8 @@ mips_mulsidi3_gen_fn (enum rtx_code ext_code)
}
else
{
+ if (ISA_HAS_R6MUL)
+ return (signed_p ? gen_mulsidi3_32bit_r6 : gen_umulsidi3_32bit_r6);
if (TARGET_MIPS16)
return (signed_p
? gen_mulsidi3_32bit_mips16
@@ -17809,6 +20142,64 @@ umips_load_store_pair_p_1 (bool load_p, bool swap_p,
return true;
}
+bool
+mips_load_store_bonding_p (rtx *operands, machine_mode mode, bool load_p)
+{
+ rtx reg1, reg2, mem1, mem2, base1, base2;
+ enum reg_class rc1, rc2;
+ HOST_WIDE_INT offset1, offset2;
+
+ if (load_p)
+ {
+ reg1 = operands[0];
+ reg2 = operands[2];
+ mem1 = operands[1];
+ mem2 = operands[3];
+ }
+ else
+ {
+ reg1 = operands[1];
+ reg2 = operands[3];
+ mem1 = operands[0];
+ mem2 = operands[2];
+ }
+
+ if (!mips_address_insns (XEXP (mem1, 0), mode, false)
+ || !mips_address_insns (XEXP (mem2, 0), mode, false))
+ return false;
+
+ mips_split_plus (XEXP (mem1, 0), &base1, &offset1);
+ mips_split_plus (XEXP (mem2, 0), &base2, &offset2);
+
+ /* Base regs do not match. */
+ if (!REG_P (base1) || !rtx_equal_p (base1, base2))
+ return false;
+
+ /* One of the loads clobbers the base register. */
+ if (load_p
+ && (REGNO (reg1) == REGNO (base1)
+ || (REGNO (reg2) == REGNO (base1))))
+ return false;
+
+ /* Both loads target the same register. */
+ if (load_p
+ && REGNO (reg1) == REGNO (reg2))
+ return false;
+
+ /* The loads/stores are not of the same register class. */
+ rc1 = REGNO_REG_CLASS (REGNO (reg1));
+ rc2 = REGNO_REG_CLASS (REGNO (reg2));
+ if (rc1 != rc2
+ && !reg_class_subset_p (rc1, rc2)
+ && !reg_class_subset_p (rc2, rc1))
+ return false;
+
+ if (abs (offset1 - offset2) != GET_MODE_SIZE (mode))
+ return false;
+
+ return true;
+}
+
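+/* A hypothetical example of a pair that satisfies the checks above:
+
+ lw $4, 0($6)
+ lw $5, 4($6)
+
+ Both loads share base register $6, neither destination clobbers the base,
+ the destinations are distinct registers of the same class, and the
+ offsets differ by GET_MODE_SIZE (SImode), so the pair is a bonding
+ candidate. */
+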
/* OPERANDS describes the operands to a pair of SETs, in the order
dest1, src1, dest2, src2. Return true if the operands can be used
in an LWP or SWP instruction; LOAD_P says which. */
@@ -18067,6 +20458,18 @@ mips_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
#undef OP
+ /* If we are using compact branches there are no delay slots, so place
+ the instruction that would have been in the delay slot before the
+ JRC instruction. */
+
+ if (TARGET_CB_ALWAYS)
+ {
+ rtx temp;
+ temp = trampoline[i-2];
+ trampoline[i-2] = trampoline[i-1];
+ trampoline[i-1] = temp;
+ }
+
/* Copy the trampoline code. Leave any padding uninitialized. */
for (j = 0; j < i; j++)
{
@@ -18145,7 +20548,7 @@ void mips_function_profiler (FILE *file)
when TARGET_LOONGSON_VECTORS is true. */
static unsigned HOST_WIDE_INT
-mips_shift_truncation_mask (enum machine_mode mode)
+mips_shift_truncation_mask (machine_mode mode)
{
if (TARGET_LOONGSON_VECTORS && VECTOR_MODE_P (mode))
return 0;
@@ -18184,13 +20587,13 @@ mips_prepare_pch_save (void)
/* Generate or test for an insn that supports a constant permutation. */
-#define MAX_VECT_LEN 8
+#define MAX_VECT_LEN 16
struct expand_vec_perm_d
{
rtx target, op0, op1;
unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
+ machine_mode vmode;
unsigned char nelt;
bool one_vector_p;
bool testing_p;
@@ -18228,7 +20631,7 @@ static bool
mips_expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
const unsigned char *perm, unsigned nelt)
{
- enum machine_mode v2mode;
+ machine_mode v2mode;
rtx x;
v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
@@ -18499,10 +20902,21 @@ mips_expand_vec_perm_const (rtx operands[4])
return ok;
}
+/* Implement TARGET_SCHED_REASSOCIATION_WIDTH. */
+
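+/* Returning 2 for MSA vector modes allows the reassociation pass to split
+ a long dependence chain, such as a vector sum reduction, into two
+ parallel accumulators; scalar operations keep the default width of 1. */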
+static int
+mips_sched_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
+ machine_mode mode)
+{
+ if (MSA_SUPPORTED_MODE_P (mode))
+ return 2;
+ return 1;
+}
+
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */
static bool
-mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+mips_vectorize_vec_perm_const_ok (machine_mode vmode,
const unsigned char *sel)
{
struct expand_vec_perm_d d;
@@ -18547,11 +20961,66 @@ mips_vectorize_vec_perm_const_ok (enum machine_mode vmode,
void
mips_expand_vec_unpack (rtx operands[2], bool unsigned_p, bool high_p)
{
- enum machine_mode imode = GET_MODE (operands[1]);
+ machine_mode imode = GET_MODE (operands[1]);
rtx (*unpack) (rtx, rtx, rtx);
- rtx (*cmpgt) (rtx, rtx, rtx);
+ rtx (*cmpFunc) (rtx, rtx, rtx);
rtx tmp, dest, zero;
+ if (ISA_HAS_MSA)
+ {
+ switch (imode)
+ {
+ case V4SImode:
+ if (BYTES_BIG_ENDIAN != high_p)
+ unpack = gen_msa_ilvl_w;
+ else
+ unpack = gen_msa_ilvr_w;
+
+ cmpFunc = gen_msa_clti_si_w;
+ break;
+
+ case V8HImode:
+ if (BYTES_BIG_ENDIAN != high_p)
+ unpack = gen_msa_ilvl_h;
+ else
+ unpack = gen_msa_ilvr_h;
+
+ cmpFunc = gen_msa_clti_si_h;
+ break;
+
+ case V16QImode:
+ if (BYTES_BIG_ENDIAN != high_p)
+ unpack = gen_msa_ilvl_b;
+ else
+ unpack = gen_msa_ilvr_b;
+
+ cmpFunc = gen_msa_clti_si_b;
+ break;
+
+ default:
+ gcc_unreachable ();
+ break;
+ }
+
+ if (!unsigned_p)
+ {
+ /* Extract the sign extension of each element by comparing each element
+ with immediate zero. */
+ tmp = gen_reg_rtx (imode);
+ emit_insn (cmpFunc (tmp, operands[1], CONST0_RTX (imode)));
+ }
+ else
+ {
+ tmp = force_reg (imode, CONST0_RTX (imode));
+ }
+
+ dest = gen_reg_rtx (imode);
+
+ emit_insn (unpack (dest, tmp, operands[1]));
+ emit_move_insn (operands[0], gen_lowpart (GET_MODE (operands[0]), dest));
+ return;
+ }
+
switch (imode)
{
case V8QImode:
@@ -18559,14 +21028,14 @@ mips_expand_vec_unpack (rtx operands[2], bool unsigned_p, bool high_p)
unpack = gen_loongson_punpckhbh;
else
unpack = gen_loongson_punpcklbh;
- cmpgt = gen_loongson_pcmpgtb;
+ cmpFunc = gen_loongson_pcmpgtb;
break;
case V4HImode:
if (high_p)
unpack = gen_loongson_punpckhhw;
else
unpack = gen_loongson_punpcklhw;
- cmpgt = gen_loongson_pcmpgth;
+ cmpFunc = gen_loongson_pcmpgth;
break;
default:
gcc_unreachable ();
@@ -18578,7 +21047,7 @@ mips_expand_vec_unpack (rtx operands[2], bool unsigned_p, bool high_p)
else
{
tmp = gen_reg_rtx (imode);
- emit_insn (cmpgt (tmp, zero, operands[1]));
+ emit_insn (cmpFunc (tmp, zero, operands[1]));
}
dest = gen_reg_rtx (imode);
@@ -18598,7 +21067,7 @@ mips_constant_elt_p (rtx x)
/* A subroutine of mips_expand_vec_init, expand via broadcast. */
static void
-mips_expand_vi_broadcast (enum machine_mode vmode, rtx target, rtx elt)
+mips_expand_vi_broadcast (machine_mode vmode, rtx target, rtx elt)
{
struct expand_vec_perm_d d;
rtx t1;
@@ -18638,7 +21107,7 @@ mips_expand_vi_broadcast (enum machine_mode vmode, rtx target, rtx elt)
elements of VALS with zeros, copy the constant vector to TARGET. */
static void
-mips_expand_vi_constant (enum machine_mode vmode, unsigned nelt,
+mips_expand_vi_constant (machine_mode vmode, unsigned nelt,
rtx target, rtx vals)
{
rtvec vec = shallow_copy_rtvec (XVEC (vals, 0));
@@ -18646,8 +21115,9 @@ mips_expand_vi_constant (enum machine_mode vmode, unsigned nelt,
for (i = 0; i < nelt; ++i)
{
- if (!mips_constant_elt_p (RTVEC_ELT (vec, i)))
- RTVEC_ELT (vec, i) = const0_rtx;
+ rtx elem = RTVEC_ELT (vec, i);
+ if (!mips_constant_elt_p (elem))
+ RTVEC_ELT (vec, i) = CONST0_RTX (GET_MODE (elem));
}
emit_move_insn (target, gen_rtx_CONST_VECTOR (vmode, vec));
@@ -18668,7 +21138,7 @@ mips_expand_vi_loongson_one_pinsrh (rtx target, rtx vals, unsigned one_var)
/* A subroutine of mips_expand_vec_init, expand anything via memory. */
static void
-mips_expand_vi_general (enum machine_mode vmode, enum machine_mode imode,
+mips_expand_vi_general (machine_mode vmode, machine_mode imode,
unsigned nelt, unsigned nvar, rtx target, rtx vals)
{
rtx mem = assign_stack_temp (vmode, GET_MODE_SIZE (vmode));
@@ -18692,8 +21162,8 @@ mips_expand_vi_general (enum machine_mode vmode, enum machine_mode imode,
void
mips_expand_vector_init (rtx target, rtx vals)
{
- enum machine_mode vmode = GET_MODE (target);
- enum machine_mode imode = GET_MODE_INNER (vmode);
+ machine_mode vmode = GET_MODE (target);
+ machine_mode imode = GET_MODE_INNER (vmode);
unsigned i, nelt = GET_MODE_NUNITS (vmode);
unsigned nvar = 0, one_var = -1u;
bool all_same = true;
@@ -18708,6 +21178,129 @@ mips_expand_vector_init (rtx target, rtx vals)
all_same = false;
}
+ if (TARGET_MSA)
+ {
+ if (all_same)
+ {
+ rtx same = XVECEXP (vals, 0, 0);
+ rtx temp, temp2;
+
+ if (CONST_INT_P (same) && nvar == 0
+ && mips_signed_immediate_p (INTVAL (same), 10, 0))
+ {
+ switch (vmode)
+ {
+ case V16QImode:
+ emit_insn (gen_msa_ldiv16qi (target, same));
+ return;
+
+ case V8HImode:
+ emit_insn (gen_msa_ldiv8hi (target, same));
+ return;
+
+ case V4SImode:
+ emit_insn (gen_msa_ldiv4si (target, same));
+ return;
+
+ case V2DImode:
+ emit_insn (gen_msa_ldiv2di (target, same));
+ return;
+
+ default:
+ break;
+ }
+ }
+ temp = gen_reg_rtx (imode);
+ if (imode == GET_MODE (same))
+ temp2 = same;
+ else if (GET_MODE_SIZE (imode) >= UNITS_PER_WORD)
+ temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), 0);
+ else
+ {
+ unsigned offset = 0;
+
+ if (TARGET_BIG_ENDIAN)
+ offset = GET_MODE_SIZE (GET_MODE (same)) - GET_MODE_SIZE (imode);
+ temp2 = simplify_gen_subreg (imode, same, GET_MODE (same), offset);
+ }
+ emit_move_insn (temp, temp2);
+
+ switch (vmode)
+ {
+ case V16QImode:
+ emit_insn (gen_msa_fill_b_insn (target, temp));
+ break;
+
+ case V8HImode:
+ emit_insn (gen_msa_fill_h_insn (target, temp));
+ break;
+
+ case V4SImode:
+ emit_insn (gen_msa_fill_w (target, temp));
+ break;
+
+ case V2DImode:
+ emit_insn (gen_msa_fill_d (target, temp));
+ break;
+
+ case V4SFmode:
+ emit_insn (gen_msa_splati_w_f_s (target, temp, const0_rtx));
+ break;
+
+ case V2DFmode:
+ emit_insn (gen_msa_splati_d_f_s (target, temp, const0_rtx));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return;
+ }
+ rtvec vec = shallow_copy_rtvec (XVEC (vals, 0));
+
+ for (i = 0; i < nelt; ++i)
+ RTVEC_ELT (vec, i) = CONST0_RTX (imode);
+
+ emit_move_insn (target, gen_rtx_CONST_VECTOR (vmode, vec));
+
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx temp = gen_reg_rtx (imode);
+ emit_move_insn (temp, XVECEXP (vals, 0, i));
+ switch (vmode)
+ {
+ case V16QImode:
+ emit_insn (gen_vec_setv16qi (target, temp, GEN_INT (i)));
+ break;
+
+ case V8HImode:
+ emit_insn (gen_vec_setv8hi (target, temp, GEN_INT (i)));
+ break;
+
+ case V4SImode:
+ emit_insn (gen_vec_setv4si (target, temp, GEN_INT (i)));
+ break;
+
+ case V2DImode:
+ emit_insn (gen_vec_setv2di (target, temp, GEN_INT (i)));
+ break;
+
+ case V4SFmode:
+ emit_insn (gen_vec_setv4sf (target, temp, GEN_INT (i)));
+ break;
+
+ case V2DFmode:
+ emit_insn (gen_vec_setv2df (target, temp, GEN_INT (i)));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ return;
+ }
+
/* Load constants from the pool, or whatever's handy. */
if (nvar == 0)
{
@@ -18750,7 +21343,7 @@ mips_expand_vector_init (rtx target, rtx vals)
void
mips_expand_vec_reduc (rtx target, rtx in, rtx (*gen)(rtx, rtx, rtx))
{
- enum machine_mode vmode = GET_MODE (in);
+ machine_mode vmode = GET_MODE (in);
unsigned char perm2[2];
rtx last, next, fold, x;
bool ok;
@@ -18822,7 +21415,7 @@ void
mips_expand_vec_minmax (rtx target, rtx op0, rtx op1,
rtx (*cmp) (rtx, rtx, rtx), bool min_p)
{
- enum machine_mode vmode = GET_MODE (target);
+ machine_mode vmode = GET_MODE (target);
rtx tc, t0, t1, x;
tc = gen_reg_rtx (vmode);
@@ -18843,6 +21436,445 @@ mips_expand_vec_minmax (rtx target, rtx op0, rtx op1,
emit_insn (gen_rtx_SET (VOIDmode, target, x));
}
+/* Implement HARD_REGNO_CALLER_SAVE_MODE. */
+
+machine_mode
+mips_hard_regno_caller_save_mode (unsigned int regno,
+ unsigned int nregs,
+ machine_mode mode)
+{
+ /* For performance, avoid saving/restoring upper parts of a register
+ by returning MODE as save mode when the mode is known. */
+ if (mode == VOIDmode)
+ return choose_hard_reg_mode (regno, nregs, false);
+ else
+ return mode;
+}
+
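+/* Expand a vector one's complement of SRC into DEST by emitting the MSA
+ NOR instruction with both source operands equal to SRC. */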
+static void
+mips_expand_msa_one_cmpl (rtx dest, rtx src)
+{
+ machine_mode mode = GET_MODE (dest);
+ switch (mode)
+ {
+ case V16QImode:
+ emit_insn (gen_msa_nor_v_b (dest, src, src));
+ break;
+ case V8HImode:
+ emit_insn (gen_msa_nor_v_h (dest, src, src));
+ break;
+ case V4SImode:
+ emit_insn (gen_msa_nor_v_w (dest, src, src));
+ break;
+ case V2DImode:
+ emit_insn (gen_msa_nor_v_d (dest, src, src));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+}
+
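+/* Emit the MSA comparison instruction that compares OP0 and OP1
+ element-wise using condition COND and stores the all-ones/all-zeros
+ result in DEST. */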
+static void
+mips_expand_msa_cmp (rtx dest, enum rtx_code cond, rtx op0, rtx op1)
+{
+ machine_mode cmp_mode = GET_MODE (op0);
+
+ switch (cmp_mode)
+ {
+ case V16QImode:
+ switch (cond)
+ {
+ case EQ:
+ emit_insn (gen_msa_ceq_b (dest, op0, op1));
+ break;
+ case LT:
+ emit_insn (gen_msa_clt_s_b (dest, op0, op1));
+ break;
+ case LE:
+ emit_insn (gen_msa_cle_s_b (dest, op0, op1));
+ break;
+ case LTU:
+ emit_insn (gen_msa_clt_u_b (dest, op0, op1));
+ break;
+ case LEU:
+ emit_insn (gen_msa_cle_u_b (dest, op0, op1));
+ break;
+ case GE: // swap
+ emit_insn (gen_msa_cle_s_b (dest, op1, op0));
+ break;
+ case GT: // swap
+ emit_insn (gen_msa_clt_s_b (dest, op1, op0));
+ break;
+ case GEU: // swap
+ emit_insn (gen_msa_cle_u_b (dest, op1, op0));
+ break;
+ case GTU: // swap
+ emit_insn (gen_msa_clt_u_b (dest, op1, op0));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case V8HImode:
+ switch (cond)
+ {
+ case EQ:
+ emit_insn (gen_msa_ceq_h (dest, op0, op1));
+ break;
+ case LT:
+ emit_insn (gen_msa_clt_s_h (dest, op0, op1));
+ break;
+ case LE:
+ emit_insn (gen_msa_cle_s_h (dest, op0, op1));
+ break;
+ case LTU:
+ emit_insn (gen_msa_clt_u_h (dest, op0, op1));
+ break;
+ case LEU:
+ emit_insn (gen_msa_cle_u_h (dest, op0, op1));
+ break;
+ case GE: // swap
+ emit_insn (gen_msa_cle_s_h (dest, op1, op0));
+ break;
+ case GT: // swap
+ emit_insn (gen_msa_clt_s_h (dest, op1, op0));
+ break;
+ case GEU: // swap
+ emit_insn (gen_msa_cle_u_h (dest, op1, op0));
+ break;
+ case GTU: // swap
+ emit_insn (gen_msa_clt_u_h (dest, op1, op0));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case V4SImode:
+ switch (cond)
+ {
+ case EQ:
+ emit_insn (gen_msa_ceq_w (dest, op0, op1));
+ break;
+ case LT:
+ emit_insn (gen_msa_clt_s_w (dest, op0, op1));
+ break;
+ case LE:
+ emit_insn (gen_msa_cle_s_w (dest, op0, op1));
+ break;
+ case LTU:
+ emit_insn (gen_msa_clt_u_w (dest, op0, op1));
+ break;
+ case LEU:
+ emit_insn (gen_msa_cle_u_w (dest, op0, op1));
+ break;
+ case GE: // swap
+ emit_insn (gen_msa_cle_s_w (dest, op1, op0));
+ break;
+ case GT: // swap
+ emit_insn (gen_msa_clt_s_w (dest, op1, op0));
+ break;
+ case GEU: // swap
+ emit_insn (gen_msa_cle_u_w (dest, op1, op0));
+ break;
+ case GTU: // swap
+ emit_insn (gen_msa_clt_u_w (dest, op1, op0));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case V2DImode:
+ switch (cond)
+ {
+ case EQ:
+ emit_insn (gen_msa_ceq_d (dest, op0, op1));
+ break;
+ case LT:
+ emit_insn (gen_msa_clt_s_d (dest, op0, op1));
+ break;
+ case LE:
+ emit_insn (gen_msa_cle_s_d (dest, op0, op1));
+ break;
+ case LTU:
+ emit_insn (gen_msa_clt_u_d (dest, op0, op1));
+ break;
+ case LEU:
+ emit_insn (gen_msa_cle_u_d (dest, op0, op1));
+ break;
+ case GE: // swap
+ emit_insn (gen_msa_cle_s_d (dest, op1, op0));
+ break;
+ case GT: // swap
+ emit_insn (gen_msa_clt_s_d (dest, op1, op0));
+ break;
+ case GEU: // swap
+ emit_insn (gen_msa_cle_u_d (dest, op1, op0));
+ break;
+ case GTU: // swap
+ emit_insn (gen_msa_clt_u_d (dest, op1, op0));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case V4SFmode:
+ switch (cond)
+ {
+ case UNORDERED:
+ emit_insn (gen_msa_fcun_w (dest, op0, op1));
+ break;
+ case EQ:
+ emit_insn (gen_msa_fceq_w (dest, op0, op1));
+ break;
+ case NE:
+ emit_insn (gen_msa_fcne_w (dest, op0, op1));
+ break;
+ case LTGT:
+ emit_insn (gen_msa_fcne_w (dest, op0, op1));
+ break;
+ case GT: // use slt, swap op0 and op1
+ emit_insn (gen_msa_fslt_w (dest, op1, op0));
+ break;
+ case GE: // use sle, swap op0 and op1
+ emit_insn (gen_msa_fsle_w (dest, op1, op0));
+ break;
+ case LT: // use slt
+ emit_insn (gen_msa_fslt_w (dest, op0, op1));
+ break;
+ case LE: // use sle
+ emit_insn (gen_msa_fsle_w (dest, op0, op1));
+ break;
+ case UNGE: // use cule, swap op0 and op1
+ emit_insn (gen_msa_fcule_w (dest, op1, op0));
+ break;
+ case UNGT: // use cult, swap op0 and op1
+ emit_insn (gen_msa_fcult_w (dest, op1, op0));
+ break;
+ case UNLE:
+ emit_insn (gen_msa_fcule_w (dest, op0, op1));
+ break;
+ case UNLT:
+ emit_insn (gen_msa_fcult_w (dest, op0, op1));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case V2DFmode:
+ switch (cond)
+ {
+ case UNORDERED:
+ emit_insn (gen_msa_fcun_d (dest, op0, op1));
+ break;
+ case EQ:
+ emit_insn (gen_msa_fceq_d (dest, op0, op1));
+ break;
+ case NE:
+ emit_insn (gen_msa_fcne_d (dest, op0, op1));
+ break;
+ case LTGT:
+ emit_insn (gen_msa_fcne_d (dest, op0, op1));
+ break;
+ case GT: // use slt, swap op0 and op1
+ emit_insn (gen_msa_fslt_d (dest, op1, op0));
+ break;
+ case GE: // use sle, swap op0 and op1
+ emit_insn (gen_msa_fsle_d (dest, op1, op0));
+ break;
+ case LT: // use slt
+ emit_insn (gen_msa_fslt_d (dest, op0, op1));
+ break;
+ case LE: // use sle
+ emit_insn (gen_msa_fsle_d (dest, op0, op1));
+ break;
+ case UNGE: // use cule, swap op0 and op1
+ emit_insn (gen_msa_fcule_d (dest, op1, op0));
+ break;
+ case UNGT: // use cult, swap op0 and op1
+ emit_insn (gen_msa_fcult_d (dest, op1, op0));
+ break;
+ case UNLE:
+ emit_insn (gen_msa_fcule_d (dest, op0, op1));
+ break;
+ case UNLT:
+ emit_insn (gen_msa_fcult_d (dest, op0, op1));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ break;
+ }
+}
+
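+/* If *COND is an integer condition with no direct MSA comparison
+ (currently only NE), replace it with the condition to test instead and
+ return true to indicate that the result must be inverted. */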
+static bool
+mips_msa_reversed_int_cond (enum rtx_code *cond)
+{
+ switch (*cond)
+ {
+ case NE:
+ *cond = EQ;
+ return true;
+
+ default:
+ return false;
+ }
+}
+
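+/* Likewise for floating-point conditions: ORDERED and UNEQ are handled
+ by testing the reversed condition and inverting the result. */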
+static bool
+mips_msa_reversed_fp_cond (enum rtx_code *code)
+{
+ switch (*code)
+ {
+ case ORDERED:
+ case UNEQ:
+ *code = reverse_condition_maybe_unordered (*code);
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* Generate RTL for comparing CMP_OP0 and CMP_OP1 using condition COND
+ and store the element-wise result -1 or 0 in DEST. TRUE_SRC and
+ FALSE_SRC must be -1 and 0 respectively. */
+
+static void
+mips_expand_msa_vcond (rtx dest, rtx true_src, rtx false_src,
+ enum rtx_code cond, rtx cmp_op0, rtx cmp_op1)
+{
+ machine_mode dest_mode = GET_MODE (dest);
+ machine_mode cmp_mode = GET_MODE (cmp_op0);
+ bool reversed_p;
+
+ if (FLOAT_MODE_P (cmp_mode))
+ reversed_p = mips_msa_reversed_fp_cond (&cond);
+ else
+ reversed_p = mips_msa_reversed_int_cond (&cond);
+
+ mips_expand_msa_cmp (dest, cond, cmp_op0, cmp_op1);
+ if (reversed_p)
+ mips_expand_msa_one_cmpl (dest, dest);
+
+ /* MSA vcond only produces result -1 and 0 for true and false. */
+ gcc_assert ((true_src == CONSTM1_RTX (dest_mode))
+ && (false_src == CONST0_RTX (dest_mode)));
+}
+
+/* Expand VEC_COND_EXPR, where:
+ MODE is the mode of the result
+ VIMODE is the equivalent integer mode
+ OPERANDS are the operands of the VEC_COND_EXPR
+ gen_msa_and_fn is used to generate a VIMODE vector MSA AND
+ gen_msa_nor_fn is used to generate a VIMODE vector MSA NOR
+ gen_msa_ior_fn is used to generate a VIMODE vector MSA IOR. */
+
+void
+mips_expand_vec_cond_expr (machine_mode mode,
+ machine_mode vimode,
+ rtx *operands,
+ rtx (*gen_msa_and_fn)(rtx, rtx, rtx),
+ rtx (*gen_msa_nor_fn)(rtx, rtx, rtx),
+ rtx (*gen_msa_ior_fn)(rtx, rtx, rtx))
+{
+ rtx true_val = CONSTM1_RTX (vimode);
+ rtx false_val = CONST0_RTX (vimode);
+
+ if (operands[1] == true_val && operands[2] == false_val)
+ mips_expand_msa_vcond (operands[0], operands[1], operands[2],
+ GET_CODE (operands[3]), operands[4], operands[5]);
+ else
+ {
+ rtx res = gen_reg_rtx (vimode);
+ rtx temp1 = gen_reg_rtx (vimode);
+ rtx temp2 = gen_reg_rtx (vimode);
+ rtx xres = gen_reg_rtx (vimode);
+
+ mips_expand_msa_vcond (res, true_val, false_val,
+ GET_CODE (operands[3]), operands[4], operands[5]);
+
+ /* This results in a vector whose true/false elements have the value
+ -1 or 0 respectively. The result may need adjusting if the requested
+ true/false values in operands[1]/operands[2] are different. */
+
+ /* Adjust the true elements to be operands[1]. */
+ emit_move_insn (xres, res);
+ if (operands[1] != true_val)
+ {
+ rtx xop1 = operands[1]; /* Assume we can use operands[1]. */
+
+ if (mode != vimode)
+ {
+ xop1 = gen_reg_rtx (vimode);
+ if (GET_CODE (operands[1]) == CONST_VECTOR)
+ {
+ rtx xtemp = gen_reg_rtx (mode);
+ emit_move_insn (xtemp, operands[1]);
+ emit_move_insn (xop1,
+ gen_rtx_SUBREG (vimode, xtemp, 0));
+ }
+ else
+ emit_move_insn (xop1,
+ gen_rtx_SUBREG (vimode, operands[1], 0));
+ }
+ else if (GET_CODE (operands[1]) == CONST_VECTOR)
+ {
+ xop1 = gen_reg_rtx (mode);
+ emit_move_insn (xop1, operands[1]);
+ }
+
+ emit_insn (gen_msa_and_fn (temp1, xres, xop1));
+ }
+ else
+ emit_move_insn (temp1, xres);
+
+ /* Adjust the false elements to be operands[2]. */
+ emit_insn (gen_msa_nor_fn (temp2, xres, xres));
+ if (operands[2] != false_val)
+ {
+ rtx xop2 = operands[2]; /* Assume we can use operands[2]. */
+
+ if (mode != vimode)
+ {
+ xop2 = gen_reg_rtx (vimode);
+ if (GET_CODE (operands[2]) == CONST_VECTOR)
+ {
+ rtx xtemp = gen_reg_rtx (mode);
+ emit_move_insn (xtemp, operands[2]);
+ emit_move_insn (xop2,
+ gen_rtx_SUBREG (vimode, xtemp, 0));
+ }
+ else
+ emit_move_insn (xop2,
+ gen_rtx_SUBREG (vimode, operands[2], 0));
+ }
+ else if (GET_CODE (operands[2]) == CONST_VECTOR)
+ {
+ xop2 = gen_reg_rtx (mode);
+ emit_move_insn (xop2, operands[2]);
+ }
+
+ emit_insn (gen_msa_and_fn (temp2, temp2, xop2));
+ }
+ else
+ emit_insn (gen_msa_and_fn (temp2, temp2, xres));
+
+ /* Combine together into result. */
+ emit_insn (gen_msa_ior_fn (xres, temp1, temp2));
+ emit_move_insn (operands[0],
+ gen_rtx_SUBREG (mode, xres, 0));
+ }
+}
+
/* Implement TARGET_CASE_VALUES_THRESHOLD. */
unsigned int
@@ -18897,6 +21929,25 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
*update = build2 (COMPOUND_EXPR, void_type_node, *update,
atomic_feraiseexcept_call);
}
+
+/* Implement TARGET_SPILL_CLASS. */
+
+static reg_class_t
+mips_spill_class (reg_class_t rclass ATTRIBUTE_UNUSED,
+ machine_mode mode ATTRIBUTE_UNUSED)
+{
+ if (TARGET_MIPS16)
+ return SPILL_REGS;
+ return NO_REGS;
+}
+
+/* Implement TARGET_LRA_P. */
+
+static bool
+mips_lra_p (void)
+{
+ return mips_lra_flag;
+}
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
@@ -18960,6 +22011,8 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#define TARGET_VALID_POINTER_MODE mips_valid_pointer_mode
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST mips_register_move_cost
+#undef TARGET_REGISTER_PRIORITY
+#define TARGET_REGISTER_PRIORITY mips_register_priority
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST mips_memory_move_cost
#undef TARGET_RTX_COSTS
@@ -19041,10 +22094,17 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#define TARGET_FUNCTION_ARG_ADVANCE mips_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY mips_function_arg_boundary
+#undef TARGET_GET_RAW_RESULT_MODE
+#define TARGET_GET_RAW_RESULT_MODE mips_get_reg_raw_mode
+#undef TARGET_GET_RAW_ARG_MODE
+#define TARGET_GET_RAW_ARG_MODE mips_get_reg_raw_mode
#undef TARGET_MODE_REP_EXTENDED
#define TARGET_MODE_REP_EXTENDED mips_mode_rep_extended
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+ mips_builtin_vectorized_function
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P mips_vector_mode_supported_p
@@ -19053,6 +22113,9 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE mips_preferred_simd_mode
+#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
+#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
+ mips_autovectorize_vector_sizes
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS mips_init_builtins
@@ -19097,6 +22160,8 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#endif
#undef TARGET_DWARF_REGISTER_SPAN
#define TARGET_DWARF_REGISTER_SPAN mips_dwarf_register_span
+#undef TARGET_DWARF_FRAME_REG_MODE
+#define TARGET_DWARF_FRAME_REG_MODE mips_dwarf_frame_reg_mode
#undef TARGET_ASM_FINAL_POSTSCAN_INSN
#define TARGET_ASM_FINAL_POSTSCAN_INSN mips_final_postscan_insn
@@ -19128,12 +22193,32 @@ mips_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK mips_vectorize_vec_perm_const_ok
+#undef TARGET_SCHED_REASSOCIATION_WIDTH
+#define TARGET_SCHED_REASSOCIATION_WIDTH mips_sched_reassociation_width
+
#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD mips_case_values_threshold
#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV mips_atomic_assign_expand_fenv
+#undef TARGET_SPILL_CLASS
+#define TARGET_SPILL_CLASS mips_spill_class
+#undef TARGET_LRA_P
+#define TARGET_LRA_P mips_lra_p
+
+#undef TARGET_SCHED_INIT_GLOBAL
+#define TARGET_SCHED_INIT_GLOBAL mips_sched_init_global
+
+#undef TARGET_SCHED_FINISH_GLOBAL
+#define TARGET_SCHED_FINISH_GLOBAL mips_sched_finish_global
+
+#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
+#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK mips_evaluation_hook
+
+#undef TARGET_SCHED_SET_SCHED_FLAGS
+#define TARGET_SCHED_SET_SCHED_FLAGS mips_set_sched_flags
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-mips.h"
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index bedc45b54aa..f46fec5f7d6 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -92,6 +92,21 @@ struct mips_cpu_info {
/* True if we are generating position-independent VxWorks RTP code. */
#define TARGET_RTP_PIC (TARGET_VXWORKS_RTP && flag_pic)
+/* True when compact branches must never, may, or must always be used,
+ depending on the compact branch policy and the ISA. */
+#define TARGET_CB_NEVER (mips_cb == MIPS_CB_NEVER \
+ || (mips_cb == MIPS_CB_OPTIMAL \
+ && !ISA_HAS_COMPACT_BRANCHES))
+#define TARGET_CB_MAYBE (TARGET_CB_ALWAYS \
+ || (mips_cb == MIPS_CB_OPTIMAL \
+ && ISA_HAS_COMPACT_BRANCHES))
+#define TARGET_CB_ALWAYS (mips_cb == MIPS_CB_ALWAYS \
+ || (mips_cb == MIPS_CB_OPTIMAL \
+ && !ISA_HAS_DELAY_SLOTS))
+
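+/* A compact jump-register ('jrc'-style) instruction is available, either
+ because the ISA has compact branches or because microMIPS is in use
+ with the optimal compact branch policy. */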
+#define ISA_HAS_JRC (ISA_HAS_COMPACT_BRANCHES \
+ || (TARGET_MICROMIPS \
+ && mips_cb == MIPS_CB_OPTIMAL))
+
/* True if the output file is marked as ".abicalls; .option pic0"
(-call_nonpic). */
#define TARGET_ABICALLS_PIC0 \
@@ -181,6 +196,14 @@ struct mips_cpu_info {
#define ISA_HAS_DSP_MULT ISA_HAS_DSPR2
#endif
+/* ISA has LSA available. */
+#define ISA_HAS_LSA (mips_isa_rev >= 6 || ISA_HAS_MSA)
+
+/* ISA has DLSA available. */
+#define ISA_HAS_DLSA (TARGET_64BIT \
+ && (mips_isa_rev >= 6 \
+ || ISA_HAS_MSA))
+
/* The ISA compression flags that are currently in effect. */
#define TARGET_COMPRESSION (target_flags & (MASK_MIPS16 | MASK_MICROMIPS))
@@ -208,8 +231,14 @@ struct mips_cpu_info {
#define ISA_MIPS4 (mips_isa == 4)
#define ISA_MIPS32 (mips_isa == 32)
#define ISA_MIPS32R2 (mips_isa == 33)
+#define ISA_MIPS32R3 (mips_isa == 34)
+#define ISA_MIPS32R5 (mips_isa == 36)
+#define ISA_MIPS32R6 (mips_isa == 37)
#define ISA_MIPS64 (mips_isa == 64)
#define ISA_MIPS64R2 (mips_isa == 65)
+#define ISA_MIPS64R3 (mips_isa == 66)
+#define ISA_MIPS64R5 (mips_isa == 68)
+#define ISA_MIPS64R6 (mips_isa == 69)
/* Architecture target defines. */
#define TARGET_LOONGSON_2E (mips_arch == PROCESSOR_LOONGSON_2E)
@@ -226,8 +255,10 @@ struct mips_cpu_info {
#define TARGET_MIPS7000 (mips_arch == PROCESSOR_R7000)
#define TARGET_MIPS9000 (mips_arch == PROCESSOR_R9000)
#define TARGET_OCTEON (mips_arch == PROCESSOR_OCTEON \
- || mips_arch == PROCESSOR_OCTEON2)
-#define TARGET_OCTEON2 (mips_arch == PROCESSOR_OCTEON2)
+ || mips_arch == PROCESSOR_OCTEON2 \
+ || mips_arch == PROCESSOR_OCTEON3)
+#define TARGET_OCTEON2 (mips_arch == PROCESSOR_OCTEON2 \
+ || mips_arch == PROCESSOR_OCTEON3)
#define TARGET_SB1 (mips_arch == PROCESSOR_SB1 \
|| mips_arch == PROCESSOR_SB1A)
#define TARGET_SR71K (mips_arch == PROCESSOR_SR71000)
@@ -257,9 +288,12 @@ struct mips_cpu_info {
#define TUNE_MIPS7000 (mips_tune == PROCESSOR_R7000)
#define TUNE_MIPS9000 (mips_tune == PROCESSOR_R9000)
#define TUNE_OCTEON (mips_tune == PROCESSOR_OCTEON \
- || mips_tune == PROCESSOR_OCTEON2)
+ || mips_tune == PROCESSOR_OCTEON2 \
+ || mips_tune == PROCESSOR_OCTEON3)
#define TUNE_SB1 (mips_tune == PROCESSOR_SB1 \
|| mips_tune == PROCESSOR_SB1A)
+#define TUNE_P5600 (mips_tune == PROCESSOR_P5600)
+#define TUNE_I6400 (mips_tune == PROCESSOR_I6400)
/* Whether vector modes and intrinsics for ST Microelectronics
Loongson-2E/2F processors should be enabled. In o32 pairs of
@@ -302,7 +336,8 @@ struct mips_cpu_info {
#define TUNE_MACC_CHAINS (TUNE_MIPS5500 \
|| TUNE_MIPS4120 \
|| TUNE_MIPS4130 \
- || TUNE_24K)
+ || TUNE_24K \
+ || TUNE_P5600)
#define TARGET_OLDABI (mips_abi == ABI_32 || mips_abi == ABI_O64)
#define TARGET_NEWABI (mips_abi == ABI_N32 || mips_abi == ABI_64)
@@ -314,6 +349,15 @@ struct mips_cpu_info {
#define TARGET_HARD_FLOAT (TARGET_HARD_FLOAT_ABI && !TARGET_MIPS16)
#define TARGET_SOFT_FLOAT (TARGET_SOFT_FLOAT_ABI || TARGET_MIPS16)
+/* TARGET_FLOAT64 represents -mfp64 and TARGET_FLOATXX represents
+ -mfpxx, derive TARGET_FLOAT32 to represent -mfp32. */
+#define TARGET_FLOAT32 (!TARGET_FLOAT64 && !TARGET_FLOATXX)
+
+/* TARGET_O32_FP64A_ABI represents all the conditions that form the
+ o32 FP64A ABI extension (-mabi=32 -mfp64 -mno-odd-spreg). */
+#define TARGET_O32_FP64A_ABI (mips_abi == ABI_32 && TARGET_FLOAT64 \
+ && !TARGET_ODD_SPREG)
+
/* False if SC acts as a memory barrier with respect to itself,
otherwise a SYNC will be emitted after SC for atomic operations
that require ordering between the SC and following loads and
@@ -382,6 +426,8 @@ struct mips_cpu_info {
\
if (TARGET_FLOAT64) \
builtin_define ("__mips_fpr=64"); \
+ else if (TARGET_FLOATXX) \
+ builtin_define ("__mips_fpr=0"); \
else \
builtin_define ("__mips_fpr=32"); \
\
@@ -411,10 +457,21 @@ struct mips_cpu_info {
builtin_define ("__mips_dspr2"); \
builtin_define ("__mips_dsp_rev=2"); \
} \
+ else if (TARGET_DSPR3) \
+ { \
+ builtin_define ("__mips_dspr3"); \
+ builtin_define ("__mips_dsp_rev=3"); \
+ } \
else \
builtin_define ("__mips_dsp_rev=1"); \
} \
\
+ if (TARGET_MSA) \
+ { \
+ builtin_define ("__mips_msa"); \
+ builtin_define ("__mips_msa_width=128"); \
+ } \
+ \
MIPS_CPP_SET_PROCESSOR ("_MIPS_ARCH", mips_arch_info); \
MIPS_CPP_SET_PROCESSOR ("_MIPS_TUNE", mips_tune_info); \
\
@@ -438,30 +495,19 @@ struct mips_cpu_info {
builtin_define ("__mips=4"); \
builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS4"); \
} \
- else if (ISA_MIPS32) \
+ else if (mips_isa >= 32 && mips_isa < 64) \
{ \
builtin_define ("__mips=32"); \
- builtin_define ("__mips_isa_rev=1"); \
builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS32"); \
} \
- else if (ISA_MIPS32R2) \
- { \
- builtin_define ("__mips=32"); \
- builtin_define ("__mips_isa_rev=2"); \
- builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS32"); \
- } \
- else if (ISA_MIPS64) \
+ else if (mips_isa >= 64) \
{ \
builtin_define ("__mips=64"); \
- builtin_define ("__mips_isa_rev=1"); \
- builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS64"); \
- } \
- else if (ISA_MIPS64R2) \
- { \
- builtin_define ("__mips=64"); \
- builtin_define ("__mips_isa_rev=2"); \
builtin_define ("_MIPS_ISA=_MIPS_ISA_MIPS64"); \
} \
+ if (mips_isa_rev > 0) \
+ builtin_define_with_int_value ("__mips_isa_rev", \
+ mips_isa_rev); \
\
switch (mips_abi) \
{ \
@@ -491,6 +537,8 @@ struct mips_cpu_info {
builtin_define_with_int_value ("_MIPS_SZPTR", POINTER_SIZE); \
builtin_define_with_int_value ("_MIPS_FPSET", \
32 / MAX_FPRS_PER_FMT); \
+ builtin_define_with_int_value ("_MIPS_SPFPSET", \
+ TARGET_ODD_SPREG ? 32 : 16); \
\
/* These defines reflect the ABI in use, not whether the \
FPU is directly accessible. */ \
@@ -632,10 +680,14 @@ struct mips_cpu_info {
#define MULTILIB_ISA_DEFAULT "mips32"
#elif MIPS_ISA_DEFAULT == 33
#define MULTILIB_ISA_DEFAULT "mips32r2"
+#elif MIPS_ISA_DEFAULT == 37
+#define MULTILIB_ISA_DEFAULT "mips32r6"
#elif MIPS_ISA_DEFAULT == 64
#define MULTILIB_ISA_DEFAULT "mips64"
#elif MIPS_ISA_DEFAULT == 65
#define MULTILIB_ISA_DEFAULT "mips64r2"
+#elif MIPS_ISA_DEFAULT == 69
+#define MULTILIB_ISA_DEFAULT "mips64r6"
#else
#define MULTILIB_ISA_DEFAULT "mips1"
#endif
@@ -700,9 +752,15 @@ struct mips_cpu_info {
%{march=mips32|march=4kc|march=4km|march=4kp|march=4ksc:-mips32} \
%{march=mips32r2|march=m4k|march=4ke*|march=4ksd|march=24k* \
|march=34k*|march=74k*|march=m14k*|march=1004k*: -mips32r2} \
+ %{march=mips32r3: -mips32r3} \
+ %{march=mips32r5|march=p5600: -mips32r5} \
+ %{march=mips32r6: -mips32r6} \
%{march=mips64|march=5k*|march=20k*|march=sb1*|march=sr71000 \
|march=xlr: -mips64} \
%{march=mips64r2|march=loongson3a|march=octeon|march=xlp: -mips64r2} \
+ %{march=mips64r3: -mips64r3} \
+ %{march=mips64r5: -mips64r5} \
+ %{march=mips64r6|march=i6400: -mips64r6} \
%{!march=*: -" MULTILIB_ISA_DEFAULT "}}"
/* A spec that infers a -mhard-float or -msoft-float setting from an
@@ -722,10 +780,20 @@ struct mips_cpu_info {
#define MIPS_32BIT_OPTION_SPEC \
"mips1|mips2|mips32*|mgp32"
+/* A spec condition that matches architectures that should be targeted
+ with o32 FPXX for compatibility reasons. */
+#define MIPS_FPXX_OPTION_SPEC \
+ "mips2|mips3|mips4|mips5|mips32|mips32r2|mips32r3|mips32r5| \
+ mips64|mips64r2|mips64r3|mips64r5"
+
/* Infer a -msynci setting from a -mips argument, on the assumption that
-msynci is desired where possible. */
#define MIPS_ISA_SYNCI_SPEC \
- "%{msynci|mno-synci:;:%{mips32r2|mips64r2:-msynci;:-mno-synci}}"
+ "%{msynci|mno-synci:;:%{mips32r2|mips32r3|mips32r5|mips32r6|mips64r2 \
+ |mips64r3|mips64r5|mips64r6:-msynci;:-mno-synci}}"
+
+#define MIPS_ISA_NAN2008_SPEC \
+ "%{mnan*:;mips32r6|mips64r6:-mnan=2008}"
#if (MIPS_ABI_DEFAULT == ABI_O64 \
|| MIPS_ABI_DEFAULT == ABI_N32 \
@@ -745,7 +813,13 @@ struct mips_cpu_info {
--with-abi is ignored if -mabi is specified.
--with-float is ignored if -mhard-float or -msoft-float are
specified.
+ --with-fpu is ignored if -msoft-float, -msingle-float or -mdouble-float are
+ specified.
--with-nan is ignored if -mnan is specified.
+ --with-fp-32 is ignored if -msoft-float, -msingle-float, -mmsa or -mfp are
+ specified.
+ --with-odd-spreg-32 is ignored if -msoft-float, -msingle-float, -modd-spreg
+ or -mno-odd-spreg are specified.
--with-divide is ignored if -mdivide-traps or -mdivide-breaks are
specified. */
#define OPTION_DEFAULT_SPECS \
@@ -757,8 +831,12 @@ struct mips_cpu_info {
{"tune_64", "%{" OPT_ARCH64 ":%{!mtune=*:-mtune=%(VALUE)}}" }, \
{"abi", "%{!mabi=*:-mabi=%(VALUE)}" }, \
{"float", "%{!msoft-float:%{!mhard-float:-m%(VALUE)-float}}" }, \
- {"fpu", "%{!msingle-float:%{!mdouble-float:-m%(VALUE)-float}}" }, \
+ {"fpu", "%{!msoft-float:%{!msingle-float:%{!mdouble-float:-m%(VALUE)-float}}}" }, \
{"nan", "%{!mnan=*:-mnan=%(VALUE)}" }, \
+ {"fp_32", "%{" OPT_ARCH32 \
+ ":%{!msoft-float:%{!msingle-float:%{!mfp*:%{!mmsa:-mfp%(VALUE)}}}}}" }, \
+ {"odd_spreg_32", "%{" OPT_ARCH32 ":%{!msoft-float:%{!msingle-float:" \
+ "%{!modd-spreg:%{!mno-odd-spreg:-m%(VALUE)}}}}}" }, \
{"divide", "%{!mdivide-traps:%{!mdivide-breaks:-mdivide-%(VALUE)}}" }, \
{"llsc", "%{!mllsc:%{!mno-llsc:-m%(VALUE)}}" }, \
{"mips-plt", "%{!mplt:%{!mno-plt:-m%(VALUE)}}" }, \
@@ -766,11 +844,14 @@ struct mips_cpu_info {
/* A spec that infers the -mdsp setting from an -march argument. */
#define BASE_DRIVER_SELF_SPECS \
+ MIPS_ISA_NAN2008_SPEC, \
"%{!mno-dsp: \
%{march=24ke*|march=34kc*|march=34kf*|march=34kx*|march=1004k*: -mdsp} \
%{march=74k*|march=m14ke*: %{!mno-dspr2: -mdspr2 -mdsp}}}"
-#define DRIVER_SELF_SPECS BASE_DRIVER_SELF_SPECS
+#define DRIVER_SELF_SPECS \
+ MIPS_ISA_LEVEL_SPEC, \
+ BASE_DRIVER_SELF_SPECS
#define GENERATE_DIVIDE_TRAPS (TARGET_DIVIDE_TRAPS \
&& ISA_HAS_COND_TRAP)
@@ -801,12 +882,27 @@ struct mips_cpu_info {
#define ISA_HAS_64BIT_REGS (ISA_MIPS3 \
|| ISA_MIPS4 \
|| ISA_MIPS64 \
- || ISA_MIPS64R2)
+ || ISA_MIPS64R2 \
+ || ISA_MIPS64R3 \
+ || ISA_MIPS64R5 \
+ || ISA_MIPS64R6)
+
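+/* ISA has the JR instruction. */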
+#define ISA_HAS_JR (mips_isa_rev <= 5)
+
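+/* ISA has branch delay slots. */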
+#define ISA_HAS_DELAY_SLOTS 1
+
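+/* ISA has compact branch instructions, which have no delay slot. */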
+#define ISA_HAS_COMPACT_BRANCHES (mips_isa_rev >= 6)
/* ISA has branch likely instructions (e.g. mips2). */
/* Disable branchlikely for tx39 until compare rewrite. They haven't
been generated up to this point. */
-#define ISA_HAS_BRANCHLIKELY (!ISA_MIPS1)
+#define ISA_HAS_BRANCHLIKELY (!ISA_MIPS1 && mips_isa_rev <= 5)
+
+/* ISA has 32 single-precision registers. */
+#define ISA_HAS_ODD_SPREG ((mips_isa_rev >= 1 \
+ && !TARGET_LOONGSON_3A) \
+ || TARGET_FLOAT64 \
+ || TARGET_MIPS5900)
/* ISA has a three-operand multiplication instruction (usually spelt "mul"). */
#define ISA_HAS_MUL3 ((TARGET_MIPS3900 \
@@ -816,10 +912,8 @@ struct mips_cpu_info {
|| TARGET_MIPS7000 \
|| TARGET_MIPS9000 \
|| TARGET_MAD \
- || ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2) \
+ || (mips_isa_rev >= 1 \
+ && mips_isa_rev <= 5)) \
&& !TARGET_MIPS16)
/* ISA has a three-operand multiplication instruction. */
@@ -827,33 +921,48 @@ struct mips_cpu_info {
&& TARGET_OCTEON \
&& !TARGET_MIPS16)
+/* ISA has HI and LO registers. */
+#define ISA_HAS_HILO (mips_isa_rev <= 5)
+
/* ISA supports instructions DMULT and DMULTU. */
-#define ISA_HAS_DMULT (TARGET_64BIT && !TARGET_MIPS5900)
+#define ISA_HAS_DMULT (TARGET_64BIT \
+ && !TARGET_MIPS5900 \
+ && ISA_HAS_HILO)
-/* ISA supports instructions MULT and MULTU.
- This is always true, but the macro is needed for ISA_HAS_<D>MULT
- in mips.md. */
-#define ISA_HAS_MULT (1)
+/* ISA supports instructions MULT and MULTU. */
+#define ISA_HAS_MULT ISA_HAS_HILO
+
+/* ISA supports instructions MUL, MULU, MUH, MUHU. */
+#define ISA_HAS_R6MUL (mips_isa_rev >= 6)
+
+/* ISA supports instructions DMUL, DMULU, DMUH, DMUHU. */
+#define ISA_HAS_R6DMUL (TARGET_64BIT && mips_isa_rev >= 6)
/* ISA supports instructions DDIV and DDIVU. */
-#define ISA_HAS_DDIV (TARGET_64BIT && !TARGET_MIPS5900)
+#define ISA_HAS_DDIV (TARGET_64BIT \
+ && !TARGET_MIPS5900 \
+ && mips_isa_rev <= 5)
/* ISA supports instructions DIV and DIVU.
This is always true, but the macro is needed for ISA_HAS_<D>DIV
in mips.md. */
-#define ISA_HAS_DIV (1)
+#define ISA_HAS_DIV (mips_isa_rev <= 5)
#define ISA_HAS_DIV3 ((TARGET_LOONGSON_2EF \
|| TARGET_LOONGSON_3A) \
&& !TARGET_MIPS16)
+/* ISA supports instructions DIV, DIVU, MOD and MODU. */
+#define ISA_HAS_R6DIV (mips_isa_rev >= 6)
+
+/* ISA supports instructions DDIV, DDIVU, DMOD and DMODU. */
+#define ISA_HAS_R6DDIV (TARGET_64BIT && mips_isa_rev >= 6)
+
/* ISA has the floating-point conditional move instructions introduced
in mips4. */
#define ISA_HAS_FP_CONDMOVE ((ISA_MIPS4 \
- || ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2) \
+ || (mips_isa_rev >= 1 \
+ && mips_isa_rev <= 5)) \
&& !TARGET_MIPS5500 \
&& !TARGET_MIPS16)
@@ -871,19 +980,23 @@ struct mips_cpu_info {
/* ISA has the mips4 FP condition code instructions: FP-compare to CC,
branch on CC, and move (both FP and non-FP) on CC. */
#define ISA_HAS_8CC (ISA_MIPS4 \
- || ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2)
+ || (mips_isa_rev >= 1 \
+ && mips_isa_rev <= 5))
+
+/* ISA has the FP condition code instructions that store the flag in an
+ FP register. */
+#define ISA_HAS_CCF (mips_isa_rev >= 6)
+
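+/* ISA has the Release 6 SEL-style select instructions. */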
+#define ISA_HAS_SEL (mips_isa_rev >= 6)
/* This is a catch all for other mips4 instructions: indexed load, the
FP madd and msub instructions, and the FP recip and recip sqrt
instructions. Note that this macro should only be used by other
ISA_HAS_* macros. */
#define ISA_HAS_FP4 ((ISA_MIPS4 \
- || ISA_MIPS32R2 \
|| ISA_MIPS64 \
- || ISA_MIPS64R2) \
+ || (mips_isa_rev >= 2 \
+ && mips_isa_rev <= 5)) \
&& !TARGET_MIPS16)
/* ISA has floating-point indexed load and store instructions
@@ -891,17 +1004,22 @@ struct mips_cpu_info {
#define ISA_HAS_LXC1_SXC1 ISA_HAS_FP4
/* ISA has paired-single instructions. */
-#define ISA_HAS_PAIRED_SINGLE (ISA_MIPS32R2 || ISA_MIPS64 || ISA_MIPS64R2)
+#define ISA_HAS_PAIRED_SINGLE (ISA_MIPS64 \
+ || (mips_isa_rev >= 2 \
+ && mips_isa_rev <= 5))
/* ISA has conditional trap instructions. */
#define ISA_HAS_COND_TRAP (!ISA_MIPS1 \
&& !TARGET_MIPS16)
+/* ISA has conditional trap with immediate instructions. */
+#define ISA_HAS_COND_TRAPI (!ISA_MIPS1 \
+ && mips_isa_rev <= 5 \
+ && !TARGET_MIPS16)
+
/* ISA has integer multiply-accumulate instructions, madd and msub. */
-#define ISA_HAS_MADD_MSUB (ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2)
+#define ISA_HAS_MADD_MSUB (mips_isa_rev >= 1 \
+ && mips_isa_rev <= 5)
/* Integer multiply-accumulate instructions should be generated. */
#define GENERATE_MADD_MSUB (TARGET_IMADD && !TARGET_MIPS16)
@@ -909,6 +1027,9 @@ struct mips_cpu_info {
/* ISA has floating-point madd and msub instructions 'd = a * b [+-] c'. */
#define ISA_HAS_FP_MADD4_MSUB4 ISA_HAS_FP4
+/* ISA has floating-point MADDF and MSUBF instructions 'd = d [+-] a * b'. */
+#define ISA_HAS_FP_MADDF_MSUBF (mips_isa_rev >= 6)
+
/* ISA has floating-point madd and msub instructions 'c = a * b [+-] c'. */
#define ISA_HAS_FP_MADD3_MSUB3 TARGET_LOONGSON_2EF
@@ -928,19 +1049,23 @@ struct mips_cpu_info {
(((ISA_HAS_FP4 \
&& ((MODE) == SFmode \
|| ((TARGET_FLOAT64 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64R2) \
+ || mips_isa_rev >= 2) \
&& (MODE) == DFmode))) \
+ || (((MODE) == SFmode \
+ || (MODE) == DFmode) \
+ && (mips_isa_rev >= 6)) \
|| (TARGET_SB1 \
&& (MODE) == V2SFmode)) \
&& !TARGET_MIPS16)
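+/* ISA has the unaligned load and store instructions LWL, LWR, SWL and
+ SWR. These are removed in Release 6. */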
+#define ISA_HAS_LWL_LWR (mips_isa_rev <= 5 && !TARGET_MIPS16)
+
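+/* ISA supports the legacy (pre IEEE 754-2008) NaN encoding. */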
+#define ISA_HAS_IEEE_754_LEGACY (mips_isa_rev <= 5)
+
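+/* ISA supports the IEEE 754-2008 NaN encoding. */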
+#define ISA_HAS_IEEE_754_2008 (mips_isa_rev >= 2)
+
/* ISA has count leading zeroes/ones instruction (not implemented). */
-#define ISA_HAS_CLZ_CLO ((ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2) \
- && !TARGET_MIPS16)
+#define ISA_HAS_CLZ_CLO (mips_isa_rev >= 1 && !TARGET_MIPS16)
/* ISA has three operand multiply instructions that put
the high part in an accumulator: mulhi or mulhiu. */
@@ -978,8 +1103,7 @@ struct mips_cpu_info {
&& !TARGET_MIPS16)
/* ISA has the "ror" (rotate right) instructions. */
-#define ISA_HAS_ROR ((ISA_MIPS32R2 \
- || ISA_MIPS64R2 \
+#define ISA_HAS_ROR ((mips_isa_rev >= 2 \
|| TARGET_MIPS5400 \
|| TARGET_MIPS5500 \
|| TARGET_SR71K \
@@ -988,19 +1112,18 @@ struct mips_cpu_info {
/* ISA has the WSBH (word swap bytes within halfwords) instruction.
64-bit targets also provide DSBH and DSHD. */
-#define ISA_HAS_WSBH ((ISA_MIPS32R2 || ISA_MIPS64R2) \
- && !TARGET_MIPS16)
+#define ISA_HAS_WSBH (mips_isa_rev >= 2 && !TARGET_MIPS16)
/* ISA has data prefetch instructions. This controls use of 'pref'. */
#define ISA_HAS_PREFETCH ((ISA_MIPS4 \
|| TARGET_LOONGSON_2EF \
|| TARGET_MIPS5900 \
- || ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2) \
+ || mips_isa_rev >= 1) \
&& !TARGET_MIPS16)
+/* ISA has data prefetch with limited 9-bit displacement. */
+#define ISA_HAS_PREF_LL_9BIT (mips_isa_rev >= 6)
+
/* ISA has data indexed prefetch instructions. This controls use of
'prefx', along with TARGET_HARD_FLOAT and TARGET_DOUBLE_FLOAT.
(prefx is a cop1x instruction, so can only be used if FP is
@@ -1013,19 +1136,14 @@ struct mips_cpu_info {
#define ISA_HAS_TRUNC_W (!ISA_MIPS1)
/* ISA includes the MIPS32r2 seb and seh instructions. */
-#define ISA_HAS_SEB_SEH ((ISA_MIPS32R2 \
- || ISA_MIPS64R2) \
- && !TARGET_MIPS16)
+#define ISA_HAS_SEB_SEH (mips_isa_rev >= 2 && !TARGET_MIPS16)
/* ISA includes the MIPS32/64 rev 2 ext and ins instructions. */
-#define ISA_HAS_EXT_INS ((ISA_MIPS32R2 \
- || ISA_MIPS64R2) \
- && !TARGET_MIPS16)
+#define ISA_HAS_EXT_INS (mips_isa_rev >= 2 && !TARGET_MIPS16)
/* ISA has instructions for accessing top part of 64-bit fp regs. */
-#define ISA_HAS_MXHC1 (TARGET_FLOAT64 \
- && (ISA_MIPS32R2 \
- || ISA_MIPS64R2))
+#define ISA_HAS_MXHC1 (!TARGET_FLOAT32 \
+ && mips_isa_rev >= 2)
/* ISA has lwxs instruction (load w/scaled index address. */
#define ISA_HAS_LWXS ((TARGET_SMARTMIPS || TARGET_MICROMIPS) \
@@ -1047,6 +1165,9 @@ struct mips_cpu_info {
/* Revision 2 of the DSP ASE is available. */
#define ISA_HAS_DSPR2 (TARGET_DSPR2 && !TARGET_MIPS16)
+/* The MSA ASE is available. */
+#define ISA_HAS_MSA (TARGET_MSA && !TARGET_MIPS16)
+
/* True if the result of a load is not available to the next instruction.
A nop will then be needed between instructions like "lw $4,..."
and "addiu $4,$4,1". */
@@ -1078,18 +1199,13 @@ struct mips_cpu_info {
MIPS64 and later ISAs to have the interlocks, plus any specific
earlier-ISA CPUs for which CPU documentation declares that the
instructions are really interlocked. */
-#define ISA_HAS_HILO_INTERLOCKS (ISA_MIPS32 \
- || ISA_MIPS32R2 \
- || ISA_MIPS64 \
- || ISA_MIPS64R2 \
+#define ISA_HAS_HILO_INTERLOCKS (mips_isa_rev >= 1 \
|| TARGET_MIPS5500 \
|| TARGET_MIPS5900 \
|| TARGET_LOONGSON_2EF)
/* ISA includes synci, jr.hb and jalr.hb. */
-#define ISA_HAS_SYNCI ((ISA_MIPS32R2 \
- || ISA_MIPS64R2) \
- && !TARGET_MIPS16)
+#define ISA_HAS_SYNCI (mips_isa_rev >= 2 && !TARGET_MIPS16)
/* ISA includes sync. */
#define ISA_HAS_SYNC ((mips_isa >= 2 || TARGET_MIPS3900) && !TARGET_MIPS16)
@@ -1173,9 +1289,12 @@ struct mips_cpu_info {
%{mdmx} %{mno-mdmx:-no-mdmx} \
%{mdsp} %{mno-dsp} \
%{mdspr2} %{mno-dspr2} \
+%{mdspr3} %{mno-dspr3} \
%{mmcu} %{mno-mcu} \
%{meva} %{mno-eva} \
%{mvirt} %{mno-virt} \
+%{mxpa} %{mno-xpa} \
+%{mmsa} %{mno-msa} \
%{msmartmips} %{mno-smartmips} \
%{mmt} %{mno-mt} \
%{mmxu} %{mno-mxu} \
@@ -1186,10 +1305,13 @@ struct mips_cpu_info {
%(subtarget_asm_debugging_spec) \
%{mabi=*} %{!mabi=*: %(asm_abi_default_spec)} \
%{mgp32} %{mgp64} %{march=*} %{mxgot:-xgot} \
-%{mfp32} %{mfp64} %{mnan=*} \
+%{mfp32} %{mfpxx} %{mfp64} %{mnan=*} \
+%{modd-spreg} %{mno-odd-spreg} \
%{mshared} %{mno-shared} \
%{msym32} %{mno-sym32} \
%{mtune=*} \
+%{mhard-float} %{msoft-float} \
+%{msingle-float} %{mdouble-float} \
%(subtarget_asm_spec)"
/* Extra switches sometimes passed to the linker. */
@@ -1269,6 +1391,12 @@ struct mips_cpu_info {
/* By default, turn on GDB extensions. */
#define DEFAULT_GDB_EXTENSIONS 1
+/* Registers may have a prefix which can be ignored when matching
+ user asm and register definitions. */
+#ifndef REGISTER_PREFIX
+#define REGISTER_PREFIX "$"
+#endif
+
/* Local compiler-generated symbols must have a prefix that the assembler
understands. By default, this is $, although some targets (e.g.,
NetBSD-ELF) need to override this. */
@@ -1340,6 +1468,11 @@ struct mips_cpu_info {
#define MIN_UNITS_PER_WORD 4
#endif
+/* Width of a MSA vector register in bytes. */
+#define UNITS_PER_MSA_REG 16
+/* Width of a MSA vector register in bits. */
+#define BITS_PER_MSA_REG (UNITS_PER_MSA_REG * BITS_PER_UNIT)
+
/* For MIPS, width of a floating point register. */
#define UNITS_PER_FPREG (TARGET_FLOAT64 ? 8 : 4)
@@ -1350,8 +1483,7 @@ struct mips_cpu_info {
/* The number of consecutive floating-point registers needed to store the
smallest format supported by the FPU. */
#define MIN_FPRS_PER_FMT \
- (ISA_MIPS32 || ISA_MIPS32R2 || ISA_MIPS64 || ISA_MIPS64R2 \
- ? 1 : MAX_FPRS_PER_FMT)
+ (TARGET_ODD_SPREG ? 1 : MAX_FPRS_PER_FMT)
/* The largest size of value that can be held in floating-point
registers and moved with a single instruction. */
@@ -1392,8 +1524,10 @@ struct mips_cpu_info {
#define LONG_LONG_ACCUM_TYPE_SIZE (TARGET_64BIT ? 128 : 64)
/* long double is not a fixed mode, but the idea is that, if we
- support long double, we also want a 128-bit integer type. */
-#define MAX_FIXED_MODE_SIZE LONG_DOUBLE_TYPE_SIZE
+ support long double, we also want a 128-bit integer type.
+ For MSA, we support an integer type with a width of BITS_PER_MSA_REG. */
+#define MAX_FIXED_MODE_SIZE \
+ (TARGET_MSA ? BITS_PER_MSA_REG : LONG_DOUBLE_TYPE_SIZE)
#ifdef IN_LIBGCC2
#if ((defined _ABIN32 && _MIPS_SIM == _ABIN32) \
@@ -1422,8 +1556,11 @@ struct mips_cpu_info {
/* 8 is observed right on a DECstation and on riscos 4.02. */
#define STRUCTURE_SIZE_BOUNDARY 8
-/* There is no point aligning anything to a rounder boundary than this. */
-#define BIGGEST_ALIGNMENT LONG_DOUBLE_TYPE_SIZE
+/* There is no point aligning anything to a rounder boundary than
+ LONG_DOUBLE_TYPE_SIZE, except that with MSA the biggest alignment is
+ BITS_PER_MSA_REG. */
+#define BIGGEST_ALIGNMENT \
+ (TARGET_MSA ? BITS_PER_MSA_REG : LONG_DOUBLE_TYPE_SIZE)
/* All accesses must be aligned. */
#define STRICT_ALIGNMENT 1
@@ -1661,6 +1798,10 @@ struct mips_cpu_info {
#define MD_REG_NUM (MD_REG_LAST - MD_REG_FIRST + 1)
#define MD_DBX_FIRST (FP_DBX_FIRST + FP_REG_NUM)
+#define MSA_REG_FIRST FP_REG_FIRST
+#define MSA_REG_LAST FP_REG_LAST
+#define MSA_REG_NUM FP_REG_NUM
+
/* The DWARF 2 CFA column which tracks the return address from a
signal handler context. This means that to maintain backwards
compatibility, no hard register can be assigned this column if it
@@ -1706,6 +1847,8 @@ struct mips_cpu_info {
/* Request Interrupt Priority Level is from bit 10 to bit 15 of
the cause register for the EIC interrupt mode. */
#define CAUSE_IPL 10
+/* COP1 Enable is at bit 29 of the status register. */
+#define SR_COP1 29
/* Interrupt Priority Level is from bit 10 to bit 15 of the status register. */
#define SR_IPL 10
/* Exception Level is at bit 1 of the status register. */
@@ -1744,8 +1887,11 @@ struct mips_cpu_info {
/* Test if REGNO is hi, lo, or one of the 6 new DSP accumulators. */
#define ACC_REG_P(REGNO) \
(MD_REG_P (REGNO) || DSP_ACC_REG_P (REGNO))
+#define MSA_REG_P(REGNO) \
+ ((unsigned int) ((int) (REGNO) - MSA_REG_FIRST) < MSA_REG_NUM)
#define FP_REG_RTX_P(X) (REG_P (X) && FP_REG_P (REGNO (X)))
+#define MSA_REG_RTX_P(X) (REG_P (X) && MSA_REG_P (REGNO (X)))
/* True if X is (const (unspec [(const_int 0)] UNSPEC_GP)). This is used
to initialize the mips16 gp pseudo register. */
@@ -1766,6 +1912,18 @@ struct mips_cpu_info {
#define HARD_REGNO_MODE_OK(REGNO, MODE) \
mips_hard_regno_mode_ok[ (int)(MODE) ][ (REGNO) ]
+/* Select a register mode required for caller save of hard regno REGNO. */
+#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE) \
+ mips_hard_regno_caller_save_mode (REGNO, NREGS, MODE)
+
+/* Odd-numbered single-precision registers are not considered callee-saved
+ for o32 FPXX as they will be clobbered when run on an FR=1 FPU.
+ MSA vector registers with MODE > 64 bits are part clobbered too. */
+#define HARD_REGNO_CALL_PART_CLOBBERED(REGNO, MODE) \
+ ((TARGET_FLOATXX && hard_regno_nregs[REGNO][MODE] == 1 \
+ && FP_REG_P (REGNO) && ((REGNO) & 1)) \
+ || (TARGET_MSA && FP_REG_P (REGNO) && GET_MODE_SIZE (MODE) > 8))
+
#define MODES_TIEABLE_P mips_modes_tieable_p
/* Register to use for pushing function arguments. */
@@ -1871,11 +2029,14 @@ struct mips_cpu_info {
enum reg_class
{
NO_REGS, /* no registers in set */
+ M16_STORE_REGS, /* microMIPS store registers */
M16_REGS, /* mips16 directly accessible registers */
+ M16_SP_REGS, /* mips16 + $sp */
T_REG, /* mips16 T register ($24) */
M16_T_REGS, /* mips16 registers plus T register */
PIC_FN_ADDR_REG, /* SVR4 PIC function address register */
V1_REG, /* Register $v1 ($3) used for TLS access. */
+ SPILL_REGS, /* All but $sp and call preserved regs are in here */
LEA_REGS, /* Every GPR except $25 */
GR_REGS, /* integer registers */
FP_REGS, /* floating point registers */
@@ -1908,11 +2069,14 @@ enum reg_class
#define REG_CLASS_NAMES \
{ \
"NO_REGS", \
+ "M16_STORE_REGS", \
"M16_REGS", \
+ "M16_SP_REGS", \
"T_REG", \
"M16_T_REGS", \
"PIC_FN_ADDR_REG", \
"V1_REG", \
+ "SPILL_REGS", \
"LEA_REGS", \
"GR_REGS", \
"FP_REGS", \
@@ -1948,11 +2112,14 @@ enum reg_class
#define REG_CLASS_CONTENTS \
{ \
{ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \
+ { 0x000200fc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* M16_STORE_REGS */ \
{ 0x000300fc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* M16_REGS */ \
+ { 0x200300fc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* M16_SP_REGS */ \
{ 0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* T_REG */ \
{ 0x010300fc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* M16_T_REGS */ \
{ 0x02000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* PIC_FN_ADDR_REG */ \
{ 0x00000008, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* V1_REG */ \
+ { 0x0303fffc, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* SPILL_REGS */ \
{ 0xfdffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* LEA_REGS */ \
{ 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* GR_REGS */ \
{ 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* FP_REGS */ \
@@ -1985,7 +2152,7 @@ enum reg_class
valid base register must belong. A base register is one used in
an address which is the register value plus a displacement. */
-#define BASE_REG_CLASS (TARGET_MIPS16 ? M16_REGS : GR_REGS)
+#define BASE_REG_CLASS (TARGET_MIPS16 ? M16_SP_REGS : GR_REGS)
/* A macro whose definition is the name of the class to which a
valid index register must belong. An index register is one used
@@ -2041,13 +2208,6 @@ enum reg_class
182,183,184,185,186,187 \
}
-/* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
- to be rearranged based on a particular function. On the mips16, we
- want to allocate $24 (T_REG) before other registers for
- instructions for which it is possible. */
-
-#define ADJUST_REG_ALLOC_ORDER mips_order_regs_for_local_alloc ()
-
/* True if VALUE is an unsigned 6-bit number. */
#define UIMM6_OPERAND(VALUE) \
@@ -2087,6 +2247,7 @@ enum reg_class
#define SMALL_INT_UNSIGNED(X) SMALL_OPERAND_UNSIGNED (INTVAL (X))
#define LUI_INT(X) LUI_OPERAND (INTVAL (X))
#define UMIPS_12BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, -2048, 2047))
+#define MIPS_9BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, -256, 255))
/* The HI and LO registers can only be reloaded via the general
registers. Condition code registers can only be loaded to the
@@ -2097,6 +2258,19 @@ enum reg_class
#define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X) \
mips_secondary_reload_class (CLASS, MODE, X, false)
+/* When targeting the o32 FPXX ABI, all moves with a length of doubleword
+ or greater must be performed by FR-mode-aware instructions.
+ This can be achieved using MFHC1/MTHC1 when these instructions are
+ available but otherwise moves must go via memory.
+ For the o32 FP64A ABI, all odd-numbered moves with a length of
+ doubleword or greater are required to use memory. Using MTC1/MFC1
+ to access the lower-half of these registers would require a forbidden
+ single-precision access. We require all double-word moves to use
+ memory because adding even and odd floating-point register classes
+ would have a significant impact on the backend. */
+#define SECONDARY_MEMORY_NEEDED(CLASS1, CLASS2, MODE) \
+ mips_secondary_memory_needed ((CLASS1), (CLASS2), (MODE))
+
/* Return the maximum number of consecutive registers
needed to represent mode MODE in a register of class CLASS. */
@@ -2210,13 +2384,31 @@ enum reg_class
#define FP_ARG_FIRST (FP_REG_FIRST + 12)
#define FP_ARG_LAST (FP_ARG_FIRST + MAX_ARGS_IN_REGISTERS - 1)
+/* True if MODE is vector and supported in a MSA vector register. */
+#define MSA_SUPPORTED_MODE_P(MODE) \
+ (TARGET_MSA \
+ && GET_MODE_SIZE (MODE) == UNITS_PER_MSA_REG \
+ && (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT \
+ || GET_MODE_CLASS (MODE) == MODE_VECTOR_FLOAT))
+
+/* Temporary register that is used when restoring $gp after a call. $4 and $5
+ are used for returning complex double values in soft-float code, so $6 is the
+ first suitable candidate for TARGET_MIPS16. For !TARGET_MIPS16 we can use
+ $gp itself as the temporary. */
+#define POST_CALL_TMP_REG \
+ (TARGET_MIPS16 ? GP_ARG_FIRST + 2 : PIC_OFFSET_TABLE_REGNUM)
+
/* 1 if N is a possible register number for function argument passing.
- We have no FP argument registers when soft-float. When FP registers
- are 32 bits, we can't directly reference the odd numbered ones. */
+ We have no FP argument registers when soft-float. Special handling
+ is required for o32, where only even-numbered FP registers are used
+ under the o32 FPXX and FP64 ABIs. */
#define FUNCTION_ARG_REGNO_P(N) \
((IN_RANGE((N), GP_ARG_FIRST, GP_ARG_LAST) \
- || (IN_RANGE((N), FP_ARG_FIRST, FP_ARG_LAST))) \
+ || (IN_RANGE((N), FP_ARG_FIRST, FP_ARG_LAST) \
+ && (mips_abi != ABI_32 \
+ || TARGET_FLOAT32 \
+ || ((N) % 2 == 0)))) \
&& !fixed_regs[N])
/* This structure has to cope with two different argument allocation
@@ -2422,9 +2614,11 @@ typedef struct mips_args {
/* Although LDC1 and SDC1 provide 64-bit moves on 32-bit targets,
we generally don't want to use them for copying arbitrary data.
- A single N-word move is usually the same cost as N single-word moves. */
-#define MOVE_MAX UNITS_PER_WORD
-#define MAX_MOVE_MAX 8
+ A single N-word move is usually the same cost as N single-word moves.
+ For MSA, MOVE_MAX is set to 16 bytes, so MAX_MOVE_MAX is 16
+ unconditionally. */
+#define MOVE_MAX (TARGET_MSA ? 16 : UNITS_PER_WORD)
+#define MAX_MOVE_MAX 16
/* Define this macro as a C expression which is nonzero if
accessing less than a word of memory (i.e. a `char' or a
@@ -2495,6 +2689,9 @@ typedef struct mips_args {
#define MIPS_BRANCH(OPCODE, OPERANDS) \
"%*" OPCODE "%?\t" OPERANDS "%/"
+#define MIPS_BRANCH_C(OPCODE, OPERANDS) \
+ "%*" OPCODE "%:\t" OPERANDS
+
/* Return an asm string that forces INSN to be treated as an absolute
J or JAL instruction instead of an assembler macro. */
#define MIPS_ABSOLUTE_JUMP(INSN) \
@@ -2502,45 +2699,6 @@ typedef struct mips_args {
? ".option\tpic0\n\t" INSN "\n\t.option\tpic2" \
: INSN)
-/* Return the asm template for a call. INSN is the instruction's mnemonic
- ("j" or "jal"), OPERANDS are its operands, TARGET_OPNO is the operand
- number of the target. SIZE_OPNO is the operand number of the argument size
- operand that can optionally hold the call attributes. If SIZE_OPNO is not
- -1 and the call is indirect, use the function symbol from the call
- attributes to attach a R_MIPS_JALR relocation to the call.
-
- When generating GOT code without explicit relocation operators,
- all calls should use assembly macros. Otherwise, all indirect
- calls should use "jr" or "jalr"; we will arrange to restore $gp
- afterwards if necessary. Finally, we can only generate direct
- calls for -mabicalls by temporarily switching to non-PIC mode.
-
- For microMIPS jal(r), we try to generate jal(r)s when a 16-bit
- instruction is in the delay slot of jal(r). */
-#define MIPS_CALL(INSN, OPERANDS, TARGET_OPNO, SIZE_OPNO) \
- (TARGET_USE_GOT && !TARGET_EXPLICIT_RELOCS \
- ? "%*" INSN "\t%" #TARGET_OPNO "%/" \
- : REG_P (OPERANDS[TARGET_OPNO]) \
- ? (mips_get_pic_call_symbol (OPERANDS, SIZE_OPNO) \
- ? ("%*.reloc\t1f,R_MIPS_JALR,%" #SIZE_OPNO "\n" \
- "1:\t" INSN "r\t%" #TARGET_OPNO "%/") \
- : TARGET_MICROMIPS && !TARGET_INTERLINK_COMPRESSED \
- ? "%*" INSN "r%!\t%" #TARGET_OPNO "%/" \
- : "%*" INSN "r\t%" #TARGET_OPNO "%/") \
- : TARGET_MICROMIPS && !TARGET_INTERLINK_COMPRESSED \
- ? MIPS_ABSOLUTE_JUMP ("%*" INSN "%!\t%" #TARGET_OPNO "%/") \
- : MIPS_ABSOLUTE_JUMP ("%*" INSN "\t%" #TARGET_OPNO "%/")) \
-
-/* Similar to MIPS_CALL, but this is for MICROMIPS "j" to generate
- "jrc" when nop is in the delay slot of "jr". */
-
-#define MICROMIPS_J(INSN, OPERANDS, OPNO) \
- (TARGET_USE_GOT && !TARGET_EXPLICIT_RELOCS \
- ? "%*j\t%" #OPNO "%/" \
- : REG_P (OPERANDS[OPNO]) \
- ? "%*jr%:\t%" #OPNO \
- : MIPS_ABSOLUTE_JUMP ("%*" INSN "\t%" #OPNO "%/"))
-
/* Control the assembler format that we output. */
@@ -2621,7 +2779,39 @@ typedef struct mips_args {
{ "gp", 28 + GP_REG_FIRST }, \
{ "sp", 29 + GP_REG_FIRST }, \
{ "fp", 30 + GP_REG_FIRST }, \
- { "ra", 31 + GP_REG_FIRST } \
+ { "ra", 31 + GP_REG_FIRST }, \
+ { "$w0", 0 + FP_REG_FIRST }, \
+ { "$w1", 1 + FP_REG_FIRST }, \
+ { "$w2", 2 + FP_REG_FIRST }, \
+ { "$w3", 3 + FP_REG_FIRST }, \
+ { "$w4", 4 + FP_REG_FIRST }, \
+ { "$w5", 5 + FP_REG_FIRST }, \
+ { "$w6", 6 + FP_REG_FIRST }, \
+ { "$w7", 7 + FP_REG_FIRST }, \
+ { "$w8", 8 + FP_REG_FIRST }, \
+ { "$w9", 9 + FP_REG_FIRST }, \
+ { "$w10", 10 + FP_REG_FIRST }, \
+ { "$w11", 11 + FP_REG_FIRST }, \
+ { "$w12", 12 + FP_REG_FIRST }, \
+ { "$w13", 13 + FP_REG_FIRST }, \
+ { "$w14", 14 + FP_REG_FIRST }, \
+ { "$w15", 15 + FP_REG_FIRST }, \
+ { "$w16", 16 + FP_REG_FIRST }, \
+ { "$w17", 17 + FP_REG_FIRST }, \
+ { "$w18", 18 + FP_REG_FIRST }, \
+ { "$w19", 19 + FP_REG_FIRST }, \
+ { "$w20", 20 + FP_REG_FIRST }, \
+ { "$w21", 21 + FP_REG_FIRST }, \
+ { "$w22", 22 + FP_REG_FIRST }, \
+ { "$w23", 23 + FP_REG_FIRST }, \
+ { "$w24", 24 + FP_REG_FIRST }, \
+ { "$w25", 25 + FP_REG_FIRST }, \
+ { "$w26", 26 + FP_REG_FIRST }, \
+ { "$w27", 27 + FP_REG_FIRST }, \
+ { "$w28", 28 + FP_REG_FIRST }, \
+ { "$w29", 29 + FP_REG_FIRST }, \
+ { "$w30", 30 + FP_REG_FIRST }, \
+ { "$w31", 31 + FP_REG_FIRST } \
}
#define DBR_OUTPUT_SEQEND(STREAM) \
@@ -2956,6 +3146,7 @@ extern const char *mips_hi_relocs[];
extern enum processor mips_arch; /* which cpu to codegen for */
extern enum processor mips_tune; /* which cpu to schedule for */
extern int mips_isa; /* architectural level */
+extern int mips_isa_rev;
extern const struct mips_cpu_info *mips_arch_info;
extern const struct mips_cpu_info *mips_tune_info;
extern unsigned int mips_base_compression_flags;
@@ -3004,3 +3195,17 @@ extern GTY(()) struct target_globals *mips16_globals;
with arguments ARGS. */
#define PMODE_INSN(NAME, ARGS) \
(Pmode == SImode ? NAME ## _si ARGS : NAME ## _di ARGS)
+
+/* If we are *not* using multilibs and the default ABI is not ABI_32 we
+ need to change these from /lib and /usr/lib. */
+#if MIPS_ABI_DEFAULT == ABI_N32
+#define STANDARD_STARTFILE_PREFIX_1 "/lib32/"
+#define STANDARD_STARTFILE_PREFIX_2 "/usr/lib32/"
+#elif MIPS_ABI_DEFAULT == ABI_64
+#define STANDARD_STARTFILE_PREFIX_1 "/lib64/"
+#define STANDARD_STARTFILE_PREFIX_2 "/usr/lib64/"
+#endif
+
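+/* True if consecutive loads and stores may be bonded into paired
+ accesses on tunings that support it. */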
+#define ENABLE_LD_ST_PAIRS \
+ (TARGET_LOAD_STORE_PAIRS && (TUNE_P5600 || TUNE_I6400)\
+ && !TARGET_MICROMIPS && !TARGET_FIX_24K)
diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md
index 9bf8cb7f4df..4f4697c4e6f 100644
--- a/gcc/config/mips/mips.md
+++ b/gcc/config/mips/mips.md
@@ -41,6 +41,7 @@
m4k
octeon
octeon2
+ octeon3
r3900
r6000
r4000
@@ -65,6 +66,10 @@
sr71000
xlr
xlp
+ p5600
+ w32
+ w64
+ i6400
])
(define_c_enum "unspec" [
@@ -239,6 +244,13 @@
(const_string "yes")]
(const_string "no")))
+;; True if the main data type is four times the size of a word.
+(define_attr "qword_mode" "no,yes"
+ (cond [(and (eq_attr "mode" "TI,TF")
+ (not (match_test "TARGET_64BIT")))
+ (const_string "yes")]
+ (const_string "no")))
+
;; Attributes describing a sync loop. These loops have the form:
;;
;; if (RELEASE_BARRIER == YES) sync
@@ -396,6 +408,11 @@
(eq_attr "move_type" "constN,shift_shift")
(const_string "multi")
+ ;; These types of move are split for quadword modes only.
+ (and (eq_attr "move_type" "move,const")
+ (eq_attr "qword_mode" "yes"))
+ (const_string "multi")
+
;; These types of move are split for doubleword modes only.
(and (eq_attr "move_type" "move,const")
(eq_attr "dword_mode" "yes"))
@@ -405,6 +422,15 @@
(eq_attr "sync_mem" "!none") (const_string "syncloop")]
(const_string "unknown")))
+(define_attr "compact_form" "always,maybe,never"
+ (cond [(eq_attr "jal" "direct")
+ (const_string "always")
+ (eq_attr "jal" "indirect")
+ (const_string "maybe")
+ (eq_attr "type" "jump")
+ (const_string "maybe")]
+ (const_string "never")))
+
;; Mode for conversion types (fcvt)
;; I2S integer to float single (SI/DI to SF)
;; I2D integer to float double (SI/DI to DF)
@@ -427,15 +453,21 @@
(const_string "yes")
(const_string "no")))
-(define_attr "compression" "none,all,micromips"
+(define_attr "compression" "none,all,micromips32,micromips"
(const_string "none"))
(define_attr "enabled" "no,yes"
- (if_then_else (ior (eq_attr "compression" "all,none")
- (and (eq_attr "compression" "micromips")
- (match_test "TARGET_MICROMIPS")))
- (const_string "yes")
- (const_string "no")))
+ (cond [;; The o32 FPXX and FP64A ABI extensions prohibit direct moves between
+ ;; GR_REG and FR_REG for 64-bit values.
+ (and (eq_attr "move_type" "mtc,mfc")
+ (match_test "(TARGET_FLOATXX && !ISA_HAS_MXHC1)
+ || TARGET_O32_FP64A_ABI")
+ (eq_attr "dword_mode" "yes"))
+ (const_string "no")
+ (and (eq_attr "compression" "micromips32,micromips")
+ (match_test "!TARGET_MICROMIPS"))
+ (const_string "no")]
+ (const_string "yes")))
;; The number of individual instructions that a non-branch pattern generates,
;; using units of BASE_INSN_LENGTH.
@@ -467,6 +499,12 @@
(eq_attr "dword_mode" "yes"))
(const_int 2)
+ ;; Check for quadword moves that are decomposed into four
+ ;; instructions.
+ (and (eq_attr "move_type" "mtc,mfc,move")
+ (eq_attr "qword_mode" "yes"))
+ (const_int 4)
+
;; Constants, loads and stores are handled by external routines.
(and (eq_attr "move_type" "const,constN")
(eq_attr "dword_mode" "yes"))
@@ -508,7 +546,9 @@
(const_int 2)
(eq_attr "type" "idiv,idiv3")
- (symbol_ref "mips_idiv_insns ()")
+ (cond [(eq_attr "mode" "TI")
+ (symbol_ref "mips_msa_idiv_insns () * 4")]
+ (symbol_ref "mips_idiv_insns () * 4"))
(not (eq_attr "sync_mem" "none"))
(symbol_ref "mips_sync_loop_insns (insn, operands)")]
@@ -518,7 +558,9 @@
;; but there are special cases for branches (which must be handled here)
;; and for compressed single instructions.
(define_attr "length" ""
- (cond [(and (eq_attr "compression" "micromips,all")
+ (cond [(and (ior (eq_attr "compression" "micromips,all")
+ (and (eq_attr "compression" "micromips32")
+ (eq_attr "mode" "SI,SF")))
(eq_attr "dword_mode" "no")
(match_test "TARGET_MICROMIPS"))
(const_int 2)
@@ -682,7 +724,7 @@
;; DELAY means that the next instruction cannot read the result
;; of this one. HILO means that the next two instructions cannot
;; write to HI or LO.
-(define_attr "hazard" "none,delay,hilo"
+(define_attr "hazard" "none,delay,hilo,forbidden_slot"
(cond [(and (eq_attr "type" "load,fpload,fpidxload")
(match_test "ISA_HAS_LOAD_DELAY"))
(const_string "delay")
@@ -742,6 +784,11 @@
(define_mode_iterator MOVEP1 [SI SF])
(define_mode_iterator MOVEP2 [SI SF])
+(define_mode_iterator JOIN_MODE [
+ HI
+ SI
+ (SF "TARGET_HARD_FLOAT")
+ (DF "TARGET_HARD_FLOAT && TARGET_DOUBLE_FLOAT")])
;; This mode iterator allows :HILO to be used as the mode of the
;; concatenated HI and LO registers.
@@ -758,6 +805,11 @@
&& !TARGET_LOONGSON_2EF
&& !TARGET_MIPS5900")])
+;; This mode iterator allows :FPCC to be used anywhere that an FP condition
+;; is needed.
+(define_mode_iterator FPCC [(CC "!ISA_HAS_CCF")
+ (CCF "ISA_HAS_CCF")])
+
;; 32-bit integer moves for which we provide move patterns.
(define_mode_iterator IMOVE32
[SI
@@ -847,14 +899,16 @@
;; This attribute gives the best constraint to use for registers of
;; a given mode.
-(define_mode_attr reg [(SI "d") (DI "d") (CC "z")])
+(define_mode_attr reg [(SI "d") (DI "d") (CC "z") (CCF "f")])
;; This attribute gives the format suffix for floating-point operations.
(define_mode_attr fmt [(SF "s") (DF "d") (V2SF "ps")])
;; This attribute gives the upper-case mode name for one unit of a
-;; floating-point mode.
-(define_mode_attr UNITMODE [(SF "SF") (DF "DF") (V2SF "SF")])
+;; floating-point mode or vector mode.
+(define_mode_attr UNITMODE [(SF "SF") (DF "DF") (V2SF "SF") (V4SF "SF")
+ (V16QI "QI") (V8HI "HI") (V4SI "SI") (V2DI "DI")
+ (V2DF "DF")])
;; This attribute gives the integer mode that has the same size as a
;; fixed-point mode.
@@ -887,6 +941,9 @@
(define_mode_attr sqrt_condition
[(SF "!ISA_MIPS1") (DF "!ISA_MIPS1") (V2SF "TARGET_SB1")])
+;; This attribute provides the correct mnemonic for each FP condition mode.
+(define_mode_attr fpcmp [(CC "c") (CCF "cmp")])
+
;; This code iterator allows signed and unsigned widening multiplications
;; to use the same template.
(define_code_iterator any_extend [sign_extend zero_extend])
@@ -909,7 +966,10 @@
;; This code iterator allows all native floating-point comparisons to be
;; generated from the same template.
-(define_code_iterator fcond [unordered uneq unlt unle eq lt le])
+(define_code_iterator fcond [unordered uneq unlt unle eq lt le
+ (ordered "ISA_HAS_CCF")
+ (ltgt "ISA_HAS_CCF")
+ (ne "ISA_HAS_CCF")])
;; This code iterator is used for comparisons that can be implemented
;; by swapping the operands.
@@ -971,8 +1031,8 @@
(xor "xori")
(and "andi")])
-(define_code_attr shift_compression [(ashift "micromips")
- (lshiftrt "micromips")
+(define_code_attr shift_compression [(ashift "micromips32")
+ (lshiftrt "micromips32")
(ashiftrt "none")])
;; <fcond> is the c.cond.fmt condition associated with a particular code.
@@ -982,7 +1042,10 @@
(unle "ule")
(eq "eq")
(lt "lt")
- (le "le")])
+ (le "le")
+ (ordered "or")
+ (ltgt "ne")
+ (ne "une")])
;; Similar, but for swapped conditions.
(define_code_attr swapped_fcond [(ge "le")
@@ -996,6 +1059,10 @@
;; This is the inverse value of bbv.
(define_code_attr bbinv [(eq "1") (ne "0")])
+
+;; The sel mnemonic to use depending on the condition test.
+(define_code_attr sel [(eq "seleqz") (ne "selnez")])
+(define_code_attr selinv [(eq "selnez") (ne "seleqz")])
;; .........................
;;
@@ -1010,21 +1077,36 @@
(nil)
(eq_attr "can_delay" "yes")])
-;; Branches that don't have likely variants do not annul on false.
+;; Branches that have delay slots and don't have likely variants do
+;; not annul on false.
(define_delay (and (eq_attr "type" "branch")
(not (match_test "TARGET_MIPS16"))
+ (ior (match_test "TARGET_CB_NEVER")
+ (and (eq_attr "compact_form" "maybe")
+ (not (match_test "TARGET_CB_ALWAYS")))
+ (eq_attr "compact_form" "never"))
(eq_attr "branch_likely" "no"))
[(eq_attr "can_delay" "yes")
(nil)
(nil)])
-(define_delay (eq_attr "type" "jump")
+(define_delay (and (eq_attr "type" "jump")
+ (ior (match_test "TARGET_CB_NEVER")
+ (and (eq_attr "compact_form" "maybe")
+ (not (match_test "TARGET_CB_ALWAYS")))
+ (eq_attr "compact_form" "never")))
[(eq_attr "can_delay" "yes")
(nil)
(nil)])
+;; Call type instructions will never have a compact form, as the
+;; type is only used for MIPS16 patterns.
(define_delay (and (eq_attr "type" "call")
- (eq_attr "jal_macro" "no"))
+ (eq_attr "jal_macro" "no")
+ (ior (match_test "TARGET_CB_NEVER")
+ (and (eq_attr "compact_form" "maybe")
+ (not (match_test "TARGET_CB_ALWAYS")))
+ (eq_attr "compact_form" "never")))
[(eq_attr "can_delay" "yes")
(nil)
(nil)])
@@ -1050,6 +1132,8 @@
(eq_attr "type" "ghost")
"nothing")
+(include "i6400.md")
+(include "p5600.md")
(include "4k.md")
(include "5k.md")
(include "20kc.md")
@@ -1103,18 +1187,27 @@
[(match_operand:GPR 1 "reg_or_0_operand")
(match_operand:GPR 2 "arith_operand")])
(match_operand 3 "const_0_operand"))]
- "ISA_HAS_COND_TRAP"
+ "ISA_HAS_COND_TRAPI || ISA_HAS_COND_TRAP"
{
mips_expand_conditional_trap (operands[0]);
DONE;
})
+(define_insn "*conditional_trap_reg<mode>"
+ [(trap_if (match_operator:GPR 0 "trap_comparison_operator"
+ [(match_operand:GPR 1 "reg_or_0_operand" "dJ")
+ (match_operand:GPR 2 "reg_or_0_operand" "dJ")])
+ (const_int 0))]
+ "ISA_HAS_COND_TRAP && !ISA_HAS_COND_TRAPI"
+ "t%C0\t%z1,%2"
+ [(set_attr "type" "trap")])
+
(define_insn "*conditional_trap<mode>"
[(trap_if (match_operator:GPR 0 "trap_comparison_operator"
[(match_operand:GPR 1 "reg_or_0_operand" "dJ")
(match_operand:GPR 2 "arith_operand" "dI")])
(const_int 0))]
- "ISA_HAS_COND_TRAP"
+ "ISA_HAS_COND_TRAPI"
"t%C0\t%z1,%2"
[(set_attr "type" "trap")])
@@ -1154,7 +1247,7 @@
return "<d>addiu\t%0,%1,%2";
}
[(set_attr "alu_type" "add")
- (set_attr "compression" "micromips,*,micromips,micromips,micromips,micromips,*")
+ (set_attr "compression" "micromips32,*,micromips32,micromips32,micromips32,micromips32,*")
(set_attr "mode" "<MODE>")])
(define_insn "*add<mode>3_mips16"
@@ -1372,7 +1465,7 @@
""
"<d>subu\t%0,%1,%2"
[(set_attr "alu_type" "sub")
- (set_attr "compression" "micromips,*")
+ (set_attr "compression" "micromips32,*")
(set_attr "mode" "<MODE>")])
(define_insn "*subsi3_extended"
@@ -1482,13 +1575,13 @@
[(set (match_operand:GPR 0 "register_operand")
(mult:GPR (match_operand:GPR 1 "register_operand")
(match_operand:GPR 2 "register_operand")))]
- "ISA_HAS_<D>MULT"
+ "ISA_HAS_<D>MULT || ISA_HAS_R6<D>MUL"
{
rtx lo;
- if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A)
- emit_insn (gen_mul<mode>3_mul3_loongson (operands[0], operands[1],
- operands[2]));
+ if (TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>MUL)
+ emit_insn (gen_mul<mode>3_mul3_nohilo (operands[0], operands[1],
+ operands[2]));
else if (ISA_HAS_<D>MUL3)
emit_insn (gen_mul<mode>3_mul3 (operands[0], operands[1], operands[2]));
else if (TARGET_MIPS16)
@@ -1505,16 +1598,18 @@
DONE;
})
-(define_insn "mul<mode>3_mul3_loongson"
+(define_insn "mul<mode>3_mul3_nohilo"
[(set (match_operand:GPR 0 "register_operand" "=d")
(mult:GPR (match_operand:GPR 1 "register_operand" "d")
(match_operand:GPR 2 "register_operand" "d")))]
- "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A"
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>MUL"
{
if (TARGET_LOONGSON_2EF)
return "<d>multu.g\t%0,%1,%2";
- else
+ else if (TARGET_LOONGSON_3A)
return "gs<d>multu\t%0,%1,%2";
+ else
+ return "<d>mul\t%0,%1,%2";
}
[(set_attr "type" "imul3nc")
(set_attr "mode" "<MODE>")])
@@ -1622,40 +1717,66 @@
;; copy instructions. Reload therefore thinks that the second alternative
;; is two reloads more costly than the first. We add "*?*?" to the first
;; alternative as a counterweight.
+;;
+;; LRA simulates reload, but the cost of reloading scratches is lower
+;; than under classic reload. For the time being, removing the
+;; counterweight for LRA is more profitable.
(define_insn "*mul_acc_si"
- [(set (match_operand:SI 0 "register_operand" "=l*?*?,d?")
- (plus:SI (mult:SI (match_operand:SI 1 "register_operand" "d,d")
- (match_operand:SI 2 "register_operand" "d,d"))
- (match_operand:SI 3 "register_operand" "0,d")))
- (clobber (match_scratch:SI 4 "=X,l"))
- (clobber (match_scratch:SI 5 "=X,&d"))]
+ [(set (match_operand:SI 0 "register_operand" "=l*?*?,l,d?")
+ (plus:SI (mult:SI (match_operand:SI 1 "register_operand" "d,d,d")
+ (match_operand:SI 2 "register_operand" "d,d,d"))
+ (match_operand:SI 3 "register_operand" "0,0,d")))
+ (clobber (match_scratch:SI 4 "=X,X,l"))
+ (clobber (match_scratch:SI 5 "=X,X,&d"))]
"GENERATE_MADD_MSUB && !TARGET_MIPS16"
"@
madd\t%1,%2
+ madd\t%1,%2
#"
[(set_attr "type" "imadd")
(set_attr "accum_in" "3")
(set_attr "mode" "SI")
- (set_attr "insn_count" "1,2")])
+ (set_attr "insn_count" "1,1,2")
+ (set (attr "enabled")
+ (cond [(and (eq_attr "alternative" "0")
+ (match_test "!mips_lra_flag"))
+ (const_string "yes")
+ (and (eq_attr "alternative" "1")
+ (match_test "mips_lra_flag"))
+ (const_string "yes")
+ (eq_attr "alternative" "2")
+ (const_string "yes")]
+ (const_string "no")))])
;; The same idea applies here. The middle alternative needs one less
;; clobber than the final alternative, so we add "*?" as a counterweight.
(define_insn "*mul_acc_si_r3900"
- [(set (match_operand:SI 0 "register_operand" "=l*?*?,d*?,d?")
- (plus:SI (mult:SI (match_operand:SI 1 "register_operand" "d,d,d")
- (match_operand:SI 2 "register_operand" "d,d,d"))
- (match_operand:SI 3 "register_operand" "0,l,d")))
- (clobber (match_scratch:SI 4 "=X,3,l"))
- (clobber (match_scratch:SI 5 "=X,X,&d"))]
+ [(set (match_operand:SI 0 "register_operand" "=l*?*?,l,d*?,d?")
+ (plus:SI (mult:SI (match_operand:SI 1 "register_operand" "d,d,d,d")
+ (match_operand:SI 2 "register_operand" "d,d,d,d"))
+ (match_operand:SI 3 "register_operand" "0,0,l,d")))
+ (clobber (match_scratch:SI 4 "=X,X,3,l"))
+ (clobber (match_scratch:SI 5 "=X,X,X,&d"))]
"TARGET_MIPS3900 && !TARGET_MIPS16"
"@
madd\t%1,%2
+ madd\t%1,%2
madd\t%0,%1,%2
#"
[(set_attr "type" "imadd")
(set_attr "accum_in" "3")
(set_attr "mode" "SI")
- (set_attr "insn_count" "1,1,2")])
+ (set_attr "insn_count" "1,1,1,2")
+ (set (attr "enabled")
+ (cond [(and (eq_attr "alternative" "0")
+ (match_test "!mips_lra_flag"))
+ (const_string "yes")
+ (and (eq_attr "alternative" "1")
+ (match_test "mips_lra_flag"))
+ (const_string "yes")
+ (eq_attr "alternative" "2,3")
+ (const_string "yes")]
+ (const_string "no")))])
;; Split *mul_acc_si if both the source and destination accumulator
;; values are GPRs.
@@ -1859,20 +1980,31 @@
;; See the comment above *mul_add_si for details.
(define_insn "*mul_sub_si"
- [(set (match_operand:SI 0 "register_operand" "=l*?*?,d?")
- (minus:SI (match_operand:SI 1 "register_operand" "0,d")
- (mult:SI (match_operand:SI 2 "register_operand" "d,d")
- (match_operand:SI 3 "register_operand" "d,d"))))
- (clobber (match_scratch:SI 4 "=X,l"))
- (clobber (match_scratch:SI 5 "=X,&d"))]
+ [(set (match_operand:SI 0 "register_operand" "=l*?*?,l,d?")
+ (minus:SI (match_operand:SI 1 "register_operand" "0,0,d")
+ (mult:SI (match_operand:SI 2 "register_operand" "d,d,d")
+ (match_operand:SI 3 "register_operand" "d,d,d"))))
+ (clobber (match_scratch:SI 4 "=X,X,l"))
+ (clobber (match_scratch:SI 5 "=X,X,&d"))]
"GENERATE_MADD_MSUB"
"@
msub\t%2,%3
+ msub\t%2,%3
#"
[(set_attr "type" "imadd")
(set_attr "accum_in" "1")
(set_attr "mode" "SI")
- (set_attr "insn_count" "1,2")])
+ (set_attr "insn_count" "1,1,2")
+ (set (attr "enabled")
+ (cond [(and (eq_attr "alternative" "0")
+ (match_test "!mips_lra_flag"))
+ (const_string "yes")
+ (and (eq_attr "alternative" "1")
+ (match_test "mips_lra_flag"))
+ (const_string "yes")
+ (eq_attr "alternative" "2")
+ (const_string "yes")]
+ (const_string "no")))])
;; Split *mul_sub_si if both the source and destination accumulator
;; values are GPRs.
@@ -1913,6 +2045,24 @@
DONE;
})
+(define_expand "<u>mulsidi3_32bit_r6"
+ [(set (match_operand:DI 0 "register_operand")
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
+ (any_extend:DI (match_operand:SI 2 "register_operand"))))]
+ "!TARGET_64BIT && ISA_HAS_R6MUL"
+{
+ rtx dest = gen_reg_rtx (DImode);
+ rtx low = mips_subword (dest, 0);
+ rtx high = mips_subword (dest, 1);
+
+ emit_insn (gen_mulsi3_mul3_nohilo (low, operands[1], operands[2]));
+ emit_insn (gen_<su>mulsi3_highpart_r6 (high, operands[1], operands[2]));
+
+ emit_move_insn (mips_subword (operands[0], 0), low);
+ emit_move_insn (mips_subword (operands[0], 1), high);
+ DONE;
+})
+
(define_expand "<u>mulsidi3_32bit_mips16"
[(set (match_operand:DI 0 "register_operand")
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand"))
@@ -1934,7 +2084,7 @@
[(set (match_operand:DI 0 "muldiv_target_operand" "=ka")
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))]
- "!TARGET_64BIT && (!TARGET_FIX_R4000 || ISA_HAS_DSP)"
+ "!TARGET_64BIT && ((!TARGET_FIX_R4000 && ISA_HAS_MULT) || ISA_HAS_DSP)"
{
if (ISA_HAS_DSP_MULT)
return "mult<u>\t%q0,%1,%2";
@@ -1949,7 +2099,7 @@
(mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))
(clobber (match_scratch:DI 3 "=x"))]
- "!TARGET_64BIT && TARGET_FIX_R4000 && !ISA_HAS_DSP"
+ "!TARGET_64BIT && TARGET_FIX_R4000 && !ISA_HAS_DSP && ISA_HAS_MULT"
"mult<u>\t%1,%2\;mflo\t%L0\;mfhi\t%M0"
[(set_attr "type" "imul")
(set_attr "mode" "SI")
@@ -1961,7 +2111,8 @@
(any_extend:DI (match_operand:SI 2 "register_operand" "d"))))
(clobber (match_scratch:TI 3 "=x"))
(clobber (match_scratch:DI 4 "=d"))]
- "TARGET_64BIT && !TARGET_FIX_R4000 && !ISA_HAS_DMUL3 && !TARGET_MIPS16"
+ "TARGET_64BIT && !TARGET_FIX_R4000 && !ISA_HAS_DMUL3
+ && !TARGET_MIPS16 && ISA_HAS_MULT"
"#"
"&& reload_completed"
[(const_int 0)]
@@ -2044,6 +2195,15 @@
[(set_attr "type" "imul3")
(set_attr "mode" "DI")])
+(define_insn "mulsidi3_64bit_r6dmul"
+ [(set (match_operand:DI 0 "register_operand" "=d")
+ (mult:DI (sign_extend:DI (match_operand:SI 1 "register_operand" "d"))
+ (sign_extend:DI (match_operand:SI 2 "register_operand" "d"))))]
+ "ISA_HAS_R6DMUL"
+ "dmul\t%0,%1,%2"
+ [(set_attr "type" "imul3nc")
+ (set_attr "mode" "DI")])
+
;; Widening multiply with negation.
(define_insn "*muls<u>_di"
[(set (match_operand:DI 0 "muldiv_target_operand" "=x")
@@ -2101,12 +2261,27 @@
else if (TARGET_MIPS16)
emit_insn (gen_<su>mulsi3_highpart_split (operands[0], operands[1],
operands[2]));
+ else if (ISA_HAS_R6MUL)
+ emit_insn (gen_<su>mulsi3_highpart_r6 (operands[0], operands[1],
+ operands[2]));
else
emit_insn (gen_<su>mulsi3_highpart_internal (operands[0], operands[1],
operands[2]));
DONE;
})
+(define_insn "<su>mulsi3_highpart_r6"
+ [(set (match_operand:SI 0 "register_operand" "=d")
+ (truncate:SI
+ (lshiftrt:DI
+ (mult:DI (any_extend:DI (match_operand:SI 1 "register_operand" "d"))
+ (any_extend:DI (match_operand:SI 2 "register_operand" "d")))
+ (const_int 32))))]
+ "ISA_HAS_R6MUL"
+ "muh<u>\t%0,%1,%2"
+ [(set_attr "type" "imul3nc")
+ (set_attr "mode" "SI")])
+
(define_insn_and_split "<su>mulsi3_highpart_internal"
[(set (match_operand:SI 0 "register_operand" "=d")
(truncate:SI
@@ -2115,7 +2290,7 @@
(any_extend:DI (match_operand:SI 2 "register_operand" "d")))
(const_int 32))))
(clobber (match_scratch:SI 3 "=l"))]
- "!ISA_HAS_MULHI && !TARGET_MIPS16"
+ "ISA_HAS_MULT && !ISA_HAS_MULHI && !TARGET_MIPS16"
{ return TARGET_FIX_R4000 ? "mult<u>\t%1,%2\n\tmfhi\t%0" : "#"; }
"&& reload_completed && !TARGET_FIX_R4000"
[(const_int 0)]
@@ -2193,17 +2368,34 @@
(mult:TI (any_extend:TI (match_operand:DI 1 "register_operand"))
(any_extend:TI (match_operand:DI 2 "register_operand")))
(const_int 64))))]
- "ISA_HAS_DMULT && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120)"
+ "ISA_HAS_R6DMUL
+ || (ISA_HAS_DMULT
+ && !(<CODE> == ZERO_EXTEND && TARGET_FIX_VR4120))"
{
if (TARGET_MIPS16)
emit_insn (gen_<su>muldi3_highpart_split (operands[0], operands[1],
operands[2]));
+ else if (ISA_HAS_R6DMUL)
+ emit_insn (gen_<su>muldi3_highpart_r6 (operands[0], operands[1],
+ operands[2]));
else
emit_insn (gen_<su>muldi3_highpart_internal (operands[0], operands[1],
operands[2]));
DONE;
})
+(define_insn "<su>muldi3_highpart_r6"
+ [(set (match_operand:DI 0 "register_operand" "=d")
+ (truncate:DI
+ (lshiftrt:TI
+ (mult:TI (any_extend:TI (match_operand:DI 1 "register_operand" "d"))
+ (any_extend:TI (match_operand:DI 2 "register_operand" "d")))
+ (const_int 64))))]
+ "ISA_HAS_R6DMUL"
+ "dmuh<u>\t%0,%1,%2"
+ [(set_attr "type" "imul3nc")
+ (set_attr "mode" "DI")])
+
(define_insn_and_split "<su>muldi3_highpart_internal"
[(set (match_operand:DI 0 "register_operand" "=d")
(truncate:DI
@@ -2342,6 +2534,16 @@
(set_attr "accum_in" "3")
(set_attr "mode" "<UNITMODE>")])
+(define_insn "fma<mode>4"
+ [(set (match_operand:ANYF 0 "register_operand" "=f")
+ (fma:ANYF (match_operand:ANYF 1 "register_operand" "f")
+ (match_operand:ANYF 2 "register_operand" "f")
+ (match_operand:ANYF 3 "register_operand" "0")))]
+ "ISA_HAS_FP_MADDF_MSUBF"
+ "maddf.<fmt>\t%0,%1,%2"
+ [(set_attr "type" "fmadd")
+ (set_attr "mode" "<UNITMODE>")])
+
(define_insn "*madd3<mode>"
[(set (match_operand:ANYF 0 "register_operand" "=f")
(plus:ANYF (mult:ANYF (match_operand:ANYF 1 "register_operand" "f")
@@ -2725,6 +2927,40 @@
{ return mips_output_division ("<GPR:d>div<u>\t%.,%1,%2", operands); }
[(set_attr "type" "idiv")
(set_attr "mode" "<GPR:MODE>")])
+
+;; Integer division and modulus.
+
+(define_insn "<u>div<mode>3"
+ [(set (match_operand:GPR 0 "register_operand" "=&d")
+ (any_div:GPR (match_operand:GPR 1 "register_operand" "d")
+ (match_operand:GPR 2 "register_operand" "d")))]
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>DIV"
+ {
+ if (TARGET_LOONGSON_2EF)
+ return mips_output_division ("<d>div<u>.g\t%0,%1,%2", operands);
+ else if (TARGET_LOONGSON_3A)
+ return mips_output_division ("gs<d>div<u>\t%0,%1,%2", operands);
+ else
+ return mips_output_division ("<d>div<u>\t%0,%1,%2", operands);
+ }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "<u>mod<mode>3"
+ [(set (match_operand:GPR 0 "register_operand" "=&d")
+ (any_mod:GPR (match_operand:GPR 1 "register_operand" "d")
+ (match_operand:GPR 2 "register_operand" "d")))]
+ "TARGET_LOONGSON_2EF || TARGET_LOONGSON_3A || ISA_HAS_R6<D>DIV"
+ {
+ if (TARGET_LOONGSON_2EF)
+ return mips_output_division ("<d>mod<u>.g\t%0,%1,%2", operands);
+ else if (TARGET_LOONGSON_3A)
+ return mips_output_division ("gs<d>mod<u>\t%0,%1,%2", operands);
+ else
+ return mips_output_division ("<d>mod<u>\t%0,%1,%2", operands);
+ }
+ [(set_attr "type" "idiv3")
+ (set_attr "mode" "<MODE>")])
;;
;; ....................
@@ -3870,7 +4106,7 @@
(sign_extract:GPR (match_operand:BLK 1 "memory_operand")
(match_operand 2 "const_int_operand")
(match_operand 3 "const_int_operand")))]
- "!TARGET_MIPS16"
+ "ISA_HAS_LWL_LWR"
{
if (mips_expand_ext_as_unaligned_load (operands[0], operands[1],
INTVAL (operands[2]),
@@ -3907,7 +4143,7 @@
(zero_extract:GPR (match_operand:BLK 1 "memory_operand")
(match_operand 2 "const_int_operand")
(match_operand 3 "const_int_operand")))]
- "!TARGET_MIPS16"
+ "ISA_HAS_LWL_LWR"
{
if (mips_expand_ext_as_unaligned_load (operands[0], operands[1],
INTVAL (operands[2]),
@@ -3958,7 +4194,7 @@
(match_operand 1 "const_int_operand")
(match_operand 2 "const_int_operand"))
(match_operand:GPR 3 "reg_or_0_operand"))]
- "!TARGET_MIPS16"
+ "ISA_HAS_LWL_LWR"
{
if (mips_expand_ins_as_unaligned_store (operands[0], operands[3],
INTVAL (operands[1]),
@@ -4139,7 +4375,10 @@
[(set (match_operand:DI 0 "register_operand" "=d")
(match_operand:DI 1 "absolute_symbolic_operand" ""))
(clobber (match_scratch:DI 2 "=&d"))]
- "TARGET_EXPLICIT_RELOCS && ABI_HAS_64BIT_SYMBOLS && cse_not_expected"
+ "!TARGET_MIPS16
+ && TARGET_EXPLICIT_RELOCS
+ && ABI_HAS_64BIT_SYMBOLS
+ && cse_not_expected"
"#"
"&& reload_completed"
[(set (match_dup 0) (high:DI (match_dup 3)))
@@ -4437,7 +4676,7 @@
(define_insn "*mov<mode>_internal"
[(set (match_operand:IMOVE32 0 "nonimmediate_operand" "=d,!u,!u,d,e,!u,!ks,d,ZS,ZT,m,*f,*f,*d,*m,*d,*z,*a,*d,*B*C*D,*B*C*D,*d,*m")
- (match_operand:IMOVE32 1 "move_operand" "d,J,Udb7,Yd,Yf,ZT,ZS,m,!ks,!u,dJ,*d*J,*m,*f,*f,*z,*d,*J*d,*a,*d,*m,*B*C*D,*B*C*D"))]
+ (match_operand:IMOVE32 1 "move_operand" "d,J,Udb7,Yd,Yf,ZT,ZS,m,!ks,!kbJ,dJ,*d*J,*m,*f,*f,*z,*d,*J*d,*a,*d,*m,*B*C*D,*B*C*D"))]
"!TARGET_MIPS16
&& (register_operand (operands[0], <MODE>mode)
|| reg_or_0_operand (operands[1], <MODE>mode))"
@@ -4578,7 +4817,7 @@
(define_insn "*movhi_internal"
[(set (match_operand:HI 0 "nonimmediate_operand" "=d,!u,d,!u,d,ZU,m,*a,*d")
- (match_operand:HI 1 "move_operand" "d,J,I,ZU,m,!u,dJ,*d*J,*a"))]
+ (match_operand:HI 1 "move_operand" "d,J,I,ZU,m,!kbJ,dJ,*d*J,*a"))]
"!TARGET_MIPS16
&& (register_operand (operands[0], HImode)
|| reg_or_0_operand (operands[1], HImode))"
@@ -4654,7 +4893,7 @@
(define_insn "*movqi_internal"
[(set (match_operand:QI 0 "nonimmediate_operand" "=d,!u,d,!u,d,ZV,m,*a,*d")
- (match_operand:QI 1 "move_operand" "d,J,I,ZW,m,!u,dJ,*d*J,*a"))]
+ (match_operand:QI 1 "move_operand" "d,J,I,ZW,m,!kbJ,dJ,*d*J,*a"))]
"!TARGET_MIPS16
&& (register_operand (operands[0], QImode)
|| reg_or_0_operand (operands[1], QImode))"
@@ -4711,6 +4950,13 @@
DONE;
})
+(define_insn "movccf"
+ [(set (match_operand:CCF 0 "nonimmediate_operand" "=f,f,m")
+ (match_operand:CCF 1 "nonimmediate_operand" "f,m,f"))]
+ "ISA_HAS_CCF"
+ { return mips_output_move (operands[0], operands[1]); }
+ [(set_attr "move_type" "fmove,fpload,fpstore")])
+
(define_insn "*movsf_hardfloat"
[(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f,m,m,*f,*d,*d,*d,*m")
(match_operand:SF 1 "move_operand" "f,G,m,f,G,*d,*f,*G*d,*m,*d"))]
@@ -4920,7 +5166,8 @@
(unspec:GPR [(match_operand:HILO 1 "hilo_operand" "x")]
UNSPEC_MFHI))]
""
- { return ISA_HAS_MACCHI ? "<GPR:d>macchi\t%0,%.,%." : "mfhi\t%0"; }
+ { return ISA_HAS_MACCHI ? "<GPR:d>macchi\t%0,%.,%." :
+ ISA_HAS_MULT ? "mfhi\t%0" : "mfhi\t%0,$ac0"; }
[(set_attr "type" "mfhi")
(set_attr "mode" "<GPR:MODE>")])
@@ -4933,7 +5180,12 @@
(match_operand:GPR 2 "register_operand" "l")]
UNSPEC_MTHI))]
""
- "mthi\t%z1"
+ {
+ if (ISA_HAS_MULT)
+ return "mthi\t%z1";
+ else
+ return "mthi\t%z1, $ac0";
+ }
[(set_attr "type" "mthi")
(set_attr "mode" "SI")])
@@ -4954,7 +5206,7 @@
rtx low = mips_subword (operands[1], 0);
rtx high = mips_subword (operands[1], 1);
emit_insn (gen_load_low<mode> (operands[0], low));
- if (TARGET_FLOAT64 && !TARGET_64BIT)
+ if (ISA_HAS_MXHC1 && !TARGET_64BIT)
emit_insn (gen_mthc1<mode> (operands[0], high, operands[0]));
else
emit_insn (gen_load_high<mode> (operands[0], high, operands[0]));
@@ -4964,7 +5216,7 @@
rtx low = mips_subword (operands[0], 0);
rtx high = mips_subword (operands[0], 1);
emit_insn (gen_store_word<mode> (low, operands[1], const0_rtx));
- if (TARGET_FLOAT64 && !TARGET_64BIT)
+ if (ISA_HAS_MXHC1 && !TARGET_64BIT)
emit_insn (gen_mfhc1<mode> (high, operands[1]));
else
emit_insn (gen_store_word<mode> (high, operands[1], const1_rtx));
@@ -5229,7 +5481,7 @@
(define_insn "mips_cache"
[(set (mem:BLK (scratch))
(unspec:BLK [(match_operand:SI 0 "const_int_operand")
- (match_operand:QI 1 "address_operand" "p")]
+ (match_operand:QI 1 "address_operand" "ZD")]
UNSPEC_MIPS_CACHE))]
"ISA_HAS_CACHE"
"cache\t%X0,%a1")
@@ -5348,6 +5600,16 @@
(set_attr "mode" "SI")
(set_attr "extended_mips16" "no,no,yes")])
+(define_insn "<GPR:d>lsa"
+ [(set (match_operand:GPR 0 "register_operand" "=d")
+ (plus:GPR (mult:GPR (match_operand:GPR 1 "register_operand" "d")
+ (match_operand 2 "const_immlsa_operand" ""))
+ (match_operand:GPR 3 "register_operand" "d")))]
+ "ISA_HAS_<GPR:D>LSA"
+ "<GPR:d>lsa\t%0,%1,%3,%y2"
+ [(set_attr "type" "arith")
+ (set_attr "mode" "<GPR:MODE>")])
+
;; We need separate DImode MIPS16 patterns because of the irregularity
;; of right shifts.
(define_insn "*ashldi3_mips16"
@@ -5506,11 +5768,11 @@
;; Conditional branches on floating-point equality tests.
-(define_insn "*branch_fp"
+(define_insn "*branch_fp_<mode>"
[(set (pc)
(if_then_else
(match_operator 1 "equality_operator"
- [(match_operand:CC 2 "register_operand" "z")
+ [(match_operand:FPCC 2 "register_operand" "<reg>")
(const_int 0)])
(label_ref (match_operand 0 "" ""))
(pc)))]
@@ -5522,11 +5784,11 @@
}
[(set_attr "type" "branch")])
-(define_insn "*branch_fp_inverted"
+(define_insn "*branch_fp_inverted_<mode>"
[(set (pc)
(if_then_else
(match_operator 1 "equality_operator"
- [(match_operand:CC 2 "register_operand" "z")
+ [(match_operand:FPCC 2 "register_operand" "<reg>")
(const_int 0)])
(pc)
(label_ref (match_operand 0 "" ""))))]
@@ -5544,25 +5806,29 @@
[(set (pc)
(if_then_else
(match_operator 1 "order_operator"
- [(match_operand:GPR 2 "register_operand" "d")
- (const_int 0)])
+ [(match_operand:GPR 2 "register_operand" "d,d")
+ (match_operand:GPR 3 "reg_or_0_operand" "J,d")])
(label_ref (match_operand 0 "" ""))
(pc)))]
"!TARGET_MIPS16"
{ return mips_output_order_conditional_branch (insn, operands, false); }
- [(set_attr "type" "branch")])
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe,always")
+ (set_attr "hazard" "forbidden_slot")])
(define_insn "*branch_order<mode>_inverted"
[(set (pc)
(if_then_else
(match_operator 1 "order_operator"
- [(match_operand:GPR 2 "register_operand" "d")
- (const_int 0)])
+ [(match_operand:GPR 2 "register_operand" "d,d")
+ (match_operand:GPR 3 "reg_or_0_operand" "J,d")])
(pc)
(label_ref (match_operand 0 "" ""))))]
"!TARGET_MIPS16"
{ return mips_output_order_conditional_branch (insn, operands, true); }
- [(set_attr "type" "branch")])
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe,always")
+ (set_attr "hazard" "forbidden_slot")])
;; Conditional branch on equality comparison.
@@ -5575,20 +5841,10 @@
(label_ref (match_operand 0 "" ""))
(pc)))]
"!TARGET_MIPS16"
-{
- /* For a simple BNEZ or BEQZ microMIPS branch. */
- if (TARGET_MICROMIPS
- && operands[3] == const0_rtx
- && get_attr_length (insn) <= 8)
- return mips_output_conditional_branch (insn, operands,
- "%*b%C1z%:\t%2,%0",
- "%*b%N1z%:\t%2,%0");
-
- return mips_output_conditional_branch (insn, operands,
- MIPS_BRANCH ("b%C1", "%2,%z3,%0"),
- MIPS_BRANCH ("b%N1", "%2,%z3,%0"));
-}
- [(set_attr "type" "branch")])
+ { return mips_output_equal_conditional_branch (insn, operands, false); }
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe")
+ (set_attr "hazard" "forbidden_slot")])
(define_insn "*branch_equality<mode>_inverted"
[(set (pc)
@@ -5599,20 +5855,10 @@
(pc)
(label_ref (match_operand 0 "" ""))))]
"!TARGET_MIPS16"
-{
- /* For a simple BNEZ or BEQZ microMIPS branch. */
- if (TARGET_MICROMIPS
- && operands[3] == const0_rtx
- && get_attr_length (insn) <= 8)
- return mips_output_conditional_branch (insn, operands,
- "%*b%N0z%:\t%2,%1",
- "%*b%C0z%:\t%2,%1");
-
- return mips_output_conditional_branch (insn, operands,
- MIPS_BRANCH ("b%N1", "%2,%z3,%0"),
- MIPS_BRANCH ("b%C1", "%2,%z3,%0"));
-}
- [(set_attr "type" "branch")])
+ { return mips_output_equal_conditional_branch (insn, operands, true); }
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe")
+ (set_attr "hazard" "forbidden_slot")])
;; MIPS16 branches
@@ -5870,21 +6116,21 @@
;;
;; ....................
-(define_insn "s<code>_<mode>"
- [(set (match_operand:CC 0 "register_operand" "=z")
- (fcond:CC (match_operand:SCALARF 1 "register_operand" "f")
- (match_operand:SCALARF 2 "register_operand" "f")))]
+(define_insn "s<code>_<SCALARF:mode>_using_<FPCC:mode>"
+ [(set (match_operand:FPCC 0 "register_operand" "=<reg>")
+ (fcond:FPCC (match_operand:SCALARF 1 "register_operand" "f")
+ (match_operand:SCALARF 2 "register_operand" "f")))]
""
- "c.<fcond>.<fmt>\t%Z0%1,%2"
+ "<fpcmp>.<fcond>.<fmt>\t%Z0%1,%2"
[(set_attr "type" "fcmp")
(set_attr "mode" "FPSW")])
-(define_insn "s<code>_<mode>"
- [(set (match_operand:CC 0 "register_operand" "=z")
- (swapped_fcond:CC (match_operand:SCALARF 1 "register_operand" "f")
- (match_operand:SCALARF 2 "register_operand" "f")))]
+(define_insn "s<code>_<SCALARF:mode>_using_<FPCC:mode>"
+ [(set (match_operand:FPCC 0 "register_operand" "=<reg>")
+ (swapped_fcond:FPCC (match_operand:SCALARF 1 "register_operand" "f")
+ (match_operand:SCALARF 2 "register_operand" "f")))]
""
- "c.<swapped_fcond>.<fmt>\t%Z0%2,%1"
+ "<fpcmp>.<swapped_fcond>.<fmt>\t%Z0%2,%1"
[(set_attr "type" "fcmp")
(set_attr "mode" "FPSW")])
@@ -5906,14 +6152,23 @@
(label_ref (match_operand 0)))]
"!TARGET_MIPS16 && TARGET_ABSOLUTE_JUMPS"
{
- /* Use a branch for microMIPS. The assembler will choose
- a 16-bit branch, a 32-bit branch, or a 32-bit jump. */
- if (TARGET_MICROMIPS && !TARGET_ABICALLS_PIC2)
- return "%*b\t%l0%/";
+ if (get_attr_length (insn) <= 8)
+ {
+ if (TARGET_CB_MAYBE)
+ return MIPS_ABSOLUTE_JUMP ("%*b%:\t%l0");
+ else
+ return MIPS_ABSOLUTE_JUMP ("%*b\t%l0%/");
+ }
else
- return MIPS_ABSOLUTE_JUMP ("%*j\t%l0%/");
+ {
+ if (TARGET_CB_MAYBE && !final_sequence)
+ return MIPS_ABSOLUTE_JUMP ("%*bc\t%l0");
+ else
+ return MIPS_ABSOLUTE_JUMP ("%*j\t%l0%/");
+ }
}
- [(set_attr "type" "jump")])
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe")])
(define_insn "*jump_pic"
[(set (pc)
@@ -5921,14 +6176,23 @@
"!TARGET_MIPS16 && !TARGET_ABSOLUTE_JUMPS"
{
if (get_attr_length (insn) <= 8)
- return "%*b\t%l0%/";
+ {
+ if (TARGET_CB_MAYBE)
+ return "%*b%:\t%l0";
+ else
+ return "%*b\t%l0%/";
+ }
else
{
mips_output_load_label (operands[0]);
- return "%*jr\t%@%/%]";
+ if (TARGET_CB_MAYBE)
+ return "%*jr%:\t%@%]";
+ else
+ return "%*jr\t%@%/%]";
}
}
- [(set_attr "type" "branch")])
+ [(set_attr "type" "branch")
+ (set_attr "compact_form" "maybe")])
;; We need a different insn for the mips16, because a mips16 branch
;; does not have a delay slot.
@@ -5975,12 +6239,9 @@
(define_insn "indirect_jump_<mode>"
[(set (pc) (match_operand:P 0 "register_operand" "d"))]
""
-{
- if (TARGET_MICROMIPS)
- return "%*jr%:\t%0";
- else
- return "%*j\t%0%/";
-}
+ {
+ return mips_output_jump (operands, 0, -1, false);
+ }
[(set_attr "type" "jump")
(set_attr "mode" "none")])
@@ -6024,12 +6285,9 @@
(match_operand:P 0 "register_operand" "d"))
(use (label_ref (match_operand 1 "" "")))]
""
-{
- if (TARGET_MICROMIPS)
- return "%*jr%:\t%0";
- else
- return "%*j\t%0%/";
-}
+ {
+ return mips_output_jump (operands, 0, -1, false);
+ }
[(set_attr "type" "jump")
(set_attr "mode" "none")])
@@ -6091,10 +6349,10 @@
rtx diff_vec = PATTERN (NEXT_INSN (operands[2]));
gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
-
+
output_asm_insn ("sltu\t%0, %1", operands);
output_asm_insn ("bteqz\t%3", operands);
-
+
switch (GET_MODE (diff_vec))
{
case HImode:
@@ -6241,10 +6499,8 @@
[(any_return)]
""
{
- if (TARGET_MICROMIPS)
- return "%*jr%:\t$31";
- else
- return "%*j\t$31%/";
+ operands[0] = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+ return mips_output_jump (operands, 0, -1, false);
}
[(set_attr "type" "jump")
(set_attr "mode" "none")])
@@ -6255,12 +6511,10 @@
[(any_return)
(use (match_operand 0 "pmode_register_operand" ""))]
""
-{
- if (TARGET_MICROMIPS)
- return "%*jr%:\t%0";
- else
- return "%*j\t%0%/";
-}
+ {
+ operands[0] = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
+ return mips_output_jump (operands, 0, -1, false);
+ }
[(set_attr "type" "jump")
(set_attr "mode" "none")])
@@ -6516,12 +6770,7 @@
[(call (mem:SI (match_operand 0 "call_insn_operand" "j,S"))
(match_operand 1 "" ""))]
"TARGET_SIBCALLS && SIBLING_CALL_P (insn)"
-{
- if (TARGET_MICROMIPS)
- return MICROMIPS_J ("j", operands, 0);
- else
- return MIPS_CALL ("j", operands, 0, 1);
-}
+ { return mips_output_jump (operands, 0, 1, false); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6542,12 +6791,7 @@
(call (mem:SI (match_operand 1 "call_insn_operand" "j,S"))
(match_operand 2 "" "")))]
"TARGET_SIBCALLS && SIBLING_CALL_P (insn)"
-{
- if (TARGET_MICROMIPS)
- return MICROMIPS_J ("j", operands, 1);
- else
- return MIPS_CALL ("j", operands, 1, 2);
-}
+ { return mips_output_jump (operands, 1, 2, false); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6559,12 +6803,7 @@
(call (mem:SI (match_dup 1))
(match_dup 2)))]
"TARGET_SIBCALLS && SIBLING_CALL_P (insn)"
-{
- if (TARGET_MICROMIPS)
- return MICROMIPS_J ("j", operands, 1);
- else
- return MIPS_CALL ("j", operands, 1, 2);
-}
+ { return mips_output_jump (operands, 1, 2, false); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6620,7 +6859,10 @@
(match_operand 1 "" ""))
(clobber (reg:SI RETURN_ADDR_REGNUM))]
""
- { return TARGET_SPLIT_CALLS ? "#" : MIPS_CALL ("jal", operands, 0, 1); }
+ {
+ return (TARGET_SPLIT_CALLS ? "#"
+ : mips_output_jump (operands, 0, 1, true));
+ }
"reload_completed && TARGET_SPLIT_CALLS"
[(const_int 0)]
{
@@ -6635,7 +6877,7 @@
(clobber (reg:SI RETURN_ADDR_REGNUM))
(clobber (reg:SI 28))]
"TARGET_SPLIT_CALLS"
- { return MIPS_CALL ("jal", operands, 0, 1); }
+ { return mips_output_jump (operands, 0, 1, true); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6649,7 +6891,10 @@
(const_int 1)
(clobber (reg:SI RETURN_ADDR_REGNUM))]
""
- { return TARGET_SPLIT_CALLS ? "#" : MIPS_CALL ("jal", operands, 0, -1); }
+ {
+ return (TARGET_SPLIT_CALLS ? "#"
+ : mips_output_jump (operands, 0, -1, true));
+ }
"reload_completed && TARGET_SPLIT_CALLS"
[(const_int 0)]
{
@@ -6666,7 +6911,7 @@
(clobber (reg:SI RETURN_ADDR_REGNUM))
(clobber (reg:SI 28))]
"TARGET_SPLIT_CALLS"
- { return MIPS_CALL ("jal", operands, 0, -1); }
+ { return mips_output_jump (operands, 0, -1, true); }
[(set_attr "jal" "direct")
(set_attr "jal_macro" "no")])
@@ -6689,7 +6934,10 @@
(match_operand 2 "" "")))
(clobber (reg:SI RETURN_ADDR_REGNUM))]
""
- { return TARGET_SPLIT_CALLS ? "#" : MIPS_CALL ("jal", operands, 1, 2); }
+ {
+ return (TARGET_SPLIT_CALLS ? "#"
+ : mips_output_jump (operands, 1, 2, true));
+ }
"reload_completed && TARGET_SPLIT_CALLS"
[(const_int 0)]
{
@@ -6707,7 +6955,7 @@
(clobber (reg:SI RETURN_ADDR_REGNUM))
(clobber (reg:SI 28))]
"TARGET_SPLIT_CALLS"
- { return MIPS_CALL ("jal", operands, 1, 2); }
+ { return mips_output_jump (operands, 1, 2, true); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6719,7 +6967,10 @@
(const_int 1)
(clobber (reg:SI RETURN_ADDR_REGNUM))]
""
- { return TARGET_SPLIT_CALLS ? "#" : MIPS_CALL ("jal", operands, 1, -1); }
+ {
+ return (TARGET_SPLIT_CALLS ? "#"
+ : mips_output_jump (operands, 1, -1, true));
+ }
"reload_completed && TARGET_SPLIT_CALLS"
[(const_int 0)]
{
@@ -6738,7 +6989,7 @@
(clobber (reg:SI RETURN_ADDR_REGNUM))
(clobber (reg:SI 28))]
"TARGET_SPLIT_CALLS"
- { return MIPS_CALL ("jal", operands, 1, -1); }
+ { return mips_output_jump (operands, 1, -1, true); }
[(set_attr "jal" "direct")
(set_attr "jal_macro" "no")])
@@ -6752,7 +7003,10 @@
(match_dup 2)))
(clobber (reg:SI RETURN_ADDR_REGNUM))]
""
- { return TARGET_SPLIT_CALLS ? "#" : MIPS_CALL ("jal", operands, 1, 2); }
+ {
+ return (TARGET_SPLIT_CALLS ? "#"
+ : mips_output_jump (operands, 1, 2, true));
+ }
"reload_completed && TARGET_SPLIT_CALLS"
[(const_int 0)]
{
@@ -6773,7 +7027,7 @@
(clobber (reg:SI RETURN_ADDR_REGNUM))
(clobber (reg:SI 28))]
"TARGET_SPLIT_CALLS"
- { return MIPS_CALL ("jal", operands, 1, 2); }
+ { return mips_output_jump (operands, 1, 2, true); }
[(set_attr "jal" "indirect,direct")
(set_attr "jal_macro" "no")])
@@ -6904,6 +7158,41 @@
[(set_attr "type" "condmove")
(set_attr "mode" "<SCALARF:MODE>")])
+(define_insn "*sel<code><GPR:mode>_using_<GPR2:mode>"
+ [(set (match_operand:GPR 0 "register_operand" "=d,d")
+ (if_then_else:GPR
+ (equality_op:GPR2 (match_operand:GPR2 1 "register_operand" "d,d")
+ (const_int 0))
+ (match_operand:GPR 2 "reg_or_0_operand" "d,J")
+ (match_operand:GPR 3 "reg_or_0_operand" "J,d")))]
+ "ISA_HAS_SEL
+ && (register_operand (operands[2], <GPR:MODE>mode)
+ != register_operand (operands[3], <GPR:MODE>mode))"
+ "@
+ <sel>\t%0,%2,%1
+ <selinv>\t%0,%3,%1"
+ [(set_attr "type" "condmove")
+ (set_attr "mode" "<GPR:MODE>")])
+
+;; sel.fmt copies the 3rd argument when the 1st is non-zero and the 2nd
+;; argument if the 1st is zero. This means operand 2 and 3 are
+;; inverted in the instruction.
+
+(define_insn "*sel<mode>"
+ [(set (match_operand:SCALARF 0 "register_operand" "=f,f,f")
+ (if_then_else:SCALARF
+ (ne:CCF (match_operand:CCF 1 "register_operand" "0,f,f")
+ (const_int 0))
+ (match_operand:SCALARF 2 "reg_or_0_operand" "f,G,f")
+ (match_operand:SCALARF 3 "reg_or_0_operand" "f,f,G")))]
+ "ISA_HAS_SEL && ISA_HAS_CCF"
+ "@
+ sel.<fmt>\t%0,%3,%2
+ seleqz.<fmt>\t%0,%3,%1
+ selnez.<fmt>\t%0,%2,%1"
+ [(set_attr "type" "condmove")
+ (set_attr "mode" "<SCALARF:MODE>")])
+
;; These are the main define_expand's used to make conditional moves.
(define_expand "mov<mode>cc"
@@ -6912,8 +7201,11 @@
(if_then_else:GPR (match_dup 5)
(match_operand:GPR 2 "reg_or_0_operand")
(match_operand:GPR 3 "reg_or_0_operand")))]
- "ISA_HAS_CONDMOVE"
+ "ISA_HAS_CONDMOVE || ISA_HAS_SEL"
{
+ if (ISA_HAS_SEL && !INTEGRAL_MODE_P (GET_MODE (XEXP (operands[1], 0))))
+ FAIL;
+
mips_expand_conditional_move (operands);
DONE;
})
@@ -6922,10 +7214,25 @@
[(set (match_dup 4) (match_operand 1 "comparison_operator"))
(set (match_operand:SCALARF 0 "register_operand")
(if_then_else:SCALARF (match_dup 5)
- (match_operand:SCALARF 2 "register_operand")
- (match_operand:SCALARF 3 "register_operand")))]
- "ISA_HAS_FP_CONDMOVE"
+ (match_operand:SCALARF 2 "reg_or_0_operand")
+ (match_operand:SCALARF 3 "reg_or_0_operand")))]
+ "ISA_HAS_FP_CONDMOVE
+ || (ISA_HAS_SEL && ISA_HAS_CCF)"
{
+ if (ISA_HAS_SEL && !FLOAT_MODE_P (GET_MODE (XEXP (operands[1], 0))))
+ FAIL;
+
+  /* Work around an LRA bug: tied operands in the sel.fmt pattern cause
+     the double-precision destination of sel.d to be reloaded with the
+     full register file usable, so the restriction that the CCFmode input
+     may only live in even-numbered single-precision registers is ignored.
+     For consistency, CCFmode values must be guaranteed to exist only in
+     even-numbered registers because of the unusual duality between
+     single- and double-precision values.  */
+ if (ISA_HAS_SEL && <MODE>mode == DFmode
+ && (!TARGET_ODD_SPREG || TARGET_FLOATXX))
+ FAIL;
+
mips_expand_conditional_move (operands);
DONE;
})
@@ -7040,7 +7347,12 @@
[(set (reg:P TLS_GET_TP_REGNUM)
(unspec:P [(const_int 0)] UNSPEC_TLS_GET_TP))]
"HAVE_AS_TLS && !TARGET_MIPS16"
- ".set\tpush\;.set\tmips32r2\t\;rdhwr\t$3,$29\;.set\tpop"
+ {
+ if (mips_isa_rev >= 2)
+ return "rdhwr\t$3,$29";
+
+ return ".set\tpush\;.set\tmips32r2\t\;rdhwr\t$3,$29\;.set\tpop";
+ }
[(set_attr "type" "unknown")
; Since rdhwr always generates a trap for now, putting it in a delay
; slot would make the kernel's emulation of it much slower.
@@ -7085,7 +7397,7 @@
(clobber (reg:P PIC_FUNCTION_ADDR_REGNUM))
(clobber (reg:P RETURN_ADDR_REGNUM))]
"HAVE_AS_TLS && TARGET_MIPS16"
- { return MIPS_CALL ("jal", operands, 0, -1); }
+ { return mips_output_jump (operands, 0, -1, true); }
[(set_attr "type" "call")
(set_attr "insn_count" "3")
(set_attr "mode" "<MODE>")])
@@ -7126,7 +7438,7 @@
(clobber (reg:P PIC_FUNCTION_ADDR_REGNUM))
(clobber (reg:P RETURN_ADDR_REGNUM))]
"TARGET_HARD_FLOAT_ABI && TARGET_MIPS16"
- { return MIPS_CALL ("jal", operands, 0, -1); }
+ { return mips_output_jump (operands, 0, -1, true); }
[(set_attr "type" "call")
(set_attr "insn_count" "3")])
@@ -7156,9 +7468,101 @@
(clobber (reg:P PIC_FUNCTION_ADDR_REGNUM))
(clobber (reg:P RETURN_ADDR_REGNUM))]
"TARGET_HARD_FLOAT_ABI && TARGET_MIPS16"
- { return MIPS_CALL ("jal", operands, 0, -1); }
+ { return mips_output_jump (operands, 0, -1, true); }
[(set_attr "type" "call")
(set_attr "insn_count" "3")])
+
+(define_insn "*join2_load_store<JOIN_MODE:mode>"
+ [(set (match_operand:JOIN_MODE 0 "nonimmediate_operand" "=d,f,m,m")
+ (match_operand:JOIN_MODE 1 "nonimmediate_operand" "m,m,d,f"))
+ (set (match_operand:JOIN_MODE 2 "nonimmediate_operand" "=d,f,m,m")
+ (match_operand:JOIN_MODE 3 "nonimmediate_operand" "m,m,d,f"))]
+ "ENABLE_LD_ST_PAIRS && reload_completed"
+ {
+ bool load_p = (which_alternative == 0 || which_alternative == 1);
+ if (!load_p || !reg_overlap_mentioned_p (operands[0], operands[1]))
+ {
+ output_asm_insn (mips_output_move (operands[0], operands[1]), operands);
+ output_asm_insn (mips_output_move (operands[2], operands[3]), &operands[2]);
+ }
+ else
+ {
+ output_asm_insn (mips_output_move (operands[2], operands[3]), &operands[2]);
+ output_asm_insn (mips_output_move (operands[0], operands[1]), operands);
+ }
+ return "";
+ }
+ [(set_attr "move_type" "load,fpload,store,fpstore")
+ (set_attr "insn_count" "2,2,2,2")])
+
+;; 2 HI/SI/SF/DF loads are joined.
+;; P5600 does not support bonding of two LBs, hence QI mode is not included.
+(define_peephole2
+ [(set (match_operand:JOIN_MODE 0 "register_operand")
+ (match_operand:JOIN_MODE 1 "non_volatile_mem_operand"))
+ (set (match_operand:JOIN_MODE 2 "register_operand")
+ (match_operand:JOIN_MODE 3 "non_volatile_mem_operand"))]
+ "ENABLE_LD_ST_PAIRS &&
+ mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, true)"
+ [(parallel [(set (match_dup 0)
+ (match_dup 1))
+ (set (match_dup 2)
+ (match_dup 3))])]
+ "")
+
+;; 2 HI/SI/SF/DF stores are joined.
+;; P5600 does not support bonding of two SBs, hence QI mode is not included.
+(define_peephole2
+ [(set (match_operand:JOIN_MODE 0 "memory_operand")
+ (match_operand:JOIN_MODE 1 "register_operand"))
+ (set (match_operand:JOIN_MODE 2 "memory_operand")
+ (match_operand:JOIN_MODE 3 "register_operand"))]
+ "ENABLE_LD_ST_PAIRS &&
+ mips_load_store_bonding_p (operands, <JOIN_MODE:MODE>mode, false)"
+ [(parallel [(set (match_dup 0)
+ (match_dup 1))
+ (set (match_dup 2)
+ (match_dup 3))])]
+ "")
+
+(define_insn "*join2_loadhi"
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (any_extend:SI (match_operand:HI 1 "memory_operand" "m")))
+ (set (match_operand:SI 2 "register_operand" "=r")
+ (any_extend:SI (match_operand:HI 3 "memory_operand" "m")))]
+ "ENABLE_LD_ST_PAIRS && reload_completed"
+ {
+ if (!reg_overlap_mentioned_p (operands[0], operands[1]))
+ {
+ output_asm_insn ("lh<u>\t%0,%1", operands);
+ output_asm_insn ("lh<u>\t%2,%3", operands);
+ }
+ else
+ {
+ output_asm_insn ("lh<u>\t%2,%3", operands);
+ output_asm_insn ("lh<u>\t%0,%1", operands);
+ }
+
+ return "";
+ }
+ [(set_attr "move_type" "load")
+ (set_attr "insn_count" "2")])
+
+;; 2 16-bit integer loads are joined.
+(define_peephole2
+ [(set (match_operand:SI 0 "register_operand")
+ (any_extend:SI (match_operand:HI 1 "non_volatile_mem_operand")))
+ (set (match_operand:SI 2 "register_operand")
+ (any_extend:SI (match_operand:HI 3 "non_volatile_mem_operand")))]
+ "ENABLE_LD_ST_PAIRS &&
+ mips_load_store_bonding_p (operands, HImode, true)"
+ [(parallel [(set (match_dup 0)
+ (any_extend:SI (match_dup 1)))
+ (set (match_dup 2)
+ (any_extend:SI (match_dup 3)))])]
+ "")
+
;; Synchronization instructions.
@@ -7185,6 +7589,9 @@
; ST-Microelectronics Loongson-2E/2F-specific patterns.
(include "loongson.md")
+; The MIPS MSA Instructions.
+(include "mips-msa.md")
+
(define_c_enum "unspec" [
UNSPEC_ADDRESS_FIRST
])
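The *join2_load_store and *join2_loadhi patterns and their peephole2 drivers above bond two adjacent non-volatile memory accesses when ENABLE_LD_ST_PAIRS holds (P5600 or I6400 tuning with load/store bonding enabled). A minimal C sketch of the access pattern they target (illustrative; the function name and register choices are not from the patch):

/* Two adjacent halfword loads like these are candidates for the
   *join2_loadhi peephole: after reload they can be kept back to back
   so the core can bond them into a single paired access.  */
int
sum_adjacent_halfwords (const short *p)
{
  int a = p[0];  /* e.g. lh $t0,0(p)  -- first load (illustrative)  */
  int b = p[1];  /* e.g. lh $t1,2(p)  -- adjacent second load       */
  return a + b;
}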
diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt
index dd8aff479af..88a08f5fcf2 100644
--- a/gcc/config/mips/mips.opt
+++ b/gcc/config/mips/mips.opt
@@ -123,6 +123,10 @@ mdspr2
Target Report Var(TARGET_DSPR2)
Use MIPS-DSP REV 2 instructions
+mdspr3
+Target Report Var(TARGET_DSPR3)
+Use MIPS-DSP REV 3 instructions
+
mdebug
Target Var(TARGET_DEBUG_MODE) Undocumented
@@ -197,6 +201,10 @@ mfp32
Target Report RejectNegative InverseMask(FLOAT64)
Use 32-bit floating-point registers
+mfpxx
+Target Report RejectNegative Mask(FLOATXX)
+Conform to the o32 FPXX ABI
+
mfp64
Target Report RejectNegative Mask(FLOAT64)
Use 64-bit floating-point registers
@@ -303,6 +311,10 @@ mmicromips
Target Report Mask(MICROMIPS)
Use microMIPS instructions
+mmsa
+Target Report Var(TARGET_MSA)
+Use MIPS MSA Extension instructions
+
mmt
Target Report Var(TARGET_MT)
Allow the use of MT instructions
@@ -388,6 +400,10 @@ msynci
Target Report Mask(SYNCI)
Use synci instruction to invalidate i-cache
+mlra
+Target Report Var(mips_lra_flag) Init(1) Save
+Use LRA instead of reload
+
mtune=
Target RejectNegative Joined Var(mips_tune_option) ToLower Enum(mips_arch_opt_value)
-mtune=PROCESSOR Optimize the output for PROCESSOR
@@ -400,6 +416,10 @@ mvirt
Target Report Var(TARGET_VIRT)
Use Virtualization Application Specific instructions
+mxpa
+Target Report Var(TARGET_XPA)
+Use eXtended Physical Address (XPA) instructions
+
mvr4130-align
Target Report Mask(VR4130_ALIGN)
Perform VR4130-specific alignment optimizations
@@ -408,5 +428,36 @@ mxgot
Target Report Var(TARGET_XGOT)
Lift restrictions on GOT size
+modd-spreg
+Target Report Mask(ODD_SPREG)
+Enable use of odd-numbered single-precision registers
+
noasmopt
Driver
+
+mload-store-pairs
+Target Report Var(TARGET_LOAD_STORE_PAIRS) Undocumented
+Enable load/store bonding
+
+msched-weight
+Target Report Var(TARGET_SCHED_WEIGHT) Undocumented
+
+mcompact-branches=
+Target RejectNegative JoinedOrMissing Var(mips_cb) Report Enum(mips_cb_setting) Init(MIPS_CB_OPTIMAL)
+Specify the compact branch usage policy
+never Only use delay slot branches
+optimal Use compact branches where beneficial
+always Only use compact branches
+
+Enum
+Name(mips_cb_setting) Type(enum mips_cb_setting)
+Policies available for use with -mcompact-branches=
+
+EnumValue
+Enum(mips_cb_setting) String(never) Value(MIPS_CB_NEVER)
+
+EnumValue
+Enum(mips_cb_setting) String(optimal) Value(MIPS_CB_OPTIMAL)
+
+EnumValue
+Enum(mips_cb_setting) String(always) Value(MIPS_CB_ALWAYS)
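The -mcompact-branches= policy introduced above chooses between delay-slot and compact branch encodings on MIPS R6 (MIPS_CB_NEVER/OPTIMAL/ALWAYS, matching the compact_form and hazard attributes in mips.md). A small C example of a branch the policy affects, with the expected shapes sketched in comments (illustrative, not generated output):

/* Built for a MIPS R6 target, the conditional branch below may be emitted
   as a delay-slot branch (-mcompact-branches=never), as a compact branch
   with a forbidden slot (-mcompact-branches=always), or whichever form the
   compiler deems best (-mcompact-branches=optimal, the default).  */
int
select_value (int x, int a, int b)
{
  if (x == 0)
    return a;
  return b;
}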
diff --git a/gcc/config/mips/msa.h b/gcc/config/mips/msa.h
new file mode 100644
index 00000000000..fe2eaa1ac24
--- /dev/null
+++ b/gcc/config/mips/msa.h
@@ -0,0 +1,1121 @@
+/* MIPS MSA intrinsics include file.
+
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by Imagination Technologies Ltd.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#ifndef __clang__
+extern v16i8 __builtin_msa_sll_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_sll_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_sll_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_sll_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_slli_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_slli_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_slli_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_slli_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_sra_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_sra_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_sra_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_sra_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_srai_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_srai_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_srai_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_srai_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_srar_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_srar_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_srar_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_srar_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_srari_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_srari_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_srari_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_srari_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_srl_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_srl_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_srl_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_srl_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_srli_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_srli_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_srli_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_srli_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_srlr_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_srlr_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_srlr_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_srlr_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_srlri_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_srlri_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_srlri_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_srlri_d(v2i64, unsigned char);
+extern v16u8 __builtin_msa_bclr_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_bclr_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_bclr_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_bclr_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_bclri_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_bclri_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_bclri_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_bclri_d(v2u64, unsigned char);
+extern v16u8 __builtin_msa_bset_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_bset_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_bset_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_bset_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_bseti_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_bseti_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_bseti_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_bseti_d(v2u64, unsigned char);
+extern v16u8 __builtin_msa_bneg_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_bneg_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_bneg_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_bneg_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_bnegi_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_bnegi_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_bnegi_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_bnegi_d(v2u64, unsigned char);
+extern v16u8 __builtin_msa_binsl_b(v16u8, v16u8, v16u8);
+extern v8u16 __builtin_msa_binsl_h(v8u16, v8u16, v8u16);
+extern v4u32 __builtin_msa_binsl_w(v4u32, v4u32, v4u32);
+extern v2u64 __builtin_msa_binsl_d(v2u64, v2u64, v2u64);
+extern v16u8 __builtin_msa_binsli_b(v16u8, v16u8, unsigned char);
+extern v8u16 __builtin_msa_binsli_h(v8u16, v8u16, unsigned char);
+extern v4u32 __builtin_msa_binsli_w(v4u32, v4u32, unsigned char);
+extern v2u64 __builtin_msa_binsli_d(v2u64, v2u64, unsigned char);
+extern v16u8 __builtin_msa_binsr_b(v16u8, v16u8, v16u8);
+extern v8u16 __builtin_msa_binsr_h(v8u16, v8u16, v8u16);
+extern v4u32 __builtin_msa_binsr_w(v4u32, v4u32, v4u32);
+extern v2u64 __builtin_msa_binsr_d(v2u64, v2u64, v2u64);
+extern v16u8 __builtin_msa_binsri_b(v16u8, v16u8, unsigned char);
+extern v8u16 __builtin_msa_binsri_h(v8u16, v8u16, unsigned char);
+extern v4u32 __builtin_msa_binsri_w(v4u32, v4u32, unsigned char);
+extern v2u64 __builtin_msa_binsri_d(v2u64, v2u64, unsigned char);
+extern v16i8 __builtin_msa_addv_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_addv_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_addv_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_addv_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_addvi_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_addvi_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_addvi_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_addvi_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_subv_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_subv_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_subv_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_subv_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_subvi_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_subvi_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_subvi_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_subvi_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_max_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_max_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_max_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_max_s_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_maxi_s_b(v16i8, char);
+extern v8i16 __builtin_msa_maxi_s_h(v8i16, char);
+extern v4i32 __builtin_msa_maxi_s_w(v4i32, char);
+extern v2i64 __builtin_msa_maxi_s_d(v2i64, char);
+extern v16u8 __builtin_msa_max_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_max_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_max_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_max_u_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_maxi_u_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_maxi_u_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_maxi_u_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_maxi_u_d(v2u64, unsigned char);
+extern v16i8 __builtin_msa_min_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_min_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_min_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_min_s_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_mini_s_b(v16i8, char);
+extern v8i16 __builtin_msa_mini_s_h(v8i16, char);
+extern v4i32 __builtin_msa_mini_s_w(v4i32, char);
+extern v2i64 __builtin_msa_mini_s_d(v2i64, char);
+extern v16u8 __builtin_msa_min_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_min_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_min_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_min_u_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_mini_u_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_mini_u_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_mini_u_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_mini_u_d(v2u64, unsigned char);
+extern v16i8 __builtin_msa_max_a_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_max_a_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_max_a_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_max_a_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_min_a_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_min_a_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_min_a_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_min_a_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ceq_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ceq_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ceq_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ceq_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ceqi_b(v16i8, char);
+extern v8i16 __builtin_msa_ceqi_h(v8i16, char);
+extern v4i32 __builtin_msa_ceqi_w(v4i32, char);
+extern v2i64 __builtin_msa_ceqi_d(v2i64, char);
+extern v16i8 __builtin_msa_clt_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_clt_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_clt_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_clt_s_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_clti_s_b(v16i8, char);
+extern v8i16 __builtin_msa_clti_s_h(v8i16, char);
+extern v4i32 __builtin_msa_clti_s_w(v4i32, char);
+extern v2i64 __builtin_msa_clti_s_d(v2i64, char);
+extern v16i8 __builtin_msa_clt_u_b(v16u8, v16u8);
+extern v8i16 __builtin_msa_clt_u_h(v8u16, v8u16);
+extern v4i32 __builtin_msa_clt_u_w(v4u32, v4u32);
+extern v2i64 __builtin_msa_clt_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_clti_u_b(v16u8, unsigned char);
+extern v8i16 __builtin_msa_clti_u_h(v8u16, unsigned char);
+extern v4i32 __builtin_msa_clti_u_w(v4u32, unsigned char);
+extern v2i64 __builtin_msa_clti_u_d(v2u64, unsigned char);
+extern v16i8 __builtin_msa_cle_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_cle_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_cle_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_cle_s_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_clei_s_b(v16i8, char);
+extern v8i16 __builtin_msa_clei_s_h(v8i16, char);
+extern v4i32 __builtin_msa_clei_s_w(v4i32, char);
+extern v2i64 __builtin_msa_clei_s_d(v2i64, char);
+extern v16i8 __builtin_msa_cle_u_b(v16u8, v16u8);
+extern v8i16 __builtin_msa_cle_u_h(v8u16, v8u16);
+extern v4i32 __builtin_msa_cle_u_w(v4u32, v4u32);
+extern v2i64 __builtin_msa_cle_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_clei_u_b(v16u8, unsigned char);
+extern v8i16 __builtin_msa_clei_u_h(v8u16, unsigned char);
+extern v4i32 __builtin_msa_clei_u_w(v4u32, unsigned char);
+extern v2i64 __builtin_msa_clei_u_d(v2u64, unsigned char);
+extern v16i8 __builtin_msa_ld_b(void *, int);
+extern v8i16 __builtin_msa_ld_h(void *, int);
+extern v4i32 __builtin_msa_ld_w(void *, int);
+extern v2i64 __builtin_msa_ld_d(void *, int);
+extern void __builtin_msa_st_b(v16i8, char *, int);
+extern void __builtin_msa_st_h(v8i16, char *, int);
+extern void __builtin_msa_st_w(v4i32, char *, int);
+extern void __builtin_msa_st_d(v2i64, char *, int);
+extern v16i8 __builtin_msa_sat_s_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_sat_s_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_sat_s_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_sat_s_d(v2i64, unsigned char);
+extern v16u8 __builtin_msa_sat_u_b(v16u8, unsigned char);
+extern v8u16 __builtin_msa_sat_u_h(v8u16, unsigned char);
+extern v4u32 __builtin_msa_sat_u_w(v4u32, unsigned char);
+extern v2u64 __builtin_msa_sat_u_d(v2u64, unsigned char);
+extern v16i8 __builtin_msa_add_a_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_add_a_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_add_a_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_add_a_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_adds_a_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_adds_a_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_adds_a_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_adds_a_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_adds_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_adds_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_adds_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_adds_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_adds_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_adds_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_adds_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_adds_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_ave_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ave_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ave_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ave_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_ave_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_ave_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_ave_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_ave_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_aver_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_aver_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_aver_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_aver_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_aver_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_aver_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_aver_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_aver_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_subs_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_subs_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_subs_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_subs_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_subs_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_subs_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_subs_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_subs_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_subsuu_s_b(v16u8, v16u8);
+extern v8i16 __builtin_msa_subsuu_s_h(v8u16, v8u16);
+extern v4i32 __builtin_msa_subsuu_s_w(v4u32, v4u32);
+extern v2i64 __builtin_msa_subsuu_s_d(v2u64, v2u64);
+extern v16u8 __builtin_msa_subsus_u_b(v16u8, v16i8);
+extern v8u16 __builtin_msa_subsus_u_h(v8u16, v8i16);
+extern v4u32 __builtin_msa_subsus_u_w(v4u32, v4i32);
+extern v2u64 __builtin_msa_subsus_u_d(v2u64, v2i64);
+extern v16i8 __builtin_msa_asub_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_asub_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_asub_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_asub_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_asub_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_asub_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_asub_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_asub_u_d(v2u64, v2u64);
+extern v16i8 __builtin_msa_mulv_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_mulv_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_mulv_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_mulv_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_maddv_b(v16i8, v16i8, v16i8);
+extern v8i16 __builtin_msa_maddv_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_maddv_w(v4i32, v4i32, v4i32);
+extern v2i64 __builtin_msa_maddv_d(v2i64, v2i64, v2i64);
+extern v16i8 __builtin_msa_msubv_b(v16i8, v16i8, v16i8);
+extern v8i16 __builtin_msa_msubv_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_msubv_w(v4i32, v4i32, v4i32);
+extern v2i64 __builtin_msa_msubv_d(v2i64, v2i64, v2i64);
+extern v16i8 __builtin_msa_div_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_div_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_div_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_div_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_div_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_div_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_div_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_div_u_d(v2u64, v2u64);
+extern v8i16 __builtin_msa_hadd_s_h(v16i8, v16i8);
+extern v4i32 __builtin_msa_hadd_s_w(v8i16, v8i16);
+extern v2i64 __builtin_msa_hadd_s_d(v4i32, v4i32);
+extern v8u16 __builtin_msa_hadd_u_h(v16u8, v16u8);
+extern v4u32 __builtin_msa_hadd_u_w(v8u16, v8u16);
+extern v2u64 __builtin_msa_hadd_u_d(v4u32, v4u32);
+extern v8i16 __builtin_msa_hsub_s_h(v16i8, v16i8);
+extern v4i32 __builtin_msa_hsub_s_w(v8i16, v8i16);
+extern v2i64 __builtin_msa_hsub_s_d(v4i32, v4i32);
+extern v8i16 __builtin_msa_hsub_u_h(v16u8, v16u8);
+extern v4i32 __builtin_msa_hsub_u_w(v8u16, v8u16);
+extern v2i64 __builtin_msa_hsub_u_d(v4u32, v4u32);
+extern v16i8 __builtin_msa_mod_s_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_mod_s_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_mod_s_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_mod_s_d(v2i64, v2i64);
+extern v16u8 __builtin_msa_mod_u_b(v16u8, v16u8);
+extern v8u16 __builtin_msa_mod_u_h(v8u16, v8u16);
+extern v4u32 __builtin_msa_mod_u_w(v4u32, v4u32);
+extern v2u64 __builtin_msa_mod_u_d(v2u64, v2u64);
+extern v8i16 __builtin_msa_dotp_s_h(v16i8, v16i8);
+extern v4i32 __builtin_msa_dotp_s_w(v8i16, v8i16);
+extern v2i64 __builtin_msa_dotp_s_d(v4i32, v4i32);
+extern v8u16 __builtin_msa_dotp_u_h(v16u8, v16u8);
+extern v4u32 __builtin_msa_dotp_u_w(v8u16, v8u16);
+extern v2u64 __builtin_msa_dotp_u_d(v4u32, v4u32);
+extern v8i16 __builtin_msa_dpadd_s_h(v8i16, v16i8, v16i8);
+extern v4i32 __builtin_msa_dpadd_s_w(v4i32, v8i16, v8i16);
+extern v2i64 __builtin_msa_dpadd_s_d(v2i64, v4i32, v4i32);
+extern v8u16 __builtin_msa_dpadd_u_h(v8u16, v16u8, v16u8);
+extern v4u32 __builtin_msa_dpadd_u_w(v4u32, v8u16, v8u16);
+extern v2u64 __builtin_msa_dpadd_u_d(v2u64, v4u32, v4u32);
+extern v8i16 __builtin_msa_dpsub_s_h(v8i16, v16i8, v16i8);
+extern v4i32 __builtin_msa_dpsub_s_w(v4i32, v8i16, v8i16);
+extern v2i64 __builtin_msa_dpsub_s_d(v2i64, v4i32, v4i32);
+extern v8i16 __builtin_msa_dpsub_u_h(v8i16, v16u8, v16u8);
+extern v4i32 __builtin_msa_dpsub_u_w(v4i32, v8u16, v8u16);
+extern v2i64 __builtin_msa_dpsub_u_d(v2i64, v4u32, v4u32);
+extern v16i8 __builtin_msa_sld_b(v16i8, v16i8, int);
+extern v8i16 __builtin_msa_sld_h(v8i16, v8i16, int);
+extern v4i32 __builtin_msa_sld_w(v4i32, v4i32, int);
+extern v2i64 __builtin_msa_sld_d(v2i64, v2i64, int);
+extern v16i8 __builtin_msa_sldi_b(v16i8, v16i8, unsigned char);
+extern v8i16 __builtin_msa_sldi_h(v8i16, v8i16, unsigned char);
+extern v4i32 __builtin_msa_sldi_w(v4i32, v4i32, unsigned char);
+extern v2i64 __builtin_msa_sldi_d(v2i64, v2i64, unsigned char);
+extern v16i8 __builtin_msa_splat_b(v16i8, int);
+extern v8i16 __builtin_msa_splat_h(v8i16, int);
+extern v4i32 __builtin_msa_splat_w(v4i32, int);
+extern v2i64 __builtin_msa_splat_d(v2i64, int);
+extern v16i8 __builtin_msa_splati_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_splati_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_splati_w(v4i32, unsigned char);
+extern v2i64 __builtin_msa_splati_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_pckev_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_pckev_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_pckev_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_pckev_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_pckod_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_pckod_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_pckod_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_pckod_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ilvl_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ilvl_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ilvl_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ilvl_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ilvr_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ilvr_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ilvr_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ilvr_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ilvev_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ilvev_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ilvev_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ilvev_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_ilvod_b(v16i8, v16i8);
+extern v8i16 __builtin_msa_ilvod_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_ilvod_w(v4i32, v4i32);
+extern v2i64 __builtin_msa_ilvod_d(v2i64, v2i64);
+extern v16i8 __builtin_msa_vshf_b(v16i8, v16i8, v16i8);
+extern v8i16 __builtin_msa_vshf_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_vshf_w(v4i32, v4i32, v4i32);
+extern v2i64 __builtin_msa_vshf_d(v2i64, v2i64, v2i64);
+extern v16u8 __builtin_msa_and_v(v16u8, v16u8);
+extern v16u8 __builtin_msa_andi_b(v16u8, unsigned char);
+extern v16u8 __builtin_msa_or_v(v16u8, v16u8);
+extern v16u8 __builtin_msa_ori_b(v16u8, unsigned char);
+extern v16u8 __builtin_msa_nor_v(v16u8, v16u8);
+extern v16u8 __builtin_msa_nori_b(v16u8, unsigned char);
+extern v16u8 __builtin_msa_xor_v(v16u8, v16u8);
+extern v16u8 __builtin_msa_xori_b(v16u8, unsigned char);
+extern v16u8 __builtin_msa_bmnz_v(v16u8, v16u8, v16u8);
+extern v16u8 __builtin_msa_bmnzi_b(v16u8, v16u8, unsigned char);
+extern v16u8 __builtin_msa_bmz_v(v16u8, v16u8, v16u8);
+extern v16u8 __builtin_msa_bmzi_b(v16u8, v16u8, unsigned char);
+extern v16u8 __builtin_msa_bsel_v(v16u8, v16u8, v16u8);
+extern v16u8 __builtin_msa_bseli_b(v16u8, v16u8, unsigned char);
+extern v16i8 __builtin_msa_shf_b(v16i8, unsigned char);
+extern v8i16 __builtin_msa_shf_h(v8i16, unsigned char);
+extern v4i32 __builtin_msa_shf_w(v4i32, unsigned char);
+extern int __builtin_msa_bnz_v(v16u8);
+extern int __builtin_msa_bz_v(v16u8);
+extern v16i8 __builtin_msa_fill_b(int);
+extern v8i16 __builtin_msa_fill_h(int);
+extern v4i32 __builtin_msa_fill_w(int);
+extern v2i64 __builtin_msa_fill_d(long long);
+extern v16i8 __builtin_msa_pcnt_b(v16i8);
+extern v8i16 __builtin_msa_pcnt_h(v8i16);
+extern v4i32 __builtin_msa_pcnt_w(v4i32);
+extern v2i64 __builtin_msa_pcnt_d(v2i64);
+extern v16i8 __builtin_msa_nloc_b(v16i8);
+extern v8i16 __builtin_msa_nloc_h(v8i16);
+extern v4i32 __builtin_msa_nloc_w(v4i32);
+extern v2i64 __builtin_msa_nloc_d(v2i64);
+extern v16i8 __builtin_msa_nlzc_b(v16i8);
+extern v8i16 __builtin_msa_nlzc_h(v8i16);
+extern v4i32 __builtin_msa_nlzc_w(v4i32);
+extern v2i64 __builtin_msa_nlzc_d(v2i64);
+extern int __builtin_msa_copy_s_b(v16i8, unsigned char);
+extern int __builtin_msa_copy_s_h(v8i16, unsigned char);
+extern int __builtin_msa_copy_s_w(v4i32, unsigned char);
+extern long long __builtin_msa_copy_s_d(v2i64, unsigned char);
+extern int __builtin_msa_copy_u_b(v16i8, unsigned char);
+extern int __builtin_msa_copy_u_h(v8i16, unsigned char);
+extern int __builtin_msa_copy_u_w(v4i32, unsigned char);
+extern long long __builtin_msa_copy_u_d(v2i64, unsigned char);
+extern v16i8 __builtin_msa_insert_b(v16i8, unsigned char, int);
+extern v8i16 __builtin_msa_insert_h(v8i16, unsigned char, int);
+extern v4i32 __builtin_msa_insert_w(v4i32, unsigned char, int);
+extern v2i64 __builtin_msa_insert_d(v2i64, unsigned char, long long);
+extern v16i8 __builtin_msa_insve_b(v16i8, unsigned char, v16i8);
+extern v8i16 __builtin_msa_insve_h(v8i16, unsigned char, v8i16);
+extern v4i32 __builtin_msa_insve_w(v4i32, unsigned char, v4i32);
+extern v2i64 __builtin_msa_insve_d(v2i64, unsigned char, v2i64);
+extern int __builtin_msa_bnz_b(v16u8);
+extern int __builtin_msa_bnz_h(v8u16);
+extern int __builtin_msa_bnz_w(v4u32);
+extern int __builtin_msa_bnz_d(v2u64);
+extern int __builtin_msa_bz_b(v16u8);
+extern int __builtin_msa_bz_h(v8u16);
+extern int __builtin_msa_bz_w(v4u32);
+extern int __builtin_msa_bz_d(v2u64);
+extern v16i8 __builtin_msa_ldi_b(short);
+extern v8i16 __builtin_msa_ldi_h(short);
+extern v4i32 __builtin_msa_ldi_w(short);
+extern v2i64 __builtin_msa_ldi_d(short);
+extern v4i32 __builtin_msa_fcaf_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcaf_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcor_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcor_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcun_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcun_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcune_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcune_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcueq_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcueq_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fceq_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fceq_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcne_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcne_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fclt_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fclt_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcult_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcult_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcle_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcle_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fcule_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fcule_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsaf_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsaf_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsor_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsor_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsun_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsun_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsune_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsune_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsueq_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsueq_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fseq_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fseq_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsne_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsne_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fslt_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fslt_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsult_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsult_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsle_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsle_d(v2f64, v2f64);
+extern v4i32 __builtin_msa_fsule_w(v4f32, v4f32);
+extern v2i64 __builtin_msa_fsule_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fadd_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fadd_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fsub_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fsub_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmul_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fmul_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fdiv_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fdiv_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmadd_w(v4f32, v4f32, v4f32);
+extern v2f64 __builtin_msa_fmadd_d(v2f64, v2f64, v2f64);
+extern v4f32 __builtin_msa_fmsub_w(v4f32, v4f32, v4f32);
+extern v2f64 __builtin_msa_fmsub_d(v2f64, v2f64, v2f64);
+extern v4f32 __builtin_msa_fexp2_w(v4f32, v4i32);
+extern v2f64 __builtin_msa_fexp2_d(v2f64, v2i64);
+extern v8i16 __builtin_msa_fexdo_h(v4f32, v4f32);
+extern v4f32 __builtin_msa_fexdo_w(v2f64, v2f64);
+extern v8i16 __builtin_msa_ftq_h(v4f32, v4f32);
+extern v4i32 __builtin_msa_ftq_w(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmin_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fmin_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmin_a_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fmin_a_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmax_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fmax_d(v2f64, v2f64);
+extern v4f32 __builtin_msa_fmax_a_w(v4f32, v4f32);
+extern v2f64 __builtin_msa_fmax_a_d(v2f64, v2f64);
+extern v8i16 __builtin_msa_mul_q_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_mul_q_w(v4i32, v4i32);
+extern v8i16 __builtin_msa_mulr_q_h(v8i16, v8i16);
+extern v4i32 __builtin_msa_mulr_q_w(v4i32, v4i32);
+extern v8i16 __builtin_msa_madd_q_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_madd_q_w(v4i32, v4i32, v4i32);
+extern v8i16 __builtin_msa_maddr_q_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_maddr_q_w(v4i32, v4i32, v4i32);
+extern v8i16 __builtin_msa_msub_q_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_msub_q_w(v4i32, v4i32, v4i32);
+extern v8i16 __builtin_msa_msubr_q_h(v8i16, v8i16, v8i16);
+extern v4i32 __builtin_msa_msubr_q_w(v4i32, v4i32, v4i32);
+extern v4i32 __builtin_msa_fclass_w(v4f32);
+extern v2i64 __builtin_msa_fclass_d(v2f64);
+extern v4f32 __builtin_msa_fsqrt_w(v4f32);
+extern v2f64 __builtin_msa_fsqrt_d(v2f64);
+extern v4f32 __builtin_msa_frcp_w(v4f32);
+extern v2f64 __builtin_msa_frcp_d(v2f64);
+extern v4f32 __builtin_msa_frint_w(v4f32);
+extern v2f64 __builtin_msa_frint_d(v2f64);
+extern v4f32 __builtin_msa_frsqrt_w(v4f32);
+extern v2f64 __builtin_msa_frsqrt_d(v2f64);
+extern v4f32 __builtin_msa_flog2_w(v4f32);
+extern v2f64 __builtin_msa_flog2_d(v2f64);
+extern v4f32 __builtin_msa_fexupl_w(v8i16);
+extern v2f64 __builtin_msa_fexupl_d(v4f32);
+extern v4f32 __builtin_msa_fexupr_w(v8i16);
+extern v2f64 __builtin_msa_fexupr_d(v4f32);
+extern v4f32 __builtin_msa_ffql_w(v8i16);
+extern v2f64 __builtin_msa_ffql_d(v4i32);
+extern v4f32 __builtin_msa_ffqr_w(v8i16);
+extern v2f64 __builtin_msa_ffqr_d(v4i32);
+extern v4i32 __builtin_msa_ftint_s_w(v4f32);
+extern v2i64 __builtin_msa_ftint_s_d(v2f64);
+extern v4u32 __builtin_msa_ftint_u_w(v4f32);
+extern v2u64 __builtin_msa_ftint_u_d(v2f64);
+extern v4i32 __builtin_msa_ftrunc_s_w(v4f32);
+extern v2i64 __builtin_msa_ftrunc_s_d(v2f64);
+extern v4u32 __builtin_msa_ftrunc_u_w(v4f32);
+extern v2u64 __builtin_msa_ftrunc_u_d(v2f64);
+extern v4f32 __builtin_msa_ffint_s_w(v4i32);
+extern v2f64 __builtin_msa_ffint_s_d(v2i64);
+extern v4f32 __builtin_msa_ffint_u_w(v4u32);
+extern v2f64 __builtin_msa_ffint_u_d(v2u64);
+extern int __builtin_msa_cfcmsa(unsigned char);
+extern v16i8 __builtin_msa_move_v(v16i8);
+extern v4f32 __builtin_msa_cast_to_vector_float(float);
+extern v2f64 __builtin_msa_cast_to_vector_double(double);
+extern float __builtin_msa_cast_to_scalar_float(v4f32);
+extern double __builtin_msa_cast_to_scalar_double(v2f64);
+#endif /* __clang__ */
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
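For reference, a minimal usage sketch of the intrinsics declared in msa.h above (illustrative only, not part of the patch; it assumes a MIPS target compiled with -mmsa, and the function name add4 is hypothetical):

    #include <msa.h>

    /* Add four pairs of 32-bit integers: ld.w / addv.w / st.w each map to a
       single MSA instruction; copy_s.w moves lane 0 into a general register.  */
    int add4 (int *a, int *b, int *out)
    {
      v4i32 x = __msa_ld_w (a, 0);         /* __builtin_msa_ld_w (void *, int)        */
      v4i32 y = __msa_ld_w (b, 0);
      v4i32 sum = __msa_addv_w (x, y);     /* element-wise 32-bit add                 */
      __msa_st_w (sum, (char *) out, 0);   /* __builtin_msa_st_w (v4i32, char *, int) */
      return __msa_copy_s_w (sum, 0);      /* scalar copy of element 0                */
    }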
diff --git a/gcc/config/mips/mti-elf.h b/gcc/config/mips/mti-elf.h
index 76d289eaeeb..b1aede21785 100644
--- a/gcc/config/mips/mti-elf.h
+++ b/gcc/config/mips/mti-elf.h
@@ -34,6 +34,11 @@ along with GCC; see the file COPYING3. If not see
or -mgp setting. */ \
"%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=n32}}", \
\
+ /* If no FP ABI option is specified, infer one from the \
+ ABI/ISA level. */ \
+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
+ \
/* Make sure that an endian option is always present. This makes \
things like LINK_SPEC easier to write. */ \
"%{!EB:%{!EL:%(endian_spec)}}", \
diff --git a/gcc/config/mips/mti-linux.h b/gcc/config/mips/mti-linux.h
index db9896b4047..101c82a5a8a 100644
--- a/gcc/config/mips/mti-linux.h
+++ b/gcc/config/mips/mti-linux.h
@@ -19,8 +19,17 @@ along with GCC; see the file COPYING3. If not see
/* This target is a multilib target, specify the sysroot paths. */
#undef SYSROOT_SUFFIX_SPEC
+#if MIPS_ISA_DEFAULT == 33 /* mips32r2 is the default */
#define SYSROOT_SUFFIX_SPEC \
- "%{mips32:/mips32}%{mips64:/mips64}%{mips64r2:/mips64r2}%{mips16:/mips16}%{mmicromips:/micromips}%{mabi=64:/64}%{mel|EL:/el}%{msoft-float:/sof}%{mfp64:/fp64}%{mnan=2008:/nan2008}"
+ "%{muclibc:/uclibc}%{mips32:/mips32}%{mips64:/mips64}%{mips64r2:/mips64r2}%{mips32r6:/mips32r6}%{mips64r6:/mips64r6}%{mips16:/mips16}%{mmicromips:/micromips}%{mabi=64:/64}%{mel|EL:/el}%{msoft-float:/sof}%{!mips32r6:%{!mips64r6:%{mnan=2008:/nan2008}}}"
+#elif MIPS_ISA_DEFAULT == 37 /* mips32r6 is the default */
+#define SYSROOT_SUFFIX_SPEC \
+ "%{muclibc:/uclibc}%{mips32:/mips32}%{mips64:/mips64}%{mips32r2:/mips32r2}%{mips64r2:/mips64r2}%{mips64r6:/mips64r6}%{mips16:/mips16}%{mmicromips:/micromips}%{mabi=64:/64}%{mel|EL:/el}%{msoft-float:/sof}%{!mips32r6:%{!mips64r6:%{mnan=2008:/nan2008}}}"
+#else /* Unexpected default ISA. */
+#error No SYSROOT_SUFFIX_SPEC exists for this default ISA
+#endif
+
+#define SYSROOT_HEADERS_SUFFIX_SPEC "%{muclibc:/uclibc}"
#undef DRIVER_SELF_SPECS
#define DRIVER_SELF_SPECS \
@@ -39,8 +48,13 @@ along with GCC; see the file COPYING3. If not see
or -mgp setting. */ \
"%{!mabi=*: %{" MIPS_32BIT_OPTION_SPEC ": -mabi=32;: -mabi=n32}}", \
\
+ /* If no FP ABI option is specified, infer one from the \
+ ABI/ISA level unless there is a conflicting option. */ \
+ "%{!msoft-float: %{!msingle-float: %{!mfp*: %{!mmsa: %{mabi=32: %{" \
+ MIPS_FPXX_OPTION_SPEC ": -mfpxx}}}}}}", \
+ \
/* Base SPECs. */ \
BASE_DRIVER_SELF_SPECS \
\
/* Use the standard linux specs for everything else. */ \
- LINUX64_DRIVER_SELF_SPECS
+ LINUX_DRIVER_SELF_SPECS
diff --git a/gcc/config/mips/netbsd.h b/gcc/config/mips/netbsd.h
index efa28869b1b..ed41e0f837f 100644
--- a/gcc/config/mips/netbsd.h
+++ b/gcc/config/mips/netbsd.h
@@ -84,21 +84,13 @@ along with GCC; see the file COPYING3. If not see
builtin_define ("__mips=3"); \
else if (ISA_MIPS4) \
builtin_define ("__mips=4"); \
- else if (ISA_MIPS32) \
- { \
- builtin_define ("__mips=32"); \
- builtin_define ("__mips_isa_rev=1"); \
- } \
- else if (ISA_MIPS32R2) \
- { \
- builtin_define ("__mips=32"); \
- builtin_define ("__mips_isa_rev=2"); \
- } \
- else if (ISA_MIPS64) \
- { \
- builtin_define ("__mips=64"); \
- builtin_define ("__mips_isa_rev=1"); \
- } \
+ else if (mips_isa >= 32 && mips_isa < 64) \
+ builtin_define ("__mips=32"); \
+ else if (mips_isa >= 64) \
+ builtin_define ("__mips=64"); \
+ if (mips_isa_rev > 0) \
+ builtin_define_with_int_value ("__mips_isa_rev", \
+ mips_isa_rev); \
\
if (TARGET_HARD_FLOAT) \
builtin_define ("__mips_hard_float"); \
@@ -141,7 +133,8 @@ along with GCC; see the file COPYING3. If not see
"%{EL:-m elf32lmip} \
%{EB:-m elf32bmip} \
%(endian_spec) \
- %{G*} %{mips1} %{mips2} %{mips3} %{mips4} %{mips32} %{mips32r2} %{mips64} \
+ %{G*} %{mips1} %{mips2} %{mips3} %{mips4} %{mips32} %{mips32r2} \
+ %{mips32r6} %{mips64} %{mips64r6} \
%(netbsd_link_spec)"
#define NETBSD_ENTRY_POINT "__start"
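The netbsd.h hunk above replaces the per-ISA builtin_define calls with values computed from mips_isa and mips_isa_rev. A minimal sketch of how source code typically consumes the resulting macros (illustrative only, not part of the patch; HAVE_MIPS_R2_BITOPS is a hypothetical name):

    /* __mips gives the architecture level (32 or 64 for the release ISAs);
       __mips_isa_rev gives the revision and is left undefined for ISAs
       prior to MIPS32/MIPS64, matching the mips_isa_rev > 0 test above.  */
    #if defined (__mips) && __mips >= 32 \
        && defined (__mips_isa_rev) && __mips_isa_rev >= 2
    # define HAVE_MIPS_R2_BITOPS 1   /* e.g. EXT/INS available from r2 onwards */
    #else
    # define HAVE_MIPS_R2_BITOPS 0
    #endif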
diff --git a/gcc/config/mips/octeon.md b/gcc/config/mips/octeon.md
index 1d6251c40a6..960894f0a29 100644
--- a/gcc/config/mips/octeon.md
+++ b/gcc/config/mips/octeon.md
@@ -22,41 +22,55 @@
;; Octeon is a dual-issue processor that can issue all instructions on
;; pipe0 and a subset on pipe1.
-(define_automaton "octeon_main, octeon_mult")
+(define_automaton "octeon_main, octeon_mult, octeon_fpu")
(define_cpu_unit "octeon_pipe0" "octeon_main")
(define_cpu_unit "octeon_pipe1" "octeon_main")
(define_cpu_unit "octeon_mult" "octeon_mult")
+(define_cpu_unit "octeon_fpu" "octeon_fpu")
(define_insn_reservation "octeon_arith" 1
- (and (eq_attr "cpu" "octeon,octeon2")
+ (and (eq_attr "cpu" "octeon,octeon2,octeon3")
(eq_attr "type" "arith,const,logical,move,shift,signext,slt,nop"))
"octeon_pipe0 | octeon_pipe1")
-(define_insn_reservation "octeon_condmove" 2
- (and (eq_attr "cpu" "octeon,octeon2")
+(define_insn_reservation "octeon_condmove_o1" 2
+ (and (eq_attr "cpu" "octeon")
(eq_attr "type" "condmove"))
"octeon_pipe0 | octeon_pipe1")
+(define_insn_reservation "octeon_condmove_o2" 3
+ (and (eq_attr "cpu" "octeon2,octeon3")
+ (eq_attr "type" "condmove")
+ (not (eq_attr "mode" "SF, DF")))
+ "octeon_pipe0 | octeon_pipe1")
+
+;; movt/movf can only issue in pipe1
+(define_insn_reservation "octeon_condmove_o3_int_on_cc" 3
+ (and (eq_attr "cpu" "octeon2,octeon3")
+ (eq_attr "type" "condmove")
+ (not (eq_attr "mode" "SF, DF")))
+ "octeon_pipe1")
+
(define_insn_reservation "octeon_load_o1" 2
(and (eq_attr "cpu" "octeon")
(eq_attr "type" "load,prefetch,mtc,mfc"))
"octeon_pipe0")
(define_insn_reservation "octeon_load_o2" 3
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "load,prefetch"))
"octeon_pipe0")
;; ??? memory-related cop0 reads are pipe0 with 3-cycle latency.
;; Front-end-related ones are 1-cycle on pipe1. Assume front-end for now.
(define_insn_reservation "octeon_cop_o2" 1
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "mtc,mfc"))
"octeon_pipe1")
(define_insn_reservation "octeon_store" 1
- (and (eq_attr "cpu" "octeon,octeon2")
+ (and (eq_attr "cpu" "octeon,octeon2,octeon3")
(eq_attr "type" "store"))
"octeon_pipe0")
@@ -66,7 +80,7 @@
"octeon_pipe0")
(define_insn_reservation "octeon_brj_o2" 2
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "branch,jump,call,trap"))
"octeon_pipe1")
@@ -76,7 +90,7 @@
"(octeon_pipe0 | octeon_pipe1) + octeon_mult")
(define_insn_reservation "octeon_imul3_o2" 6
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "imul3,pop,clz"))
"octeon_pipe1 + octeon_mult")
@@ -86,7 +100,7 @@
"(octeon_pipe0 | octeon_pipe1) + octeon_mult, octeon_mult")
(define_insn_reservation "octeon_imul_o2" 1
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "imul,mthi,mtlo"))
"octeon_pipe1 + octeon_mult")
@@ -96,7 +110,7 @@
"(octeon_pipe0 | octeon_pipe1) + octeon_mult")
(define_insn_reservation "octeon_mfhilo_o2" 6
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "mfhi,mflo"))
"octeon_pipe1 + octeon_mult")
@@ -106,7 +120,7 @@
"(octeon_pipe0 | octeon_pipe1) + octeon_mult, octeon_mult*3")
(define_insn_reservation "octeon_imadd_o2" 1
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "type" "imadd"))
"octeon_pipe1 + octeon_mult")
@@ -116,13 +130,13 @@
"(octeon_pipe0 | octeon_pipe1) + octeon_mult, octeon_mult*71")
(define_insn_reservation "octeon_idiv_o2_si" 18
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "mode" "SI")
(eq_attr "type" "idiv"))
"octeon_pipe1 + octeon_mult, octeon_mult*17")
(define_insn_reservation "octeon_idiv_o2_di" 35
- (and (eq_attr "cpu" "octeon2")
+ (and (eq_attr "cpu" "octeon2,octeon3")
(eq_attr "mode" "DI")
(eq_attr "type" "idiv"))
"octeon_pipe1 + octeon_mult, octeon_mult*34")
@@ -131,6 +145,95 @@
;; patterns.
(define_insn_reservation "octeon_unknown" 1
- (and (eq_attr "cpu" "octeon,octeon2")
+ (and (eq_attr "cpu" "octeon,octeon2,octeon3")
(eq_attr "type" "unknown,multi,atomic,syncloop"))
"octeon_pipe0 + octeon_pipe1")
+
+;; Octeon3 FPU
+
+(define_insn_reservation "octeon3_faddsubcvt" 4
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fadd, fcvt"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon3_fmul" 5
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fmul"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon3_fmadd" 9
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fmadd"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu")
+
+(define_insn_reservation "octeon3_div_sf" 12
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fdiv, frdiv")
+ (eq_attr "mode" "SF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*8")
+
+(define_insn_reservation "octeon3_div_df" 22
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fdiv, frdiv")
+ (eq_attr "mode" "SF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*18")
+
+(define_insn_reservation "octeon3_sqrt_sf" 16
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fsqrt")
+ (eq_attr "mode" "SF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*12")
+
+(define_insn_reservation "octeon3_sqrt_df" 30
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fsqrt")
+ (eq_attr "mode" "DF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*26")
+
+(define_insn_reservation "octeon3_rsqrt_sf" 27
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "frsqrt")
+ (eq_attr "mode" "SF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*23")
+
+(define_insn_reservation "octeon3_rsqrt_df" 51
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "frsqrt")
+ (eq_attr "mode" "DF"))
+ "octeon_pipe1 + octeon_fpu, octeon_fpu*47")
+
+(define_insn_reservation "octeon3_fabsnegmov" 2
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fabs, fneg, fmove"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon_fcond" 1
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fcmp"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon_fcondmov" 2
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "condmove")
+ (eq_attr "mode" "SF,DF"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon_fpmtc1" 2
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "mtc"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon_fpmfc1" 6
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "mtc"))
+ "octeon_pipe1 + octeon_fpu")
+
+(define_insn_reservation "octeon_fpload" 3
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fpload,fpidxload"))
+ "octeon_pipe0 + octeon_fpu")
+
+(define_insn_reservation "octeon_fpstore" 3
+ (and (eq_attr "cpu" "octeon3")
+ (eq_attr "type" "fpstore,fpidxstore"))
+ "octeon_pipe0 + octeon_pipe1")
diff --git a/gcc/config/mips/p5600.md b/gcc/config/mips/p5600.md
new file mode 100644
index 00000000000..d672dc40113
--- /dev/null
+++ b/gcc/config/mips/p5600.md
@@ -0,0 +1,351 @@
+;; DFA-based pipeline description for P5600.
+;;
+;; Copyright (C) 2007-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published
+;; by the Free Software Foundation; either version 3, or (at your
+;; option) any later version.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "p5600_agen_alq_pipe, p5600_fpu_pipe")
+
+;; The address generation queue (AGQ) has AL2, CTISTD and LDSTA pipes
+(define_cpu_unit "p5600_agq, p5600_al2, p5600_ctistd, p5600_ldsta,
+ p5600_gpdiv" "p5600_agen_alq_pipe")
+
+;; The arithmetic-logic-unit queue (ALQ) has ALU pipe
+(define_cpu_unit "p5600_alq, p5600_alu" "p5600_agen_alq_pipe")
+
+;; The floating-point-unit queue (FPQ) has short and long pipes
+(define_cpu_unit "p5600_fpu_short, p5600_fpu_long" "p5600_fpu_pipe")
+
+;; Short FPU pipeline.
+(define_cpu_unit "p5600_fpu_intadd, p5600_fpu_cmp, p5600_fpu_float,
+ p5600_fpu_logic_a, p5600_fpu_logic_b, p5600_fpu_div,
+ p5600_fpu_store" "p5600_fpu_pipe")
+
+;; Long FPU pipeline.
+(define_cpu_unit "p5600_fpu_logic, p5600_fpu_float_a, p5600_fpu_float_b,
+ p5600_fpu_float_c, p5600_fpu_float_d" "p5600_fpu_pipe")
+(define_cpu_unit "p5600_fpu_mult, p5600_fpu_fdiv, p5600_fpu_load,
+ p5600_fpu_apu" "p5600_fpu_pipe")
+
+(define_reservation "p5600_agq_al2" "p5600_agq, p5600_al2")
+(define_reservation "p5600_agq_ctistd" "p5600_agq, p5600_ctistd")
+(define_reservation "p5600_agq_ldsta" "p5600_agq, p5600_ldsta")
+(define_reservation "p5600_alq_alu" "p5600_alq, p5600_alu")
+
+;;
+;; FPU-MSA pipe
+;;
+
+;; Arithmetic
+;; add, hadd, sub, hsub, average, min, max, compare
+(define_insn_reservation "msa_short_int_add" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_int_add"))
+ "p5600_fpu_short, p5600_fpu_intadd")
+
+;; Bitwise Instructions
+;; and, or, xor, bit-clear, leading-bits-count, shift, shuffle
+(define_insn_reservation "msa_short_logic" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_logic"))
+ "p5600_fpu_short, p5600_fpu_logic_a")
+
+;; move.v
+(define_insn_reservation "msa_short_logic_move_v" 2
+ (and (eq_attr "cpu" "p5600")
+ (and (eq_attr "type" "fmove")
+ (eq_attr "mode" "TI")))
+ "p5600_fpu_short, p5600_fpu_logic_a")
+
+;; Float compare
+(define_insn_reservation "msa_short_cmp" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_cmp"))
+ "p5600_fpu_short, p5600_fpu_cmp")
+
+;; Float exp2, min, max
+(define_insn_reservation "msa_short_float2" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_float2"))
+ "p5600_fpu_short, p5600_fpu_float")
+
+;; Vector sat
+(define_insn_reservation "msa_short_logic3" 3
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_logic3"))
+ "p5600_fpu_short, p5600_fpu_logic_a, p5600_fpu_logic_b")
+
+;; Vector copy, bz, bnz
+(define_insn_reservation "msa_short_store4" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_store4"))
+ "p5600_fpu_short, p5600_fpu_store")
+
+;; Vector load
+(define_insn_reservation "msa_long_load" 10
+ (and (eq_attr "cpu" "p5600")
+ (and (eq_attr "type" "fpload")
+ (eq_attr "mode" "TI")))
+ "p5600_fpu_long, p5600_fpu_load")
+
+;; Vector store
+(define_insn_reservation "msa_short_store" 2
+ (and (eq_attr "cpu" "p5600")
+ (and (eq_attr "type" "fpstore")
+ (eq_attr "mode" "TI")))
+ "p5600_fpu_short, p5600_fpu_store")
+
+;; binsl, binsr, insert, vshf, sld
+(define_insn_reservation "msa_long_logic" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_logic_l"))
+ "p5600_fpu_long, p5600_fpu_logic")
+
+;; Float fclass, flog2
+(define_insn_reservation "msa_long_float2" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_float2_l"))
+ "p5600_fpu_long, p5600_fpu_float_a")
+
+;; fadd, fsub
+(define_insn_reservation "msa_long_float4" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_float4"))
+ "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b")
+
+;; fmul
+(define_insn_reservation "msa_long_float5" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_float5"))
+ "p5600_fpu_long, p5600_fpu_float_a, p5600_fpu_float_b, p5600_fpu_float_c")
+
+;; fmadd, fmsub
+(define_insn_reservation "msa_long_float8" 8
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_float8"))
+ "p5600_fpu_long, p5600_fpu_float_a,
+ p5600_fpu_float_b, p5600_fpu_float_c, p5600_fpu_float_d")
+
+;; Vector mul, dotp, madd, msub
+(define_insn_reservation "msa_long_mult" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_mult"))
+ "p5600_fpu_long, p5600_fpu_mult")
+
+;; fdiv, fmod (semi-pipelined)
+(define_insn_reservation "msa_long_fdiv" 10
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_fdiv"))
+ "p5600_fpu_long, nothing, nothing, p5600_fpu_fdiv*8")
+
+;; div, mod (non-pipelined)
+(define_insn_reservation "msa_long_div" 10
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "msa_execunit" "msa_eu_div"))
+ "p5600_fpu_long, p5600_fpu_div*9, p5600_fpu_div + p5600_fpu_logic_a")
+
+;;
+;; FPU pipe
+;;
+
+;; fadd, fsub
+(define_insn_reservation "p5600_fpu_fadd" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fadd,fabs,fneg"))
+ "p5600_fpu_long, p5600_fpu_apu")
+
+;; fabs, fneg, fcmp
+(define_insn_reservation "p5600_fpu_fabs" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fabs,fneg,fcmp,fmove"))
+ "p5600_fpu_short, p5600_fpu_apu")
+
+;; fload
+(define_insn_reservation "p5600_fpu_fload" 8
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fpload,fpidxload"))
+ "p5600_fpu_long, p5600_fpu_apu")
+
+;; fstore
+(define_insn_reservation "p5600_fpu_fstore" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fpstore,fpidxstore"))
+ "p5600_fpu_short, p5600_fpu_apu")
+
+;; fmadd
+(define_insn_reservation "p5600_fpu_fmadd" 9
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fmadd"))
+ "p5600_fpu_long, p5600_fpu_apu")
+
+;; fmul
+(define_insn_reservation "p5600_fpu_fmul" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fmul"))
+ "p5600_fpu_long, p5600_fpu_apu")
+
+;; fdiv, fsqrt
+(define_insn_reservation "p5600_fpu_div" 17
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fdiv,frdiv,fsqrt,frsqrt"))
+ "p5600_fpu_long, p5600_fpu_apu*17")
+
+;; fcvt
+(define_insn_reservation "p5600_fpu_fcvt" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "fcvt"))
+ "p5600_fpu_long, p5600_fpu_apu")
+
+;; mtc
+(define_insn_reservation "p5600_fpu_fmtc" 7
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "mtc"))
+ "p5600_fpu_short, p5600_fpu_store")
+
+;; mfc
+(define_insn_reservation "p5600_fpu_fmfc" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "mfc"))
+ "p5600_fpu_short, p5600_fpu_store")
+
+;; madd/msub feeding into the add source
+;; madd.fmt dst, x, y, z -> madd.fmt a, dst, b, c 5 cycles
+(define_bypass 5 "p5600_fpu_fmadd" "p5600_fpu_fmadd" "mips_fmadd_bypass")
+
+;;
+;; Integer pipe
+;;
+
+;; and
+(define_insn_reservation "p5600_int_and" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "move_type" "logical"))
+ "p5600_alq_alu")
+
+;; lui
+(define_insn_reservation "p5600_int_lui" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "move_type" "const"))
+ "p5600_alq_alu")
+
+;; Load lb, lbu, lh, lhu, lq, lw, lw_i2f, lwxs
+(define_insn_reservation "p5600_int_load" 4
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "move_type" "load"))
+ "p5600_agq_ldsta")
+
+;; store
+(define_insn_reservation "p5600_int_store" 3
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "move_type" "store"))
+ "p5600_agq_ldsta")
+
+;; andi, sll, srl, seb, seh
+(define_insn_reservation "p5600_int_arith_1" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "move_type" "andi,sll0,signext"))
+ "p5600_alq_alu | p5600_agq_al2")
+
+;; addi, addiu, ori, xori, add, addu
+(define_insn_reservation "p5600_int_arith_2" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "alu_type" "add,or,xor"))
+ "p5600_alq_alu | p5600_agq_al2")
+
+;; nor, sub
+(define_insn_reservation "p5600_int_arith_3" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "alu_type" "nor,sub"))
+ "p5600_alq_alu")
+
+;; srl, sra, rotr, slt, sllv, srlv
+(define_insn_reservation "p5600_int_arith_4" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "shift,slt,move"))
+ "p5600_alq_alu | p5600_agq_al2")
+
+;; nop
+(define_insn_reservation "p5600_int_nop" 0
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "nop"))
+ "p5600_agq_al2")
+
+;; clo, clz
+(define_insn_reservation "p5600_int_countbits" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "clz"))
+ "p5600_agq_al2")
+
+;; Conditional moves
+(define_insn_reservation "p5600_int_condmove" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "condmove"))
+ "p5600_agq_al2")
+
+;; madd, msub
+(define_insn_reservation "p5600_dsp_mac" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "imadd"))
+ "p5600_agq_al2")
+
+;; mfhi/lo
+(define_insn_reservation "p5600_dsp_mfhilo" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "mfhi,mflo"))
+ "p5600_agq_al2")
+
+;; mthi/lo
+(define_insn_reservation "p5600_dsp_mthilo" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "mthi,mtlo"))
+ "p5600_agq_al2")
+
+;; mult, multu, mul
+(define_insn_reservation "p5600_dsp_mult" 5
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "imul3,imul"))
+ "p5600_agq_al2")
+
+;; branch and jump
+(define_insn_reservation "p5600_int_branch" 1
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "branch,jump"))
+ "p5600_agq_ctistd")
+
+;; prefetch
+(define_insn_reservation "p5600_int_prefetch" 3
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "prefetch,prefetchx"))
+ "p5600_agq_ldsta")
+
+;; divide
+(define_insn_reservation "p5600_int_div" 8
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "idiv"))
+ "p5600_agq_al2+p5600_gpdiv*8")
+
+;; arith
+(define_insn_reservation "p5600_int_arith_5" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "type" "arith"))
+ "p5600_agq_al2")
+
+;; call
+(define_insn_reservation "p5600_int_call" 2
+ (and (eq_attr "cpu" "p5600")
+ (eq_attr "jal" "indirect,direct"))
+ "p5600_agq_ctistd")
diff --git a/gcc/config/mips/predicates.md b/gcc/config/mips/predicates.md
index 8ac8e0b6a5b..3a08e59cfe5 100644
--- a/gcc/config/mips/predicates.md
+++ b/gcc/config/mips/predicates.md
@@ -33,10 +33,38 @@
(ior (match_operand 0 "const_arith_operand")
(match_operand 0 "register_operand")))
+(define_predicate "const_immlsa_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (exact_log2 (INTVAL (op)), 1, 4)")))
+
+(define_predicate "const_msa_branch_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), -1024, 1023)")))
+
+(define_predicate "const_uimm3_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
+
+(define_predicate "const_uimm4_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
+
+(define_predicate "const_uimm5_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
(define_predicate "const_uimm6_operand"
(and (match_code "const_int")
(match_test "UIMM6_OPERAND (INTVAL (op))")))
+(define_predicate "const_uimm8_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 255)")))
+
+(define_predicate "const_imm5_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), -16, 15)")))
+
(define_predicate "const_imm10_operand"
(and (match_code "const_int")
(match_test "IMM10_OPERAND (INTVAL (op))")))
@@ -45,6 +73,22 @@
(ior (match_operand 0 "const_imm10_operand")
(match_operand 0 "register_operand")))
+(define_predicate "aq10b_operand"
+ (and (match_code "const_int")
+ (match_test "mips_signed_immediate_p (INTVAL (op), 10, 0)")))
+
+(define_predicate "aq10h_operand"
+ (and (match_code "const_int")
+ (match_test "mips_signed_immediate_p (INTVAL (op), 10, 1)")))
+
+(define_predicate "aq10w_operand"
+ (and (match_code "const_int")
+ (match_test "mips_signed_immediate_p (INTVAL (op), 10, 2)")))
+
+(define_predicate "aq10d_operand"
+ (and (match_code "const_int")
+ (match_test "mips_signed_immediate_p (INTVAL (op), 10, 3)")))
+
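The four aq10{b,h,w,d} predicates above accept a byte offset that is a multiple of the access size (1, 2, 4 or 8 bytes) and whose scaled value fits a signed 10-bit field, which is what the mips_signed_immediate_p (INTVAL (op), 10, shift) calls appear to test; these presumably back the scaled-offset addressing of the MSA load/store patterns. A minimal standalone sketch of that check, using a hypothetical helper name rather than GCC's own implementation:

    #include <stdbool.h>

    /* Illustrative sketch only: true when VAL is a multiple of 1 << SHIFT
       and VAL >> SHIFT fits in a signed 10-bit field, i.e. [-512, 511].  */
    static bool
    fits_scaled_simm10 (long val, int shift)
    {
      if (val & ((1L << shift) - 1))     /* aligned to the access size?  */
        return false;
      long scaled = val >> shift;
      return scaled >= -512 && scaled <= 511;
    }

Under this reading, aq10w_operand (shift 2) accepts word offsets from -2048 to 2044 in steps of 4.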
(define_predicate "sle_operand"
(and (match_code "const_int")
(match_test "SMALL_OPERAND (INTVAL (op) + 1)")))
@@ -57,11 +101,29 @@
(and (match_code "const_int,const_double,const_vector")
(match_test "op == CONST0_RTX (GET_MODE (op))")))
+(define_predicate "const_yi_operand"
+ (and (match_code "const_vector")
+ (match_test "mips_const_vector_same_int_p (op, mode, -1024, 1023)")))
+
+(define_predicate "const_m1_operand"
+ (and (match_code "const_int,const_double,const_vector")
+ (match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+
+(define_predicate "reg_or_m1_operand"
+ (ior (match_operand 0 "const_m1_operand")
+ (match_operand 0 "register_operand")))
+
(define_predicate "reg_or_0_operand"
(ior (and (match_operand 0 "const_0_operand")
(not (match_test "TARGET_MIPS16")))
(match_operand 0 "register_operand")))
+(define_predicate "reg_or_0yi_operand"
+ (ior (and (ior (match_operand 0 "const_0_operand")
+ (match_operand 0 "const_yi_operand"))
+ (match_test "TARGET_MSA"))
+ (match_operand 0 "register_operand")))
+
(define_predicate "const_1_operand"
(and (match_code "const_int,const_double,const_vector")
(match_test "op == CONST1_RTX (GET_MODE (op))")))
@@ -70,6 +132,23 @@
(ior (match_operand 0 "const_1_operand")
(match_operand 0 "register_operand")))
+;; These are used in vec_merge, hence accept bitmask as const_int.
+(define_predicate "const_exp_2_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (exact_log2 (INTVAL (op)), 0, 1)")))
+
+(define_predicate "const_exp_4_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (exact_log2 (INTVAL (op)), 0, 3)")))
+
+(define_predicate "const_exp_8_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (exact_log2 (INTVAL (op)), 0, 7)")))
+
+(define_predicate "const_exp_16_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (exact_log2 (INTVAL (op)), 0, 15)")))
+
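Put differently, const_exp_N_operand accepts a const_int with exactly one bit set among the low N bits, so it can act as a vec_merge mask selecting a single lane. A minimal sketch of the equivalent test (hypothetical helper, not GCC code):

    #include <stdbool.h>

    /* Illustrative sketch only: true when VAL is a power of two whose set
       bit index is below NLANES -- a single-lane vec_merge bitmask.  */
    static bool
    single_lane_mask_p (unsigned long val, int nlanes)
    {
      return val != 0
             && (val & (val - 1)) == 0      /* exactly one bit set */
             && val < (1UL << nlanes);      /* bit index < nlanes */
    }

For example, single_lane_mask_p (v, 4) accepts 1, 2, 4 and 8, matching const_exp_4_operand.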
;; This is used for indexing into vectors, and hence only accepts const_int.
(define_predicate "const_0_or_1_operand"
(and (match_code "const_int")
@@ -471,7 +550,18 @@
(match_code "eq,ne,lt,ltu,ge,geu"))
(define_predicate "order_operator"
- (match_code "lt,ltu,le,leu,ge,geu,gt,gtu"))
+ (match_code "lt,ltu,le,leu,ge,geu,gt,gtu")
+{
+ if (XEXP (op, 1) == const0_rtx)
+ return true;
+
+ if (TARGET_CB_MAYBE
+ && (GET_CODE (op) == LT || GET_CODE (op) == LTU
+ || GET_CODE (op) == GE || GET_CODE (op) == GEU))
+ return true;
+
+ return false;
+})
;; For NE, cstore uses sltu instructions in which the first operand is $0.
;; This isn't possible in mips16 code.
@@ -492,3 +582,197 @@
(define_predicate "non_volatile_mem_operand"
(and (match_operand 0 "memory_operand")
(not (match_test "MEM_VOLATILE_P (op)"))))
+
+(define_predicate "const_vector_same_uimm3_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 7);
+})
+
+(define_predicate "const_vector_same_v16qi_set_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_set_p (op, mode);
+})
+
+(define_predicate "const_vector_same_v16qi_clr_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_clr_p (op, mode);
+})
+
+(define_predicate "const_vector_same_cmpsimm4_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, -16, 15);
+})
+
+(define_predicate "const_vector_same_cmpuimm4_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 31);
+})
+
+(define_predicate "const_vector_same_simm10_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, -1024, 1023);
+})
+
+(define_predicate "const_vector_same_uimm4_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 15);
+})
+
+(define_predicate "const_vector_same_v8hi_set_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_set_p (op, mode);
+})
+
+(define_predicate "const_vector_same_v8hi_clr_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_clr_p (op, mode);
+})
+
+(define_predicate "const_vector_same_v4si_set_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_set_p (op, mode);
+})
+
+(define_predicate "const_vector_same_v4si_clr_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_clr_p (op, mode);
+})
+
+(define_predicate "const_vector_same_uimm6_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 63);
+})
+
+(define_predicate "const_vector_same_v2di_set_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_set_p (op, mode);
+})
+
+(define_predicate "const_vector_same_v2di_clr_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_bitimm_clr_p (op, mode);
+})
+
+(define_predicate "const_vector_same_ximm5_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, -31, 31);
+})
+
+(define_predicate "const_vector_same_simm5_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, -32, 0);
+})
+
+(define_predicate "const_vector_same_uimm5_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 31);
+})
+
+(define_predicate "const_vector_same_uimm8_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_int_p (op, mode, 0, 255);
+})
+
+(define_predicate "const_vector_same_byte_operand"
+ (match_code "const_vector")
+{
+ return mips_const_vector_same_byte_p (op, mode);
+})
+
+(define_predicate "reg_or_vector_same_ximm5_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_ximm5_operand")))
+
+(define_predicate "reg_or_vector_same_simm5_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_simm5_operand")))
+
+(define_predicate "reg_or_vector_same_uimm5_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm5_operand")))
+
+(define_predicate "reg_or_vector_same_uimm3_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm3_operand")))
+
+(define_predicate "reg_or_vector_same_v16qi_set_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v16qi_set_operand")))
+
+(define_predicate "reg_or_vector_same_v16qi_clr_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v16qi_clr_operand")))
+
+(define_predicate "reg_or_vector_same_uimm4_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm4_operand")))
+
+(define_predicate "reg_or_vector_same_v8hi_set_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v8hi_set_operand")))
+
+(define_predicate "reg_or_vector_same_v8hi_clr_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v8hi_clr_operand")))
+
+(define_predicate "reg_or_vector_same_v4si_set_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v4si_set_operand")))
+
+(define_predicate "reg_or_vector_same_v4si_clr_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v4si_clr_operand")))
+
+(define_predicate "reg_or_vector_same_uimm6_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm6_operand")))
+
+(define_predicate "reg_or_vector_same_v2di_set_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v2di_set_operand")))
+
+(define_predicate "reg_or_vector_same_v2di_clr_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_v2di_clr_operand")))
+
+(define_predicate "reg_or_vector_same_uimm8_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm8_operand")))
+
+(define_predicate "reg_or_vector_same_byte_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_byte_operand")))
+
+(define_predicate "reg_or_vector_same_bitumm3_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm3_operand")))
+
+(define_predicate "reg_or_vector_same_bituimm4_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm4_operand")))
+
+(define_predicate "reg_or_vector_same_bituimm5_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm5_operand")))
+
+(define_predicate "reg_or_vector_same_bituimm6_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "const_vector_same_uimm6_operand")))
diff --git a/gcc/config/mips/sync.md b/gcc/config/mips/sync.md
index cf6c05be27c..72d2fe49af5 100644
--- a/gcc/config/mips/sync.md
+++ b/gcc/config/mips/sync.md
@@ -59,7 +59,7 @@
;; Can be removed in favor of atomic_compare_and_swap below.
(define_insn "sync_compare_and_swap<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "dJ,dJ")
(match_operand:GPR 3 "arith_operand" "I,d")]
@@ -89,7 +89,7 @@
;; Helper insn for mips_expand_atomic_qihi.
(define_insn "compare_and_swap_12"
[(set (match_operand:SI 0 "register_operand" "=&d,&d")
- (match_operand:SI 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:SI 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:SI [(match_operand:SI 2 "register_operand" "d,d")
(match_operand:SI 3 "register_operand" "d,d")
@@ -106,7 +106,7 @@
(set_attr "sync_insn1_op2" "5")])
(define_insn "sync_add<mode>"
- [(set (match_operand:GPR 0 "memory_operand" "+ZR,ZR")
+ [(set (match_operand:GPR 0 "memory_operand" "+ZC,ZC")
(unspec_volatile:GPR
[(plus:GPR (match_dup 0)
(match_operand:GPR 1 "arith_operand" "I,d"))]
@@ -134,7 +134,7 @@
;; Helper insn for sync_<optab><mode>
(define_insn "sync_<optab>_12"
- [(set (match_operand:SI 0 "memory_operand" "+ZR")
+ [(set (match_operand:SI 0 "memory_operand" "+ZC")
(unspec_volatile:SI
[(match_operand:SI 1 "register_operand" "d")
(match_operand:SI 2 "register_operand" "d")
@@ -174,7 +174,7 @@
;; Helper insn for sync_old_<optab><mode>
(define_insn "sync_old_<optab>_12"
[(set (match_operand:SI 0 "register_operand" "=&d")
- (match_operand:SI 1 "memory_operand" "+ZR"))
+ (match_operand:SI 1 "memory_operand" "+ZC"))
(set (match_dup 1)
(unspec_volatile:SI
[(match_operand:SI 2 "register_operand" "d")
@@ -217,7 +217,7 @@
(define_insn "sync_new_<optab>_12"
[(set (match_operand:SI 0 "register_operand" "=&d")
(unspec_volatile:SI
- [(match_operand:SI 1 "memory_operand" "+ZR")
+ [(match_operand:SI 1 "memory_operand" "+ZC")
(match_operand:SI 2 "register_operand" "d")
(match_operand:SI 3 "register_operand" "d")
(atomic_hiqi_op:SI (match_dup 0)
@@ -257,7 +257,7 @@
;; Helper insn for sync_nand<mode>
(define_insn "sync_nand_12"
- [(set (match_operand:SI 0 "memory_operand" "+ZR")
+ [(set (match_operand:SI 0 "memory_operand" "+ZC")
(unspec_volatile:SI
[(match_operand:SI 1 "register_operand" "d")
(match_operand:SI 2 "register_operand" "d")
@@ -296,7 +296,7 @@
;; Helper insn for sync_old_nand<mode>
(define_insn "sync_old_nand_12"
[(set (match_operand:SI 0 "register_operand" "=&d")
- (match_operand:SI 1 "memory_operand" "+ZR"))
+ (match_operand:SI 1 "memory_operand" "+ZC"))
(set (match_dup 1)
(unspec_volatile:SI
[(match_operand:SI 2 "register_operand" "d")
@@ -337,7 +337,7 @@
(define_insn "sync_new_nand_12"
[(set (match_operand:SI 0 "register_operand" "=&d")
(unspec_volatile:SI
- [(match_operand:SI 1 "memory_operand" "+ZR")
+ [(match_operand:SI 1 "memory_operand" "+ZC")
(match_operand:SI 2 "register_operand" "d")
(match_operand:SI 3 "register_operand" "d")
(match_operand:SI 4 "reg_or_0_operand" "dJ")]
@@ -360,7 +360,7 @@
(set_attr "sync_insn1_op2" "4")])
(define_insn "sync_sub<mode>"
- [(set (match_operand:GPR 0 "memory_operand" "+ZR")
+ [(set (match_operand:GPR 0 "memory_operand" "+ZC")
(unspec_volatile:GPR
[(minus:GPR (match_dup 0)
(match_operand:GPR 1 "register_operand" "d"))]
@@ -374,7 +374,7 @@
;; Can be removed in favor of atomic_fetch_add below.
(define_insn "sync_old_add<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR
[(plus:GPR (match_dup 1)
@@ -389,7 +389,7 @@
(define_insn "sync_old_sub<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d")
- (match_operand:GPR 1 "memory_operand" "+ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC"))
(set (match_dup 1)
(unspec_volatile:GPR
[(minus:GPR (match_dup 1)
@@ -404,7 +404,7 @@
(define_insn "sync_new_add<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (plus:GPR (match_operand:GPR 1 "memory_operand" "+ZR,ZR")
+ (plus:GPR (match_operand:GPR 1 "memory_operand" "+ZC,ZC")
(match_operand:GPR 2 "arith_operand" "I,d")))
(set (match_dup 1)
(unspec_volatile:GPR
@@ -420,7 +420,7 @@
(define_insn "sync_new_sub<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d")
- (minus:GPR (match_operand:GPR 1 "memory_operand" "+ZR")
+ (minus:GPR (match_operand:GPR 1 "memory_operand" "+ZC")
(match_operand:GPR 2 "register_operand" "d")))
(set (match_dup 1)
(unspec_volatile:GPR
@@ -435,7 +435,7 @@
(set_attr "sync_insn1_op2" "2")])
(define_insn "sync_<optab><mode>"
- [(set (match_operand:GPR 0 "memory_operand" "+ZR,ZR")
+ [(set (match_operand:GPR 0 "memory_operand" "+ZC,ZC")
(unspec_volatile:GPR
[(fetchop_bit:GPR (match_operand:GPR 1 "uns_arith_operand" "K,d")
(match_dup 0))]
@@ -448,7 +448,7 @@
(define_insn "sync_old_<optab><mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR
[(fetchop_bit:GPR (match_operand:GPR 2 "uns_arith_operand" "K,d")
@@ -463,7 +463,7 @@
(define_insn "sync_new_<optab><mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR
[(fetchop_bit:GPR (match_operand:GPR 2 "uns_arith_operand" "K,d")
@@ -478,7 +478,7 @@
(set_attr "sync_insn1_op2" "2")])
(define_insn "sync_nand<mode>"
- [(set (match_operand:GPR 0 "memory_operand" "+ZR,ZR")
+ [(set (match_operand:GPR 0 "memory_operand" "+ZC,ZC")
(unspec_volatile:GPR [(match_operand:GPR 1 "uns_arith_operand" "K,d")]
UNSPEC_SYNC_OLD_OP))]
"GENERATE_LL_SC"
@@ -490,7 +490,7 @@
(define_insn "sync_old_nand<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "uns_arith_operand" "K,d")]
UNSPEC_SYNC_OLD_OP))]
@@ -504,7 +504,7 @@
(define_insn "sync_new_nand<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "uns_arith_operand" "K,d")]
UNSPEC_SYNC_NEW_OP))]
@@ -519,7 +519,7 @@
(define_insn "sync_lock_test_and_set<mode>"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (match_operand:GPR 1 "memory_operand" "+ZR,ZR"))
+ (match_operand:GPR 1 "memory_operand" "+ZC,ZC"))
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "arith_operand" "I,d")]
UNSPEC_SYNC_EXCHANGE))]
@@ -546,7 +546,7 @@
(define_insn "test_and_set_12"
[(set (match_operand:SI 0 "register_operand" "=&d")
- (match_operand:SI 1 "memory_operand" "+ZR"))
+ (match_operand:SI 1 "memory_operand" "+ZC"))
(set (match_dup 1)
(unspec_volatile:SI [(match_operand:SI 2 "register_operand" "d")
(match_operand:SI 3 "register_operand" "d")
@@ -576,7 +576,7 @@
;; TODO: the obscuring unspec can be relaxed for permissive memory
;; models.
;; Same applies to other atomic_* patterns.
- (unspec_volatile:GPR [(match_operand:GPR 2 "memory_operand" "+ZR,ZR")
+ (unspec_volatile:GPR [(match_operand:GPR 2 "memory_operand" "+ZC,ZC")
(match_operand:GPR 3 "reg_or_0_operand" "dJ,dJ")]
UNSPEC_ATOMIC_COMPARE_AND_SWAP))
(set (match_operand:GPR 1 "register_operand" "=&d,&d")
@@ -629,7 +629,7 @@
(define_insn "atomic_exchange<mode>_llsc"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+ZR,ZR")]
+ (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+ZC,ZC")]
UNSPEC_ATOMIC_EXCHANGE))
(set (match_dup 1)
(unspec_volatile:GPR [(match_operand:GPR 2 "arith_operand" "I,d")]
@@ -684,7 +684,7 @@
(define_insn "atomic_fetch_add<mode>_llsc"
[(set (match_operand:GPR 0 "register_operand" "=&d,&d")
- (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+ZR,ZR")]
+ (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+ZC,ZC")]
UNSPEC_ATOMIC_FETCH_OP))
(set (match_dup 1)
(unspec_volatile:GPR
diff --git a/gcc/config/mips/t-img-elf b/gcc/config/mips/t-img-elf
new file mode 100644
index 00000000000..cc5dabbc075
--- /dev/null
+++ b/gcc/config/mips/t-img-elf
@@ -0,0 +1,38 @@
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+# The default build is mips32r6, hard-float big-endian.
+# A multilib for mips32r6+LE
+# A multilib for mips64r6
+# A multilib for mips64r6+LE
+# A multilib for mips32r6+LE+singlefloat+shortdouble
+
+MULTILIB_OPTIONS = mips64r6 mabi=64 EL msoft-float/msingle-float fshort-double
+MULTILIB_DIRNAMES = mips64r6 64 el sof sgl short
+MULTILIB_MATCHES = EL=mel EB=meb
+
+# Don't build 64r6 with single-float
+MULTILIB_EXCEPTIONS += mips64r6/*msingle-float*
+MULTILIB_EXCEPTIONS += mips64r6/*fshort-double*
+
+MULTILIB_EXCEPTIONS += mabi=64*
+MULTILIB_EXCEPTIONS += msingle-float*
+MULTILIB_EXCEPTIONS += *msingle-float
+MULTILIB_EXCEPTIONS += fshort-double
+MULTILIB_EXCEPTIONS += EL/fshort-double
+MULTILIB_EXCEPTIONS += *msoft-float/fshort-double
diff --git a/gcc/config/mips/t-img-linux b/gcc/config/mips/t-img-linux
new file mode 100644
index 00000000000..5dbfbe45ef7
--- /dev/null
+++ b/gcc/config/mips/t-img-linux
@@ -0,0 +1,30 @@
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+# The default build is mips32r6, hard-float big-endian. Add mips64r6,
+# 64-bit ABI and little-endian variations.
+
+MULTILIB_OPTIONS = mips64r6 mabi=64 EL
+MULTILIB_DIRNAMES = mips64r6 64 el
+MULTILIB_MATCHES = EL=mel EB=meb
+
+# The 64 bit ABI is not supported on the mips32r6 architecture.
+# Because mips32r6 is the default we can't use that flag to trigger
+# the exception so we check for mabi=64 with no specific mips
+# architecture flag instead.
+MULTILIB_EXCEPTIONS += mabi=64*
diff --git a/gcc/config/mips/t-isa3264 b/gcc/config/mips/t-isa3264
index a5e001ef74d..8455745233e 100644
--- a/gcc/config/mips/t-isa3264
+++ b/gcc/config/mips/t-isa3264
@@ -22,12 +22,12 @@ ifneq ($(filter MIPS_ABI_DEFAULT=ABI_EABI,$(tm_defines)),)
MULTILIB_OPTIONS = msoft-float EL/EB mips32/mips32r2/mips64/mips64r2
MULTILIB_DIRNAMES = soft-float el eb mips32 mips32r2 mips64 mips64r2
else
-MULTILIB_OPTIONS = msoft-float/mfp64 EL/EB mips32/mips32r2/mips64/mips64r2
-MULTILIB_DIRNAMES = soft-float fp64 el eb mips32 mips32r2 mips64 mips64r2
+MULTILIB_OPTIONS = msoft-float/mfp64 EL/EB mips32/mips32r2/mips32r6/mips64/mips64r2/mips64r6
+MULTILIB_DIRNAMES = soft-float fp64 el eb mips32 mips32r2 mips32r6 mips64 mips64r2 mips64r6
ifneq ($(filter MIPS_ISA_DEFAULT=33,$(tm_defines)),)
-MULTILIB_EXCLUSIONS = mips32/mfp64 mips64/mfp64 mips64r2/mfp64
+MULTILIB_EXCLUSIONS = mips32/mfp64 mips64/mfp64 mips64r2/mfp64 mips32r6/mfp64 mips64r6/mfp64
else
MULTILIB_EXCLUSIONS = !mips32r2/mfp64
endif
endif
-MULTILIB_MATCHES = EL=mel EB=meb
+MULTILIB_MATCHES = EL=mel EB=meb mips32r2=mips32r3 mips32r2=mips32r5 mips64r2=mips64r3 mips64r2=mips64r5
diff --git a/gcc/config/mips/t-linux-android b/gcc/config/mips/t-linux-android
index 298cad9d5ac..39f512c8166 100644
--- a/gcc/config/mips/t-linux-android
+++ b/gcc/config/mips/t-linux-android
@@ -1,3 +1,3 @@
-MULTILIB_OPTIONS = mips32r2
-MULTILIB_DIRNAMES = mips-r2
-MULTILIB_EXCLUSIONS :=
+MULTILIB_OPTIONS = mips32r2/mips32r6
+MULTILIB_DIRNAMES = mips-r2 mips-r6
+MULTILIB_OSDIRNAMES = ../libr2 ../libr6
diff --git a/gcc/config/mips/t-linux-android64 b/gcc/config/mips/t-linux-android64
new file mode 100644
index 00000000000..55cab7d6201
--- /dev/null
+++ b/gcc/config/mips/t-linux-android64
@@ -0,0 +1,4 @@
+MULTILIB_OPTIONS = mabi=32 mips32/mips32r2/mips32r6/mips64r2/mips64r6
+MULTILIB_DIRNAMES = 32 mips-r1 mips-r2 mips-r6 mips64-r2 mips64-r6
+MULTILIB_OSDIRNAMES = ../lib ../lib ../libr2 ../libr6 ../lib64r2 ../lib64
+MULTILIB_REQUIRED = mabi=32/mips32 mabi=32/mips32r2 mabi=32/mips32r6 mips64r2 mips64r6
diff --git a/gcc/config/mips/t-mti-elf b/gcc/config/mips/t-mti-elf
index 1109ea71661..75cf0da778b 100644
--- a/gcc/config/mips/t-mti-elf
+++ b/gcc/config/mips/t-mti-elf
@@ -19,9 +19,9 @@
# The default build is mips32r2, hard-float big-endian. Add mips32,
# soft-float, and little-endian variations.
-MULTILIB_OPTIONS = mips32/mips64/mips64r2 mips16/mmicromips mabi=64 EL msoft-float/mfp64 mnan=2008
-MULTILIB_DIRNAMES = mips32 mips64 mips64r2 mips16 micromips 64 el sof fp64 nan2008
-MULTILIB_MATCHES = EL=mel EB=meb
+MULTILIB_OPTIONS = mips64r2 mmicromips mabi=64 EL msoft-float mnan=2008
+MULTILIB_DIRNAMES = mips64r2 micromips 64 el sof nan2008
+MULTILIB_MATCHES = EL=mel EB=meb mips32r2=mips32r3 mips32r2=mips32r5 mips64r2=mips64r3 mips64r2=mips64r5
# The 64 bit ABI is not supported on the mips32 architecture.
MULTILIB_EXCEPTIONS += *mips32*/*mabi=64*
@@ -44,7 +44,9 @@ MULTILIB_EXCEPTIONS += *mmicromips/mabi=64*
# We do not want nan2008 libraries for soft-float.
MULTILIB_EXCEPTIONS += *msoft-float*/*mnan=2008*
-# -mfp64 libraries are only built for mips32r2 and not in mips16 mode.
-MULTILIB_EXCEPTIONS += *mips32/*mfp64*
-MULTILIB_EXCEPTIONS += *mips64*/*mfp64*
-MULTILIB_EXCEPTIONS += *mips16*/*mfp64*
+# Cutbacks for released build
+MULTILIB_EXCEPTIONS += *mips64*/*mnan=2008*
+MULTILIB_EXCEPTIONS += *micromips/EL
+MULTILIB_EXCEPTIONS += *micromips/msoft-float
+MULTILIB_EXCEPTIONS += *micromips/mnan=2008*
+MULTILIB_EXCEPTIONS += *micromips
diff --git a/gcc/config/mips/t-mti-linux b/gcc/config/mips/t-mti-linux
index 1109ea71661..85060430453 100644
--- a/gcc/config/mips/t-mti-linux
+++ b/gcc/config/mips/t-mti-linux
@@ -19,9 +19,9 @@
# The default build is mips32r2, hard-float big-endian. Add mips32,
# soft-float, and little-endian variations.
-MULTILIB_OPTIONS = mips32/mips64/mips64r2 mips16/mmicromips mabi=64 EL msoft-float/mfp64 mnan=2008
-MULTILIB_DIRNAMES = mips32 mips64 mips64r2 mips16 micromips 64 el sof fp64 nan2008
-MULTILIB_MATCHES = EL=mel EB=meb
+MULTILIB_OPTIONS = muclibc mips64r2 mmicromips mabi=64 EL mnan=2008
+MULTILIB_DIRNAMES = uclibc mips64r2 micromips 64 el nan2008
+MULTILIB_MATCHES = EL=mel EB=meb mips32r2=mips32r3 mips32r2=mips32r5 mips64r2=mips64r3 mips64r2=mips64r5
# The 64 bit ABI is not supported on the mips32 architecture.
MULTILIB_EXCEPTIONS += *mips32*/*mabi=64*
@@ -44,7 +44,11 @@ MULTILIB_EXCEPTIONS += *mmicromips/mabi=64*
# We do not want nan2008 libraries for soft-float.
MULTILIB_EXCEPTIONS += *msoft-float*/*mnan=2008*
-# -mfp64 libraries are only built for mips32r2 and not in mips16 mode.
-MULTILIB_EXCEPTIONS += *mips32/*mfp64*
-MULTILIB_EXCEPTIONS += *mips64*/*mfp64*
-MULTILIB_EXCEPTIONS += *mips16*/*mfp64*
+# Cutbacks for released build
+MULTILIB_EXCEPTIONS += *uclibc*/*mips64*
+MULTILIB_EXCEPTIONS += *uclibc*/*mabi=64*
+MULTILIB_EXCEPTIONS += *uclibc*/*micromips*
+MULTILIB_EXCEPTIONS += *mips64*/*mnan=2008*
+MULTILIB_EXCEPTIONS += *micromips/EL
+MULTILIB_EXCEPTIONS += *micromips/mnan=2008*
+MULTILIB_EXCEPTIONS += *micromips
diff --git a/gcc/config/mips/t-sde b/gcc/config/mips/t-sde
index 229e3d6442c..c04b5f3473d 100644
--- a/gcc/config/mips/t-sde
+++ b/gcc/config/mips/t-sde
@@ -18,7 +18,7 @@
MULTILIB_OPTIONS = EL/EB mips32/mips32r2/mips64/mips64r2 mips16/mmicromips msoft-float/mfp64 mcode-readable=no
MULTILIB_DIRNAMES = el eb mips32 mips32r2 mips64 mips64r2 mips16 micromips sof f64 spram
-MULTILIB_MATCHES = EL=mel EB=meb
+MULTILIB_MATCHES = EL=mel EB=meb mips32r2=mips32r3 mips32r2=mips32r5 mips64r2=mips64r3 mips64r2=mips64r5
# The -mfp64 option is only valid in conjunction with -mips32r2.
ifneq ($(filter MIPS_ISA_DEFAULT=33,$(tm_defines)),)
diff --git a/gcc/config/mips/t-sdemtk b/gcc/config/mips/t-sdemtk
index 820faa305e8..2c1dea804fe 100644
--- a/gcc/config/mips/t-sdemtk
+++ b/gcc/config/mips/t-sdemtk
@@ -21,6 +21,7 @@
MULTILIB_OPTIONS = EL/EB mips32/mips32r2/mips64/mips64r2 mips16 msoft-float/mno-float/mfp64
MULTILIB_DIRNAMES = el eb mips32 mips32r2 mips64 mips64r2 mips16 sof nof f64
+MULTILIB_MATCHES = mips32r2=mips32r3 mips32r2=mips32r5 mips64r2=mips64r3 mips64r2=mips64r5
# Remove stdarg.h and stddef.h from USER_H.
USER_H = $(srcdir)/ginclude/float.h \
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 9b3022d60a0..730e6c8a6a0 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -80,6 +80,8 @@
#include "cgraph.h"
#include "target-globals.h"
#include "real.h"
+#include "context.h"
+#include "tree-pass.h"
#if TARGET_XCOFF
#include "xcoffout.h" /* get declarations of xcoff_*_section_name */
#endif
@@ -1172,6 +1174,7 @@ static bool rs6000_secondary_reload_move (enum rs6000_reg_type,
enum machine_mode,
secondary_reload_info *,
bool);
+rtl_opt_pass *make_pass_analyze_swaps (gcc::context*);
/* Hash table stuff for keeping track of TOC entries. */
@@ -1542,17 +1545,6 @@ static const struct attribute_spec rs6000_attribute_table[] =
#define TARGET_STACK_PROTECT_FAIL rs6000_stack_protect_fail
#endif
-/* MPC604EUM 3.5.2 Weak Consistency between Multiple Processors
- The PowerPC architecture requires only weak consistency among
- processors--that is, memory accesses between processors need not be
- sequentially consistent and memory accesses among processors can occur
- in any order. The ability to order memory accesses weakly provides
- opportunities for more efficient use of the system bus. Unless a
- dependency exists, the 604e allows read operations to precede store
- operations. */
-#undef TARGET_RELAXED_ORDERING
-#define TARGET_RELAXED_ORDERING true
-
#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL rs6000_output_dwarf_dtprel
@@ -4085,6 +4077,15 @@ static void
rs6000_option_override (void)
{
(void) rs6000_option_override_internal (true);
+
+ /* Register machine-specific passes. This needs to be done at start-up.
+ It's convenient to do it here (like i386 does). */
+ opt_pass *pass_analyze_swaps = make_pass_analyze_swaps (g);
+
+ static struct register_pass_info analyze_swaps_info
+ = { pass_analyze_swaps, "cse1", 1, PASS_POS_INSERT_BEFORE };
+
+ register_pass (&analyze_swaps_info);
}
@@ -33096,7 +33097,1189 @@ emit_fusion_gpr_load (rtx target, rtx mem)
return "";
}
+
+/* Analyze vector computations and remove unnecessary doubleword
+ swaps (xxswapdi instructions). This pass is performed only
+ for little-endian VSX code generation.
+
+ For this specific case, loads and stores of 4x32 and 2x64 vectors
+ are inefficient. These are implemented using the lxvd2x and
+ stxvd2x instructions, which invert the order of doublewords in
+ a vector register. Thus the code generation inserts an xxswapdi
+ after each such load, and prior to each such store. (For spill
+ code after register assignment, an additional xxswapdi is inserted
+ following each store in order to return a hard register to its
+ unpermuted value.)
+
+ The extra xxswapdi instructions reduce performance. This can be
+ particularly bad for vectorized code. The purpose of this pass
+ is to reduce the number of xxswapdi instructions required for
+ correctness.
+
+ The primary insight is that much code that operates on vectors
+ does not care about the relative order of elements in a register,
+ so long as the correct memory order is preserved. If we have
+ a computation where all input values are provided by lxvd2x/xxswapdi
+ sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
+ and all intermediate computations are pure SIMD (independent of
+ element order), then all the xxswapdi's associated with the loads
+ and stores may be removed.
+
+ This pass uses some of the infrastructure and logical ideas from
+ the "web" pass in web.c. We create maximal webs of computations
+ fitting the description above using union-find. Each such web is
+ then optimized by removing its unnecessary xxswapdi instructions.
+
+ The pass is placed prior to global optimization so that we can
+ perform the optimization in the safest and simplest way possible;
+ that is, by replacing each xxswapdi insn with a register copy insn.
+ Subsequent forward propagation will remove copies where possible.
+
+ There are some operations sensitive to element order for which we
+ can still allow the operation, provided we modify those operations.
+ These include CONST_VECTORs, for which we must swap the first and
+ second halves of the constant vector; and SUBREGs, for which we
+ must adjust the byte offset to account for the swapped doublewords.
+ A remaining opportunity would be non-immediate-form splats, for
+ which we should adjust the selected lane of the input. We should
+ also make code generation adjustments for sum-across operations,
+ since this is a common vectorizer reduction.
+
+ Because we run prior to the first split, we can see loads and stores
+ here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla
+ vector loads and stores that have not yet been split into a permuting
+ load/store and a swap. (One way this can happen is with a builtin
+ call to vec_vsx_{ld,st}.) We can handle these as well, but rather
+ than deleting a swap, we convert the load/store into a permuting
+ load/store (which effectively removes the swap). */
+
+/* Notes on Permutes
+
+ We do not currently handle computations that contain permutes. There
+ is a general transformation that can be performed correctly, but it
+ may introduce more expensive code than it replaces. To handle these
+ would require a cost model to determine when to perform the optimization.
+ This commentary records how this could be done if desired.
+
+ The most general permute is something like this (example for V16QI):
+
+ (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
+ (parallel [(const_int a0) (const_int a1)
+ ...
+ (const_int a14) (const_int a15)]))
+
+ where a0,...,a15 are in [0,31] and select elements from op1 and op2
+ to appear in the result.
+
+ Regardless of mode, we can convert the PARALLEL to a mask of 16
+ byte-element selectors. Let's call this M, with M[i] representing
+ the ith byte-element selector value. Then if we swap doublewords
+ throughout the computation, we can get correct behavior by replacing
+ M with M' as follows:
+
+            { M[i+8]+8 : i < 8,  M[i+8] in [0,7] U [16,23]
+    M'[i] = { M[i+8]-8 : i < 8,  M[i+8] in [8,15] U [24,31]
+            { M[i-8]+8 : i >= 8, M[i-8] in [0,7] U [16,23]
+            { M[i-8]-8 : i >= 8, M[i-8] in [8,15] U [24,31]
+
+ This seems promising at first, since we are just replacing one mask
+ with another. But certain masks are preferable to others. If M
+ is a mask that matches a vmrghh pattern, for example, M' certainly
+ will not. Instead of a single vmrghh, we would generate a load of
+ M' and a vperm. So we would need to know how many xxswapd's we can
+ remove as a result of this transformation to determine if it's
+ profitable; and preferably the logic would need to be aware of all
+ the special preferable masks.
+
+ Another form of permute is an UNSPEC_VPERM, in which the mask is
+ already in a register. In some cases, this mask may be a constant
+ that we can discover with ud-chains, in which case the above
+ transformation is ok. However, the common usage here is for the
+ mask to be produced by an UNSPEC_LVSL, in which case the mask
+ cannot be known at compile time. In such a case we would have to
+ generate several instructions to compute M' as above at run time,
+ and a cost model is needed again. */
+
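A minimal sketch of the M -> M' remapping described above, written as standalone C for illustration only (the function name is hypothetical and nothing like it appears in this patch):

    /* Illustrative sketch: remap a 16-entry byte-selector mask M (values
       in [0,31]) for doubleword-swapped operands, per the table above.
       Each selector moves to the other half of the mask and is adjusted
       by 8 according to which doubleword of op1/op2 it named.  */
    static void
    remap_selectors_for_swap (const unsigned char m[16],
                              unsigned char m_prime[16])
    {
      for (int i = 0; i < 16; i++)
        {
          unsigned char sel = m[(i + 8) % 16];   /* swap the mask halves */
          /* Selectors 0-7 and 16-23 name the first doubleword of an
             operand; 8-15 and 24-31 name the second.  */
          if ((sel & 8) == 0)
            m_prime[i] = sel + 8;
          else
            m_prime[i] = sel - 8;
        }
    }

Whether the resulting M' still matches one of the cheap merge/permute idioms is exactly the cost question raised above.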
+/* This is based on the union-find logic in web.c. web_entry_base is
+ defined in df.h. */
+class swap_web_entry : public web_entry_base
+{
+ public:
+ /* Pointer to the insn. */
+ rtx insn;
+ /* Set if insn contains a mention of a vector register. All other
+ fields are undefined if this field is unset. */
+ unsigned int is_relevant : 1;
+ /* Set if insn is a load. */
+ unsigned int is_load : 1;
+ /* Set if insn is a store. */
+ unsigned int is_store : 1;
+ /* Set if insn is a doubleword swap. This can either be a register swap
+ or a permuting load or store (test is_load and is_store for this). */
+ unsigned int is_swap : 1;
+ /* Set if the insn has a live-in use of a parameter register. */
+ unsigned int is_live_in : 1;
+ /* Set if the insn has a live-out def of a return register. */
+ unsigned int is_live_out : 1;
+ /* Set if the insn contains a subreg reference of a vector register. */
+ unsigned int contains_subreg : 1;
+ /* Set if the insn contains a 128-bit integer operand. */
+ unsigned int is_128_int : 1;
+ /* Set if this is a call-insn. */
+ unsigned int is_call : 1;
+ /* Set if this insn does not perform a vector operation for which
+ element order matters, or if we know how to fix it up if it does.
+ Undefined if is_swap is set. */
+ unsigned int is_swappable : 1;
+ /* A nonzero value indicates what kind of special handling for this
+ insn is required if doublewords are swapped. Undefined if
+ is_swappable is not set. */
+ unsigned int special_handling : 3;
+ /* Set if the web represented by this entry cannot be optimized. */
+ unsigned int web_not_optimizable : 1;
+ /* Set if this insn should be deleted. */
+ unsigned int will_delete : 1;
+};
+
+enum special_handling_values {
+ SH_NONE = 0,
+ SH_CONST_VECTOR,
+ SH_SUBREG,
+ SH_NOSWAP_LD,
+ SH_NOSWAP_ST,
+ SH_EXTRACT,
+ SH_SPLAT
+};
+
+/* Union INSN with all insns containing definitions that reach USE.
+ Detect whether USE is live-in to the current function. */
+static void
+union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
+{
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ while (link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (def_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Union INSN with all insns containing uses reached from DEF.
+ Detect whether DEF is live-out from the current function. */
+static void
+union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
+{
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ while (link)
+ {
+ /* This could be an eh use or some other artificial use;
+ we treat these all the same (killing the optimization). */
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (use_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Return 1 iff INSN is a load insn, including permuting loads that
+ represent an lxvd2x instruction; else return 0. */
+static unsigned int
+insn_is_load_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+
+ if (GET_CODE (body) == SET)
+ {
+ if (GET_CODE (SET_SRC (body)) == MEM)
+ return 1;
+
+ if (GET_CODE (SET_SRC (body)) == VEC_SELECT
+ && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
+ return 1;
+
+ return 0;
+ }
+
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+
+ rtx set = XVECEXP (body, 0, 0);
+
+ if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
+ return 1;
+
+ return 0;
+}
+
+/* Return 1 iff INSN is a store insn, including permuting stores that
+ represent an stxvd2x instruction; else return 0. */
+static unsigned int
+insn_is_store_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
+ return 1;
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+ rtx set = XVECEXP (body, 0, 0);
+ if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
+ return 1;
+ return 0;
+}
+
+/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap,
+ a permuting load, or a permuting store. */
+static unsigned int
+insn_is_swap_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) != SET)
+ return 0;
+ rtx rhs = SET_SRC (body);
+ if (GET_CODE (rhs) != VEC_SELECT)
+ return 0;
+ rtx parallel = XEXP (rhs, 1);
+ if (GET_CODE (parallel) != PARALLEL)
+ return 0;
+ unsigned int len = XVECLEN (parallel, 0);
+ if (len != 2 && len != 4 && len != 8 && len != 16)
+ return 0;
+ for (unsigned int i = 0; i < len / 2; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
+ return 0;
+ }
+ for (unsigned int i = len / 2; i < len; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
+ return 0;
+ }
+ return 1;
+}
+
+/* Return 1 iff OP is an operand that will not be affected by having
+ vector doublewords swapped in memory. */
+static unsigned int
+rtx_is_swappable_p (rtx op, unsigned int *special)
+{
+ enum rtx_code code = GET_CODE (op);
+ int i, j;
+ rtx parallel;
+
+ switch (code)
+ {
+ case LABEL_REF:
+ case SYMBOL_REF:
+ case CLOBBER:
+ case REG:
+ return 1;
+
+ case VEC_CONCAT:
+ case ASM_INPUT:
+ case ASM_OPERANDS:
+ return 0;
+
+ case CONST_VECTOR:
+ {
+ *special = SH_CONST_VECTOR;
+ return 1;
+ }
+
+ case VEC_DUPLICATE:
+ /* Opportunity: If XEXP (op, 0) has the same mode as the result,
+ and XEXP (op, 1) is a PARALLEL with a single QImode const int,
+ it represents a vector splat for which we can do special
+ handling. */
+ if (GET_CODE (XEXP (op, 0)) == CONST_INT)
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+ /* This catches V2DF and V2DI splat, at a minimum. */
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+ /* If the duplicated item is from a select, defer to the select
+ processing to see if we can change the lane for the splat. */
+ return rtx_is_swappable_p (XEXP (op, 0), special);
+ else
+ return 0;
+
+ case VEC_SELECT:
+ /* A vec_extract operation is ok if we change the lane. */
+ if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+ && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+ && XVECLEN (parallel, 0) == 1
+ && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+ {
+ *special = SH_EXTRACT;
+ return 1;
+ }
+ else
+ return 0;
+
+ case UNSPEC:
+ {
+ /* Various operations are unsafe for this optimization, at least
+ without significant additional work. Permutes are obviously
+ problematic, as both the permute control vector and the ordering
+ of the target values are invalidated by doubleword swapping.
+ Vector pack and unpack modify the number of vector lanes.
+ Merge-high/low will not operate correctly on swapped operands.
+ Vector shifts across element boundaries are clearly uncool,
+ as are vector select and concatenate operations. Vector
+ sum-across instructions define one operand with a specific
+ order-dependent element, so additional fixup code would be
+ needed to make those work. Vector set and non-immediate-form
+ vector splat are element-order sensitive. A few of these
+ cases might be workable with special handling if required. */
+ int val = XINT (op, 1);
+ switch (val)
+ {
+ default:
+ break;
+ case UNSPEC_VMRGH_DIRECT:
+ case UNSPEC_VMRGL_DIRECT:
+ case UNSPEC_VPACK_SIGN_SIGN_SAT:
+ case UNSPEC_VPACK_SIGN_UNS_SAT:
+ case UNSPEC_VPACK_UNS_UNS_MOD:
+ case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
+ case UNSPEC_VPACK_UNS_UNS_SAT:
+ case UNSPEC_VPERM:
+ case UNSPEC_VPERM_UNS:
+ case UNSPEC_VPERMHI:
+ case UNSPEC_VPERMSI:
+ case UNSPEC_VPKPX:
+ case UNSPEC_VSLDOI:
+ case UNSPEC_VSLO:
+ case UNSPEC_VSRO:
+ case UNSPEC_VSUM2SWS:
+ case UNSPEC_VSUM4S:
+ case UNSPEC_VSUM4UBS:
+ case UNSPEC_VSUMSWS:
+ case UNSPEC_VSUMSWS_DIRECT:
+ case UNSPEC_VSX_CONCAT:
+ case UNSPEC_VSX_SET:
+ case UNSPEC_VSX_SLDWI:
+ case UNSPEC_VUNPACK_HI_SIGN:
+ case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
+ case UNSPEC_VUNPACK_LO_SIGN:
+ case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
+ case UNSPEC_VUPKHPX:
+ case UNSPEC_VUPKHS_V4SF:
+ case UNSPEC_VUPKHU_V4SF:
+ case UNSPEC_VUPKLPX:
+ case UNSPEC_VUPKLS_V4SF:
+ case UNSPEC_VUPKLU_V4SF:
+ /* The following could be handled as an idiom with XXSPLTW.
+ These place a scalar in BE element zero, but the XXSPLTW
+ will currently expect it in BE element 2 in a swapped
+ region. When one of these feeds an XXSPLTW with no other
+ defs/uses either way, we can avoid the lane change for
+ XXSPLTW and things will be correct. TBD. */
+ case UNSPEC_VSX_CVDPSPN:
+ case UNSPEC_VSX_CVSPDP:
+ case UNSPEC_VSX_CVSPDPN:
+ return 0;
+ case UNSPEC_VSPLT_DIRECT:
+ *special = SH_SPLAT;
+ return 1;
+ }
+ }
+
+ default:
+ break;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int ok = 1;
+
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && special_op != SH_NONE
+ && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && special_op != SH_NONE
+ && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+
+ return ok;
+}
+
+/* Return 1 iff INSN is an operand that will not be affected by
+ having vector doublewords swapped in memory (in which case
+ *SPECIAL is unchanged), or that can be modified to be correct
+ if vector doublewords are swapped in memory (in which case
+ *SPECIAL is changed to a value indicating how). */
+static unsigned int
+insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
+ unsigned int *special)
+{
+ /* Calls are always bad. */
+ if (GET_CODE (insn) == CALL_INSN)
+ return 0;
+
+ /* Loads and stores seen here are not permuting, but we can still
+ fix them up by converting them to permuting ones. Exceptions:
+ UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
+ body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
+ for the SET source. */
+ rtx body = PATTERN (insn);
+ int i = INSN_UID (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ if (GET_CODE (body) == SET)
+ {
+ *special = SH_NOSWAP_LD;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ if (insn_entry[i].is_store)
+ {
+ if (GET_CODE (body) == SET && GET_CODE (SET_SRC (body)) != UNSPEC)
+ {
+ *special = SH_NOSWAP_ST;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ /* Otherwise check the operands for vector lane violations. */
+ return rtx_is_swappable_p (body, special);
+}
+
+enum chain_purpose { FOR_LOADS, FOR_STORES };
+
+/* Return true if the UD or DU chain headed by LINK is non-empty,
+ and every entry on the chain references an insn that is a
+ register swap. Furthermore, if PURPOSE is FOR_LOADS, each such
+ register swap must have only permuting loads as reaching defs.
+ If PURPOSE is FOR_STORES, each such register swap must have only
+ register swaps or permuting stores as reached uses. */
+static bool
+chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
+ enum chain_purpose purpose)
+{
+ if (!link)
+ return false;
+
+ for (; link; link = link->next)
+ {
+ if (!VECTOR_MODE_P (GET_MODE (DF_REF_REG (link->ref))))
+ continue;
+
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx reached_insn = DF_REF_INSN (link->ref);
+ unsigned uid = INSN_UID (reached_insn);
+
+ if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
+ || insn_entry[uid].is_store)
+ return false;
+
+ if (purpose == FOR_LOADS)
+ {
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ struct df_link *swap_link = DF_REF_CHAIN (use);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_def_insn);
+
+ /* Only permuting loads are allowed. */
+ if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ else if (purpose == FOR_STORES)
+ {
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *swap_link = DF_REF_CHAIN (def);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_use_insn);
+
+ /* Permuting stores or register swaps are allowed. */
+ if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/* Mark the xxswapdi instructions associated with permuting loads and
+ stores for removal. Note that we only flag them for deletion here,
+ as there is a possibility of a swap being reached from multiple
+ loads, etc. */
+static void
+mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
+{
+ rtx insn = insn_entry[i].insn;
+ unsigned uid = INSN_UID (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (use_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store)
+ {
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ /* Ignore uses for addressability. */
+ machine_mode mode = GET_MODE (DF_REF_REG (use));
+ if (!VECTOR_MODE_P (mode))
+ continue;
+
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (def_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+}
+
+/* OP is either a CONST_VECTOR or an expression containing one.
+ Swap the first half of the vector with the second in the first
+ case. Recurse to find it in the second. */
+static void
+swap_const_vector_halves (rtx op)
+{
+ int i;
+ enum rtx_code code = GET_CODE (op);
+ if (GET_CODE (op) == CONST_VECTOR)
+ {
+ int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
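+ /* E.g., a V4SI constant { a, b, c, d } becomes { c, d, a, b };
+ the two doublewords change places. */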
+ for (i = 0; i < half_units; ++i)
+ {
+ rtx temp = CONST_VECTOR_ELT (op, i);
+ CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
+ CONST_VECTOR_ELT (op, i + half_units) = temp;
+ }
+ }
+ else
+ {
+ int j;
+ const char *fmt = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ swap_const_vector_halves (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ swap_const_vector_halves (XVECEXP (op, i, j));
+ }
+}
+
+/* Find all subregs of a vector expression that perform a narrowing,
+ and adjust the subreg index to account for doubleword swapping. */
+static void
+adjust_subreg_index (rtx op)
+{
+ enum rtx_code code = GET_CODE (op);
+ if (code == SUBREG
+ && (GET_MODE_SIZE (GET_MODE (op))
+ < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
+ {
+ unsigned int index = SUBREG_BYTE (op);
+ if (index < 8)
+ index += 8;
+ else
+ index -= 8;
+ SUBREG_BYTE (op) = index;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int i, j;
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ adjust_subreg_index (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ adjust_subreg_index (XVECEXP (op, i, j));
+}
+
+/* Convert the non-permuting load INSN to a permuting one. */
+static void
+permute_load (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx mem_op = SET_SRC (body);
+ rtx tgt_reg = SET_DEST (body);
+ machine_mode mode = GET_MODE (tgt_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
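+ /* The selector just built exchanges the two doublewords; e.g. for
+ V4SI it is [2 3 0 1]. permute_store below builds the same one. */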
+ rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing load %d with permuted load\n",
+ INSN_UID (insn));
+}
+
+/* Convert the non-permuting store INSN to a permuting one. */
+static void
+permute_store (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx src_reg = SET_SRC (body);
+ machine_mode mode = GET_MODE (src_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing store %d with permuted store\n",
+ INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector extract operation, adjust the index
+ of the extracted lane to account for the doubleword swap. */
+static void
+adjust_extract (rtx insn)
+{
+ rtx src = SET_SRC (PATTERN (insn));
+ /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+ account for that. */
+ rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
+ rtx par = XEXP (sel, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
+ int lane = INTVAL (XVECEXP (par, 0, 0));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
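+ /* E.g., for V4SI, lanes 0, 1, 2, 3 become 2, 3, 0, 1 respectively. */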
+ XVECEXP (par, 0, 0) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector direct-splat operation, adjust the
+ index of the source lane to account for the doubleword swap. */
+static void
+adjust_splat (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ rtx unspec = XEXP (body, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
+ int lane = INTVAL (XVECEXP (unspec, 0, 1));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+ XVECEXP (unspec, 0, 1) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
+}
+
+/* The insn described by INSN_ENTRY[I] can be swapped, but only
+ with special handling. Take care of that here. */
+static void
+handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+
+ switch (insn_entry[i].special_handling)
+ {
+ default:
+ gcc_unreachable ();
+ case SH_CONST_VECTOR:
+ {
+ /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
+ gcc_assert (GET_CODE (body) == SET);
+ rtx rhs = SET_SRC (body);
+ swap_const_vector_halves (rhs);
+ if (dump_file)
+ fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
+ break;
+ }
+ case SH_SUBREG:
+ /* A subreg of the same size is already safe. For subregs that
+ select a smaller portion of a reg, adjust the index for
+ swapped doublewords. */
+ adjust_subreg_index (body);
+ if (dump_file)
+ fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
+ break;
+ case SH_NOSWAP_LD:
+ /* Convert a non-permuting load to a permuting one. */
+ permute_load (insn);
+ break;
+ case SH_NOSWAP_ST:
+ /* Convert a non-permuting store to a permuting one. */
+ permute_store (insn);
+ break;
+ case SH_EXTRACT:
+ /* Change the lane on an extract operation. */
+ adjust_extract (insn);
+ break;
+ case SH_SPLAT:
+ /* Change the lane on a direct-splat operation. */
+ adjust_splat (insn);
+ break;
+ }
+}
+
+/* Find the insn from the Ith table entry, which is known to be a
+ register swap Y = SWAP(X). Replace it with a copy Y = X. */
+static void
+replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+ rtx src_reg = XEXP (SET_SRC (body), 0);
+ rtx copy = gen_rtx_SET (VOIDmode, SET_DEST (body), src_reg);
+ rtx new_insn = emit_insn_before (copy, insn);
+ set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+ df_insn_rescan (new_insn);
+
+ if (dump_file)
+ {
+ unsigned int new_uid = INSN_UID (new_insn);
+ fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
+ }
+
+ df_insn_delete (insn);
+ remove_insn (insn);
+ INSN_DELETED_P (insn) = 1;
+}
+
+/* Dump the swap table to DUMP_FILE. */
+static void
+dump_swap_insn_table (swap_web_entry *insn_entry)
+{
+ int e = get_max_uid ();
+ fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
+
+ for (int i = 0; i < e; ++i)
+ if (insn_entry[i].is_relevant)
+ {
+ swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
+ fprintf (dump_file, "%6d %6d ", i,
+ pred_entry && pred_entry->insn
+ ? INSN_UID (pred_entry->insn) : 0);
+ if (insn_entry[i].is_load)
+ fputs ("load ", dump_file);
+ if (insn_entry[i].is_store)
+ fputs ("store ", dump_file);
+ if (insn_entry[i].is_swap)
+ fputs ("swap ", dump_file);
+ if (insn_entry[i].is_live_in)
+ fputs ("live-in ", dump_file);
+ if (insn_entry[i].is_live_out)
+ fputs ("live-out ", dump_file);
+ if (insn_entry[i].contains_subreg)
+ fputs ("subreg ", dump_file);
+ if (insn_entry[i].is_128_int)
+ fputs ("int128 ", dump_file);
+ if (insn_entry[i].is_call)
+ fputs ("call ", dump_file);
+ if (insn_entry[i].is_swappable)
+ {
+ fputs ("swappable ", dump_file);
+ if (insn_entry[i].special_handling == SH_CONST_VECTOR)
+ fputs ("special:constvec ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SUBREG)
+ fputs ("special:subreg ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
+ fputs ("special:load ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
+ fputs ("special:store ", dump_file);
+ else if (insn_entry[i].special_handling == SH_EXTRACT)
+ fputs ("special:extract ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SPLAT)
+ fputs ("special:splat ", dump_file);
+ }
+ if (insn_entry[i].web_not_optimizable)
+ fputs ("unoptimizable ", dump_file);
+ if (insn_entry[i].will_delete)
+ fputs ("delete ", dump_file);
+ fputs ("\n", dump_file);
+ }
+ fputs ("\n", dump_file);
+}
+
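+/* Illustrative sketch only (not generated output): on a little-endian
+ POWER8 target, a computation such as
+
+ lxvd2x vx,mem1
+ xxswapd vx,vx
+ xvadddp vy,vx,vx
+ xxswapd vy,vy
+ stxvd2x vy,mem2
+
+ has both xxswapd instructions rendered unnecessary by this pass (each
+ is replaced by a simple copy), since xvadddp is insensitive to the
+ doubleword order within the registers. */
+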
+/* Main entry point for this pass. */
+unsigned int
+rs6000_analyze_swaps (function *fun)
+{
+ swap_web_entry *insn_entry;
+ basic_block bb;
+ rtx insn;
+
+ /* Dataflow analysis for use-def chains. */
+ df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_analyze ();
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ /* Allocate structure to represent webs of insns. */
+ insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
+
+ /* Walk the insns to gather basic data. */
+ FOR_ALL_BB_FN (bb, fun)
+ FOR_BB_INSNS (bb, insn)
+ {
+ unsigned int uid = INSN_UID (insn);
+ if (NONDEBUG_INSN_P (insn))
+ {
+ insn_entry[uid].insn = insn;
+
+ if (GET_CODE (insn) == CALL_INSN)
+ insn_entry[uid].is_call = 1;
+
+ /* Walk the uses and defs to see if we mention vector regs.
+ Record any constraints on optimization of such mentions. */
+ df_ref *use_rec;
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref mention = *use_rec;
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If a use gets its value from a call insn, it will be
+ a hard register and will look like (reg:V4SI 3 3).
+ The df analysis creates two mentions for GPR3 and GPR4,
+ both DImode. We must recognize this and treat it as a
+ vector mention to ensure the call is unioned with this
+ use. */
+ if (mode == DImode && DF_REF_INSN_INFO (mention))
+ {
+ rtx feeder = DF_REF_INSN (mention);
+ /* FIXME: It is pretty hard to get from the df mention
+ to the mode of the use in the insn. We arbitrarily
+ pick a vector mode here, even though the use might
+ be a real DImode. We can be too conservative
+ (create a web larger than necessary) because of
+ this, so consider eventually fixing this. */
+ if (GET_CODE (feeder) == CALL_INSN)
+ mode = V4SImode;
+ }
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode)
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ union_defs (insn_entry, insn, mention);
+ }
+ }
+ df_ref *def_rec;
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref mention = *def_rec;
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If we're loading up a hard vector register for a call,
+ it looks like (set (reg:V4SI 9 9) (...)). The df
+ analysis creates two mentions for GPR9 and GPR10, both
+ DImode. So relying on the mode from the mentions
+ isn't sufficient to ensure we union the call into the
+ web with the parameter setup code. */
+ if (mode == DImode && GET_CODE (insn) == SET
+ && VECTOR_MODE_P (GET_MODE (SET_DEST (insn))))
+ mode = GET_MODE (SET_DEST (insn));
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode)
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ /* REG_FUNCTION_VALUE_P is not valid for subregs. */
+ else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
+ insn_entry[uid].is_live_out = 1;
+ union_uses (insn_entry, insn, mention);
+ }
+ }
+
+ if (insn_entry[uid].is_relevant)
+ {
+ /* Determine if this is a load or store. */
+ insn_entry[uid].is_load = insn_is_load_p (insn);
+ insn_entry[uid].is_store = insn_is_store_p (insn);
+
+ /* Determine if this is a doubleword swap. If not,
+ determine whether it can legally be swapped. */
+ if (insn_is_swap_p (insn))
+ insn_entry[uid].is_swap = 1;
+ else
+ {
+ unsigned int special = SH_NONE;
+ insn_entry[uid].is_swappable
+ = insn_is_swappable_p (insn_entry, insn, &special);
+ if (special != SH_NONE && insn_entry[uid].contains_subreg)
+ insn_entry[uid].is_swappable = 0;
+ else if (special != SH_NONE)
+ insn_entry[uid].special_handling = special;
+ else if (insn_entry[uid].contains_subreg)
+ insn_entry[uid].special_handling = SH_SUBREG;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table when first built\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* Record unoptimizable webs. */
+ unsigned e = get_max_uid (), i;
+ for (i = 0; i < e; ++i)
+ {
+ if (!insn_entry[i].is_relevant)
+ continue;
+
+ swap_web_entry *root
+ = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
+ unsigned uid = INSN_UID (insn_entry[i].insn);
+
+ if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
+ || (insn_entry[i].contains_subreg
+ && insn_entry[i].special_handling != SH_SUBREG)
+ || insn_entry[i].is_128_int || insn_entry[i].is_call
+ || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
+ root->web_not_optimizable = 1;
+
+ /* If we have loads or stores that aren't permuting then the
+ optimization isn't appropriate. */
+ else if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
+ root->web_not_optimizable = 1;
+
+ /* If we have permuting loads or stores that are not accompanied
+ by a register swap, the optimization isn't appropriate. */
+ else if (insn_entry[i].is_load && insn_entry[i].is_swap)
+ {
+ df_ref *def_rec;
+
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store && insn_entry[i].is_swap)
+ {
+ df_ref *use_rec;
+
+ for (use_rec = DF_INSN_UID_USES (uid); *use_rec; use_rec++)
+ {
+ df_ref use = *use_rec;
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* For each load and store in an optimizable web (which implies
+ the loads and stores are permuting), find the associated
+ register swaps and mark them for removal. Due to various
+ optimizations we may mark the same swap more than once. Also
+ perform special handling for swappable insns that require it. */
+ for (i = 0; i < e; ++i)
+ if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && insn_entry[i].is_swap)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ mark_swaps_for_removal (insn_entry, i);
+ }
+ else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ handle_special_swappables (insn_entry, i);
+ }
+
+ /* Now delete the swaps marked for removal. */
+ for (i = 0; i < e; ++i)
+ if (insn_entry[i].will_delete)
+ replace_swap_with_copy (insn_entry, i);
+
+ /* Clean up. */
+ free (insn_entry);
+ return 0;
+}
+
+const pass_data pass_data_analyze_swaps =
+{
+ RTL_PASS, /* type */
+ "swaps", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ true, /* has_gate */
+ true, /* has_execute */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+class pass_analyze_swaps : public rtl_opt_pass
+{
+public:
+ pass_analyze_swaps(gcc::context *ctxt)
+ : rtl_opt_pass(pass_data_analyze_swaps, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate ()
+ {
+ return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
+ && rs6000_optimize_swaps);
+ }
+
+ unsigned int execute ()
+ {
+ return rs6000_analyze_swaps (cfun);
+ }
+
+}; // class pass_analyze_swaps
+
+rtl_opt_pass *
+make_pass_analyze_swaps (gcc::context *ctxt)
+{
+ return new pass_analyze_swaps (ctxt);
+}
struct gcc_target targetm = TARGET_INITIALIZER;
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 4c1a02a524a..4d0d5e73db1 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -588,3 +588,7 @@ Allow double variables in upper registers with -mcpu=power7 or -mvsx
mupper-regs-sf
Target Undocumented Mask(UPPER_REGS_SF) Var(rs6000_isa_flags)
Allow float variables in upper registers with -mcpu=power8 or -mp8-vector
+
+moptimize-swaps
+Target Undocumented Var(rs6000_optimize_swaps) Init(1) Save
+Analyze and remove doubleword swaps from VSX computations.
diff --git a/gcc/config/sparc/linux.h b/gcc/config/sparc/linux.h
index c54ba2cb51c..c40bb0b78e2 100644
--- a/gcc/config/sparc/linux.h
+++ b/gcc/config/sparc/linux.h
@@ -147,12 +147,6 @@ do { \
/* Static stack checking is supported by means of probes. */
#define STACK_CHECK_STATIC_BUILTIN 1
-/* Linux currently uses RMO in uniprocessor mode, which is equivalent to
- TMO, and TMO in multiprocessor mode. But they reserve the right to
- change their minds. */
-#undef SPARC_RELAXED_ORDERING
-#define SPARC_RELAXED_ORDERING true
-
#undef NEED_INDICATE_EXEC_STACK
#define NEED_INDICATE_EXEC_STACK 1
diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h
index f00fb42ffab..12bb3780ba7 100644
--- a/gcc/config/sparc/linux64.h
+++ b/gcc/config/sparc/linux64.h
@@ -261,12 +261,6 @@ do { \
/* Static stack checking is supported by means of probes. */
#define STACK_CHECK_STATIC_BUILTIN 1
-/* Linux currently uses RMO in uniprocessor mode, which is equivalent to
- TMO, and TMO in multiprocessor mode. But they reserve the right to
- change their minds. */
-#undef SPARC_RELAXED_ORDERING
-#define SPARC_RELAXED_ORDERING true
-
#undef NEED_INDICATE_EXEC_STACK
#define NEED_INDICATE_EXEC_STACK 1
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index d00c7b6fef5..f7fc957b48c 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -786,9 +786,6 @@ char sparc_hard_reg_printed[8];
#define TARGET_ATTRIBUTE_TABLE sparc_attribute_table
#endif
-#undef TARGET_RELAXED_ORDERING
-#define TARGET_RELAXED_ORDERING SPARC_RELAXED_ORDERING
-
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE sparc_option_override
diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h
index 79dbba22d09..87f1d82d650 100644
--- a/gcc/config/sparc/sparc.h
+++ b/gcc/config/sparc/sparc.h
@@ -106,17 +106,6 @@ extern enum cmodel sparc_cmodel;
#define SPARC_DEFAULT_CMODEL CM_32
-/* The SPARC-V9 architecture defines a relaxed memory ordering model (RMO)
- which requires the following macro to be true if enabled. Prior to V9,
- there are no instructions to even talk about memory synchronization.
- Note that the UltraSPARC III processors don't implement RMO, unlike the
- UltraSPARC II processors. Niagara, Niagara-2, and Niagara-3 do not
- implement RMO either.
-
- Default to false; for example, Solaris never enables RMO, only ever uses
- total memory ordering (TMO). */
-#define SPARC_RELAXED_ORDERING false
-
/* Do not use the .note.GNU-stack convention by default. */
#define NEED_INDICATE_EXEC_STACK 0
diff --git a/gcc/configure b/gcc/configure
index 1a58dc12abf..0db46a3c6bc 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -26199,6 +26199,41 @@ $as_echo "#define HAVE_AS_GNU_ATTRIBUTE 1" >>confdefs.h
fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for .module support" >&5
+$as_echo_n "checking assembler for .module support... " >&6; }
+if test "${gcc_cv_as_mips_dot_module+set}" = set; then :
+ $as_echo_n "(cached) " >&6
+else
+ gcc_cv_as_mips_dot_module=no
+ if test x$gcc_cv_as != x; then
+ $as_echo '.module fp=32' > conftest.s
+ if { ac_try='$gcc_cv_as $gcc_cv_as_flags -o conftest.o conftest.s >&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }
+ then
+ gcc_cv_as_mips_dot_module=yes
+ else
+ echo "configure: failed program was" >&5
+ cat conftest.s >&5
+ fi
+ rm -f conftest.o conftest.s
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_mips_dot_module" >&5
+$as_echo "$gcc_cv_as_mips_dot_module" >&6; }
+if test $gcc_cv_as_mips_dot_module = yes; then
+
+$as_echo "#define HAVE_AS_DOT_MODULE 1" >>confdefs.h
+
+fi
+ if test x$gcc_cv_as_mips_dot_module = xno \
+ && test x$with_fp_32 != x; then
+ as_fn_error "Requesting --with-fp-32= requires assembler support for .module." "$LINENO" 5
+ fi
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for .micromips support" >&5
$as_echo_n "checking assembler for .micromips support... " >&6; }
if test "${gcc_cv_as_micromips_support+set}" = set; then :
diff --git a/gcc/configure.ac b/gcc/configure.ac
index b25775baaca..6893dfc5b50 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -4272,6 +4272,17 @@ LCF0:
[AC_DEFINE(HAVE_AS_GNU_ATTRIBUTE, 1,
[Define if your assembler supports .gnu_attribute.])])
+ gcc_GAS_CHECK_FEATURE([.module support],
+ gcc_cv_as_mips_dot_module,,,
+ [.module fp=32],,
+ [AC_DEFINE(HAVE_AS_DOT_MODULE, 1,
+ [Define if your assembler supports .module.])])
+ if test x$gcc_cv_as_mips_dot_module = xno \
+ && test x$with_fp_32 != x; then
+ AC_MSG_ERROR(
+ [Requesting --with-fp-32= requires assembler support for .module.])
+ fi
+
gcc_GAS_CHECK_FEATURE([.micromips support],
gcc_cv_as_micromips_support,,[--fatal-warnings],
[.set micromips],,
diff --git a/gcc/cp/cp-lang.c b/gcc/cp/cp-lang.c
index 22f8e4bbb0e..bd2c2b7bd7a 100644
--- a/gcc/cp/cp-lang.c
+++ b/gcc/cp/cp-lang.c
@@ -109,8 +109,8 @@ static tree get_template_argument_pack_elems_folded (const_tree);
#define LANG_HOOKS_COPY_LANG_TYPE cp_lipo_copy_lang_type
#undef LANG_HOOKS_PROCESS_PENDING_DECLS
#define LANG_HOOKS_PROCESS_PENDING_DECLS cp_process_pending_declarations
-#undef LANG_HOOKS_CLEAR_DEFFERED_FNS
-#define LANG_HOOKS_CLEAR_DEFFERED_FNS cp_clear_deferred_fns
+#undef LANG_HOOKS_RESET_PARSING_STATE
+#define LANG_HOOKS_RESET_PARSING_STATE cp_reset_parsing_state
#undef LANG_HOOKS_IS_GENERATED_TYPE
#define LANG_HOOKS_IS_GENERATED_TYPE cp_is_compiler_generated_type
#undef LANG_HOOKS_CMP_LANG_TYPE
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 725eddc3fe9..793c8481dca 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -4385,6 +4385,7 @@ extern int cp_unevaluated_operand;
extern tree cp_convert_range_for (tree, tree, tree, bool);
extern bool parsing_nsdmi (void);
extern void inject_this_parameter (tree, cp_cv_quals);
+extern void clear_lambda_scope (void);
/* in pt.c */
@@ -5342,7 +5343,7 @@ extern void cplus_decl_attributes (tree *, tree, int);
extern void finish_anon_union (tree);
extern void cp_write_global_declarations (void);
extern void cp_process_pending_declarations (location_t);
-extern void cp_clear_deferred_fns (void);
+extern void cp_reset_parsing_state (void);
extern void cp_clear_constexpr_hashtable (void);
extern void cp_clear_conv_type_map (void);
extern tree coerce_new_type (tree);
@@ -5365,7 +5366,7 @@ extern bool mark_used (tree, tsubst_flags_t);
extern void finish_static_data_member_decl (tree, tree, bool, tree, int);
extern tree cp_build_parm_decl (tree, tree);
extern tree get_guard (tree);
-extern tree get_guard_cond (tree);
+extern tree get_guard_cond (tree, bool);
extern tree set_guard (tree);
extern tree get_tls_wrapper_fn (tree);
extern void mark_needed (tree);
diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 30e3afe81f4..50faaebc65f 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -7035,7 +7035,7 @@ expand_static_init (tree decl, tree init)
looks like:
static <type> guard;
- if (!guard.first_byte) {
+ if (!__atomic_load (guard.first_byte)) {
if (__cxa_guard_acquire (&guard)) {
bool flag = false;
try {
@@ -7065,16 +7065,11 @@ expand_static_init (tree decl, tree init)
/* Create the guard variable. */
guard = get_guard (decl);
- /* This optimization isn't safe on targets with relaxed memory
- consistency. On such targets we force synchronization in
- __cxa_guard_acquire. */
- if (!targetm.relaxed_ordering || !thread_guard)
- {
- /* Begin the conditional initialization. */
- if_stmt = begin_if_stmt ();
- finish_if_stmt_cond (get_guard_cond (guard), if_stmt);
- then_clause = begin_compound_stmt (BCS_NO_SCOPE);
- }
+ /* Begin the conditional initialization. */
+ if_stmt = begin_if_stmt ();
+
+ finish_if_stmt_cond (get_guard_cond (guard, thread_guard), if_stmt);
+ then_clause = begin_compound_stmt (BCS_NO_SCOPE);
if (thread_guard)
{
@@ -7143,12 +7138,9 @@ expand_static_init (tree decl, tree init)
finish_if_stmt (inner_if_stmt);
}
- if (!targetm.relaxed_ordering || !thread_guard)
- {
- finish_compound_stmt (then_clause);
- finish_then_clause (if_stmt);
- finish_if_stmt (if_stmt);
- }
+ finish_compound_stmt (then_clause);
+ finish_then_clause (if_stmt);
+ finish_if_stmt (if_stmt);
}
else if (DECL_THREAD_LOCAL_P (decl))
tls_aggregates = tree_cons (init, decl, tls_aggregates);
diff --git a/gcc/cp/decl2.c b/gcc/cp/decl2.c
index 651320affaa..ff0d3044688 100644
--- a/gcc/cp/decl2.c
+++ b/gcc/cp/decl2.c
@@ -2916,6 +2916,27 @@ get_guard (tree decl)
return guard;
}
+/* Return an atomic load of src with the appropriate memory model. */
+
+static tree
+build_atomic_load_byte (tree src, HOST_WIDE_INT model)
+{
+ tree ptr_type = build_pointer_type (char_type_node);
+ tree mem_model = build_int_cst (integer_type_node, model);
+ tree t, addr, val;
+ unsigned int size;
+ int fncode;
+
+ size = tree_to_uhwi (TYPE_SIZE_UNIT (char_type_node));
+
+ fncode = BUILT_IN_ATOMIC_LOAD_N + exact_log2 (size) + 1;
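+ /* SIZE is 1, so this selects BUILT_IN_ATOMIC_LOAD_1, i.e. the
+ __atomic_load_1 builtin. */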
+ t = builtin_decl_implicit ((enum built_in_function) fncode);
+
+ addr = build1 (ADDR_EXPR, ptr_type, src);
+ val = build_call_expr (t, 2, addr, mem_model);
+ return val;
+}
+
/* Return those bits of the GUARD variable that should be set when the
guarded entity is actually initialized. */
@@ -2942,12 +2963,14 @@ get_guard_bits (tree guard)
variable has already been initialized. */
tree
-get_guard_cond (tree guard)
+get_guard_cond (tree guard, bool thread_safe)
{
tree guard_value;
- /* Check to see if the GUARD is zero. */
- guard = get_guard_bits (guard);
+ if (!thread_safe)
+ guard = get_guard_bits (guard);
+ else
+ guard = build_atomic_load_byte (guard, MEMMODEL_ACQUIRE);
/* Mask off all but the low bit. */
if (targetm.cxx.guard_mask_bit ())
@@ -3033,9 +3056,15 @@ get_local_tls_init_fn (void)
void_list_node));
SET_DECL_LANGUAGE (fn, lang_c);
TREE_PUBLIC (fn) = false;
+ TREE_STATIC (fn) = true;
DECL_ARTIFICIAL (fn) = true;
mark_used (fn);
SET_IDENTIFIER_GLOBAL_VALUE (sname, fn);
+ /* In LIPO mode make sure we record the new global value so that it
+ is cleared before parsing the next aux module. */
+ if (L_IPO_COMP_MODE && !is_parsing_done_p ())
+ add_decl_to_current_module_scope (fn,
+ NAMESPACE_LEVEL (global_namespace));
}
return fn;
}
@@ -3100,6 +3129,11 @@ get_tls_init_fn (tree var)
DECL_BEFRIENDING_CLASSES (fn) = var;
SET_IDENTIFIER_GLOBAL_VALUE (sname, fn);
+ /* In LIPO mode make sure we record the new global value so that it
+ is cleared before parsing the next aux module. */
+ if (L_IPO_COMP_MODE && !is_parsing_done_p ())
+ add_decl_to_current_module_scope (fn,
+ NAMESPACE_LEVEL (global_namespace));
}
return fn;
}
@@ -3157,6 +3191,11 @@ get_tls_wrapper_fn (tree var)
DECL_BEFRIENDING_CLASSES (fn) = var;
SET_IDENTIFIER_GLOBAL_VALUE (sname, fn);
+ /* In LIPO mode make sure we record the new global value so that it
+ is cleared before parsing the next aux module. */
+ if (L_IPO_COMP_MODE && !is_parsing_done_p ())
+ add_decl_to_current_module_scope (fn,
+ NAMESPACE_LEVEL (global_namespace));
}
return fn;
}
@@ -3562,7 +3601,7 @@ one_static_initialization_or_destruction (tree decl, tree init, bool initp)
/* When using __cxa_atexit, we never try to destroy
anything from a static destructor. */
gcc_assert (initp);
- guard_cond = get_guard_cond (guard);
+ guard_cond = get_guard_cond (guard, false);
}
/* If we don't have __cxa_atexit, then we will be running
destructors from .fini sections, or their equivalents. So,
@@ -4139,10 +4178,10 @@ no_linkage_error (tree decl)
"to declare function %q#D with linkage", t, decl);
}
-/* Clear the list of deferred functions. */
+/* Reset the parsing state for the next module. */
void
-cp_clear_deferred_fns (void)
+cp_reset_parsing_state (void)
{
vec_free (deferred_fns);
deferred_fns = NULL;
@@ -4153,6 +4192,7 @@ cp_clear_deferred_fns (void)
clear_pending_templates ();
reset_anon_name ();
reset_temp_count ();
+ clear_lambda_scope ();
}
/* Collect declarations from all namespaces relevant to SOURCE_FILE. */
@@ -4213,8 +4253,12 @@ handle_tls_init (void)
one_static_initialization_or_destruction (var, init, true);
#ifdef ASM_OUTPUT_DEF
- /* Output init aliases even with -fno-extern-tls-init. */
- if (TREE_PUBLIC (var))
+ /* Output init aliases even with -fno-extern-tls-init. Don't emit
+ aliases in LIPO aux modules, since the corresponding __tls_init
+ will be static promoted and deleted, so the variable's tls init
+ function will be resolved by its own primary module. An alias
+ would prevent the promoted aux __tls_init from being deleted. */
+ if (TREE_PUBLIC (var) && !L_IPO_IS_AUXILIARY_MODULE)
{
tree single_init_fn = get_tls_init_fn (var);
if (single_init_fn == NULL_TREE)
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 29b590ea45c..aadb36e8a61 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -8705,6 +8705,16 @@ finish_lambda_scope (void)
lambda_scope_stack->pop ();
}
+void
+clear_lambda_scope (void)
+{
+ if (!lambda_scope_stack)
+ return;
+ gcc_assert (lambda_scope_stack->is_empty ());
+ lambda_scope = NULL_TREE;
+ lambda_count = 0;
+}
+
/* Parse a lambda expression.
lambda-expression:
diff --git a/gcc/df.h b/gcc/df.h
index 2de800c0938..3020955c198 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -1134,21 +1134,23 @@ df_get_artificial_uses (unsigned int bb_index)
/* web */
-/* This entry is allocated for each reference in the insn stream. */
-struct web_entry
+class web_entry_base
{
- /* Pointer to the parent in the union/find tree. */
- struct web_entry *pred;
- /* Newly assigned register to the entry. Set only for roots. */
- rtx reg;
- void* extra_info;
-};
+ private:
+ /* Reference to the parent in the union/find tree. */
+ web_entry_base *pred_pvt;
+
+ public:
+ /* Accessors. */
+ web_entry_base *pred () { return pred_pvt; }
+ void set_pred (web_entry_base *p) { pred_pvt = p; }
-extern struct web_entry *unionfind_root (struct web_entry *);
-extern bool unionfind_union (struct web_entry *, struct web_entry *);
-extern void union_defs (df_ref, struct web_entry *,
- unsigned int *used, struct web_entry *,
- bool (*fun) (struct web_entry *, struct web_entry *));
+ /* Find representative in union-find tree. */
+ web_entry_base *unionfind_root ();
+
+ /* Union with another set, returning TRUE if they are already unioned. */
+ friend bool unionfind_union (web_entry_base *first, web_entry_base *second);
+};
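+
+/* A sketch of intended use (see swap_web_entry in config/rs6000/rs6000.c
+ for a real client): derive a per-insn entry type from web_entry_base,
+ union related entries, and query the representative, e.g.
+
+ class my_entry : public web_entry_base { ... per-web flags ... };
+ my_entry *root = (my_entry *) entry->unionfind_root (); */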
extern bool df_check_ud_du_memory_usage (void);
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index c42add7f16f..4c0914a3552 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -3527,6 +3527,35 @@ function should not save and restore registers R0..R7. This can be used on SH3*
and SH4* targets that have a second R0..R7 register bank for non-reentrant
interrupt handlers.
+@item noplt
+@cindex @code{noplt} function attribute
+The @code{noplt} attribute is the counterpart to option @option{-fno-plt}:
+calls to functions marked with this attribute do not use the PLT in
+position-independent code.
+
+@smallexample
+@group
+/* Externally defined function foo. */
+int foo () __attribute__ ((noplt));
+
+int
+main (/* @r{@dots{}} */)
+@{
+ /* @r{@dots{}} */
+ foo ();
+ /* @r{@dots{}} */
+@}
+@end group
+@end smallexample
+
+The @code{noplt} attribute on function @code{foo} tells the compiler to
+assume that @code{foo} is externally defined and that calls to @code{foo}
+must avoid the PLT in position-independent code.
+
+Additionally, a few targets also convert calls to functions marked with this
+attribute to go through the GOT instead of the PLT in
+non-position-independent code.
+
@item optimize
@cindex @code{optimize} function attribute
The @code{optimize} attribute is used to specify that a function is to
@@ -9124,6 +9153,7 @@ instructions, but allow the compiler to schedule those calls.
* MIPS DSP Built-in Functions::
* MIPS Paired-Single Support::
* MIPS Loongson Built-in Functions::
+* MIPS SIMD Architecture Functions::
* Other MIPS Built-in Functions::
* MSP430 Built-in Functions::
* NDS32 Built-in Functions::
@@ -12258,6 +12288,8 @@ value is the upper one. The opposite order applies to big-endian targets.
For example, the code above sets the lower half of @code{a} to
@code{1.5} on little-endian targets and @code{9.1} on big-endian targets.
+
+
@node MIPS Loongson Built-in Functions
@subsection MIPS Loongson Built-in Functions
@@ -12384,6 +12416,786 @@ int16x4_t punpcklhw_s (int16x4_t s, int16x4_t t);
int8x8_t punpcklbh_s (int8x8_t s, int8x8_t t);
@end smallexample
+@node MIPS SIMD Architecture Functions
+@subsection MIPS SIMD Architecture Functions
+
+GCC provides intrinsics to access the SIMD instructions provided by the
+MSA MIPS SIMD Architecture. The interface is made available by including
+@code{<msa.h>} and using @option{-mmsa -mhard-float -mfp64 -mnan=2008}.
+The intrinsics operate on the following vector types:
+
+@itemize
+@item @code{v16i8}, a vector of sixteen signed 8-bit integers;
+@item @code{v16u8}, a vector of sixteen unsigned 8-bit integers;
+@item @code{v8i16}, a vector of eight signed 16-bit integers;
+@item @code{v8u16}, a vector of eight unsigned 16-bit integers;
+@item @code{v4i32}, a vector of four signed 32-bit integers;
+@item @code{v4u32}, a vector of four unsigned 32-bit integers;
+@item @code{v2i64}, a vector of two signed 64-bit integers;
+@item @code{v2u64}, a vector of two unsigned 64-bit integers;
+@item @code{v4f32}, a vector of four 32-bit floats;
+@item @code{v2f64}, a vector of two 64-bit doubles.
+@end itemize
+
+Operands that must be integer literals are described with the following
+types, whose names encode the accepted range:
+
+@itemize
+@item @code{imm0_1}, an integer literal in range 0 to 1;
+@item @code{imm0_3}, an integer literal in range 0 to 3;
+@item @code{imm0_7}, an integer literal in range 0 to 7;
+@item @code{imm0_15}, an integer literal in range 0 to 15;
+@item @code{imm0_31}, an integer literal in range 0 to 31;
+@item @code{imm0_63}, an integer literal in range 0 to 63;
+@item @code{imm0_255}, an integer literal in range 0 to 255;
+@item @code{imm_n16_15}, an integer literal in range -16 to 15;
+@item @code{imm_n512_511}, an integer literal in range -512 to 511;
+@item @code{imm_n1024_1022}, an integer literal in range -512 to 511 left shifted by 1 bit, i.e., -1024, -1022, @dots{}, 1020, 1022;
+@item @code{imm_n2048_2044}, an integer literal in range -512 to 511 left shifted by 2 bits, i.e., -2048, -2044, @dots{}, 2040, 2044;
+@item @code{imm_n4096_4088}, an integer literal in range -512 to 511 left shifted by 3 bits, i.e., -4096, -4088, @dots{}, 4080, 4088;
+@item @code{imm1_4}, an integer literal in range 1 to 4.
+@end itemize
+
+The scalar types used in the prototypes correspond to the following
+definitions:
+
+@smallexample
+@{
+typedef int i32;
+#if __LONG_MAX__ == __LONG_LONG_MAX__
+typedef long i64;
+#else
+typedef long long i64;
+#endif
+
+typedef unsigned int u32;
+#if __LONG_MAX__ == __LONG_LONG_MAX__
+typedef unsigned long u64;
+#else
+typedef unsigned long long u64;
+#endif
+
+typedef double f64;
+typedef float f32;
+@}
+@end smallexample
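+
+As an illustrative sketch (the function name @code{add4} is arbitrary and
+an MSA-enabled toolchain is assumed), a vector addition using one of the
+intrinsics listed below looks like:
+
+@smallexample
+#include <msa.h>
+
+/* Add two vectors of four 32-bit integers element by element. */
+v4i32
+add4 (v4i32 a, v4i32 b)
+@{
+  return __builtin_msa_addv_w (a, b);
+@}
+@end smallexample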
+
+The intrinsics provided are listed below; each is named after the
+machine instruction.
+
+@smallexample
+v16i8 __builtin_msa_add_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_add_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_add_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_add_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_adds_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_adds_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_adds_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_adds_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_adds_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_adds_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_adds_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_adds_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_adds_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_adds_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_adds_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_adds_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_addv_b (v16i8, v16i8);
+v8i16 __builtin_msa_addv_h (v8i16, v8i16);
+v4i32 __builtin_msa_addv_w (v4i32, v4i32);
+v2i64 __builtin_msa_addv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_addvi_b (v16i8, imm0_31);
+v8i16 __builtin_msa_addvi_h (v8i16, imm0_31);
+v4i32 __builtin_msa_addvi_w (v4i32, imm0_31);
+v2i64 __builtin_msa_addvi_d (v2i64, imm0_31);
+
+v16u8 __builtin_msa_and_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_andi_b (v16u8, imm0_255);
+
+v16i8 __builtin_msa_asub_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_asub_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_asub_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_asub_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_asub_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_asub_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_asub_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_asub_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_ave_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_ave_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_ave_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_ave_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_ave_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_ave_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_ave_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_ave_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_aver_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_aver_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_aver_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_aver_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_aver_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_aver_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_aver_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_aver_u_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bclr_b (v16u8, v16u8);
+v8u16 __builtin_msa_bclr_h (v8u16, v8u16);
+v4u32 __builtin_msa_bclr_w (v4u32, v4u32);
+v2u64 __builtin_msa_bclr_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bclri_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bclri_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bclri_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bclri_d (v2u64, imm0_63);
+
+v16u8 __builtin_msa_binsl_b (v16u8, v16u8, v16u8);
+v8u16 __builtin_msa_binsl_h (v8u16, v8u16, v8u16);
+v4u32 __builtin_msa_binsl_w (v4u32, v4u32, v4u32);
+v2u64 __builtin_msa_binsl_d (v2u64, v2u64, v2u64);
+
+v16u8 __builtin_msa_binsli_b (v16u8, v16u8, imm0_7);
+v8u16 __builtin_msa_binsli_h (v8u16, v8u16, imm0_15);
+v4u32 __builtin_msa_binsli_w (v4u32, v4u32, imm0_31);
+v2u64 __builtin_msa_binsli_d (v2u64, v2u64, imm0_63);
+
+v16u8 __builtin_msa_binsr_b (v16u8, v16u8, v16u8);
+v8u16 __builtin_msa_binsr_h (v8u16, v8u16, v8u16);
+v4u32 __builtin_msa_binsr_w (v4u32, v4u32, v4u32);
+v2u64 __builtin_msa_binsr_d (v2u64, v2u64, v2u64);
+
+v16u8 __builtin_msa_binsri_b (v16u8, v16u8, imm0_7);
+v8u16 __builtin_msa_binsri_h (v8u16, v8u16, imm0_15);
+v4u32 __builtin_msa_binsri_w (v4u32, v4u32, imm0_31);
+v2u64 __builtin_msa_binsri_d (v2u64, v2u64, imm0_63);
+
+v16u8 __builtin_msa_bmnz_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bmnzi_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bmz_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bmzi_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bneg_b (v16u8, v16u8);
+v8u16 __builtin_msa_bneg_h (v8u16, v8u16);
+v4u32 __builtin_msa_bneg_w (v4u32, v4u32);
+v2u64 __builtin_msa_bneg_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bnegi_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bnegi_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bnegi_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bnegi_d (v2u64, imm0_63);
+
+i32 __builtin_msa_bnz_b (v16u8);
+i32 __builtin_msa_bnz_h (v8u16);
+i32 __builtin_msa_bnz_w (v4u32);
+i32 __builtin_msa_bnz_d (v2u64);
+
+i32 __builtin_msa_bnz_v (v16u8);
+
+v16u8 __builtin_msa_bsel_v (v16u8, v16u8, v16u8);
+
+v16u8 __builtin_msa_bseli_b (v16u8, v16u8, imm0_255);
+
+v16u8 __builtin_msa_bset_b (v16u8, v16u8);
+v8u16 __builtin_msa_bset_h (v8u16, v8u16);
+v4u32 __builtin_msa_bset_w (v4u32, v4u32);
+v2u64 __builtin_msa_bset_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_bseti_b (v16u8, imm0_7);
+v8u16 __builtin_msa_bseti_h (v8u16, imm0_15);
+v4u32 __builtin_msa_bseti_w (v4u32, imm0_31);
+v2u64 __builtin_msa_bseti_d (v2u64, imm0_63);
+
+i32 __builtin_msa_bz_b (v16u8);
+i32 __builtin_msa_bz_h (v8u16);
+i32 __builtin_msa_bz_w (v4u32);
+i32 __builtin_msa_bz_d (v2u64);
+
+i32 __builtin_msa_bz_v (v16u8);
+
+v16i8 __builtin_msa_ceq_b (v16i8, v16i8);
+v8i16 __builtin_msa_ceq_h (v8i16, v8i16);
+v4i32 __builtin_msa_ceq_w (v4i32, v4i32);
+v2i64 __builtin_msa_ceq_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ceqi_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_ceqi_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_ceqi_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_ceqi_d (v2i64, imm_n16_15);
+
+i32 __builtin_msa_cfcmsa (imm0_31);
+
+v16i8 __builtin_msa_cle_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_cle_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_cle_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_cle_s_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_cle_u_b (v16u8, v16u8);
+v8i16 __builtin_msa_cle_u_h (v8u16, v8u16);
+v4i32 __builtin_msa_cle_u_w (v4u32, v4u32);
+v2i64 __builtin_msa_cle_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_clei_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_clei_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_clei_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_clei_s_d (v2i64, imm_n16_15);
+
+v16i8 __builtin_msa_clei_u_b (v16u8, imm0_31);
+v8i16 __builtin_msa_clei_u_h (v8u16, imm0_31);
+v4i32 __builtin_msa_clei_u_w (v4u32, imm0_31);
+v2i64 __builtin_msa_clei_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_clt_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_clt_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_clt_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_clt_s_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_clt_u_b (v16u8, v16u8);
+v8i16 __builtin_msa_clt_u_h (v8u16, v8u16);
+v4i32 __builtin_msa_clt_u_w (v4u32, v4u32);
+v2i64 __builtin_msa_clt_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_clti_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_clti_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_clti_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_clti_s_d (v2i64, imm_n16_15);
+
+v16i8 __builtin_msa_clti_u_b (v16u8, imm0_31);
+v8i16 __builtin_msa_clti_u_h (v8u16, imm0_31);
+v4i32 __builtin_msa_clti_u_w (v4u32, imm0_31);
+v2i64 __builtin_msa_clti_u_d (v2u64, imm0_31);
+
+i32 __builtin_msa_copy_s_b (v16i8, imm0_15);
+i32 __builtin_msa_copy_s_h (v8i16, imm0_7);
+i32 __builtin_msa_copy_s_w (v4i32, imm0_3);
+i64 __builtin_msa_copy_s_d (v2i64, imm0_1);
+
+u32 __builtin_msa_copy_u_b (v16i8, imm0_15);
+u32 __builtin_msa_copy_u_h (v8i16, imm0_7);
+u32 __builtin_msa_copy_u_w (v4i32, imm0_3);
+u64 __builtin_msa_copy_u_d (v2i64, imm0_1);
+
+void __builtin_msa_ctcmsa (imm0_31, i32);
+
+v16i8 __builtin_msa_div_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_div_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_div_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_div_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_div_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_div_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_div_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_div_u_d (v2u64, v2u64);
+
+v8i16 __builtin_msa_dotp_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_dotp_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_dotp_s_d (v4i32, v4i32);
+
+v8u16 __builtin_msa_dotp_u_h (v16u8, v16u8);
+v4u32 __builtin_msa_dotp_u_w (v8u16, v8u16);
+v2u64 __builtin_msa_dotp_u_d (v4u32, v4u32);
+
+v8i16 __builtin_msa_dpadd_s_h (v8i16, v16i8, v16i8);
+v4i32 __builtin_msa_dpadd_s_w (v4i32, v8i16, v8i16);
+v2i64 __builtin_msa_dpadd_s_d (v2i64, v4i32, v4i32);
+
+v8u16 __builtin_msa_dpadd_u_h (v8u16, v16u8, v16u8);
+v4u32 __builtin_msa_dpadd_u_w (v4u32, v8u16, v8u16);
+v2u64 __builtin_msa_dpadd_u_d (v2u64, v4u32, v4u32);
+
+v8i16 __builtin_msa_dpsub_s_h (v8i16, v16i8, v16i8);
+v4i32 __builtin_msa_dpsub_s_w (v4i32, v8i16, v8i16);
+v2i64 __builtin_msa_dpsub_s_d (v2i64, v4i32, v4i32);
+
+v8i16 __builtin_msa_dpsub_u_h (v8i16, v16u8, v16u8);
+v4i32 __builtin_msa_dpsub_u_w (v4i32, v8u16, v8u16);
+v2i64 __builtin_msa_dpsub_u_d (v2i64, v4u32, v4u32);
+
+v4f32 __builtin_msa_fadd_w (v4f32, v4f32);
+v2f64 __builtin_msa_fadd_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcaf_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcaf_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fceq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fceq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fclass_w (v4f32);
+v2i64 __builtin_msa_fclass_d (v2f64);
+
+v4i32 __builtin_msa_fcle_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcle_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fclt_w (v4f32, v4f32);
+v2i64 __builtin_msa_fclt_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcne_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcne_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcor_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcor_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcueq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcueq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcule_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcule_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcult_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcult_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcun_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcun_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fcune_w (v4f32, v4f32);
+v2i64 __builtin_msa_fcune_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fdiv_w (v4f32, v4f32);
+v2f64 __builtin_msa_fdiv_d (v2f64, v2f64);
+
+v8i16 __builtin_msa_fexdo_h (v4f32, v4f32);
+v4f32 __builtin_msa_fexdo_w (v2f64, v2f64);
+
+v4f32 __builtin_msa_fexp2_w (v4f32, v4i32);
+v2f64 __builtin_msa_fexp2_d (v2f64, v2i64);
+
+v4f32 __builtin_msa_fexupl_w (v8i16);
+v2f64 __builtin_msa_fexupl_d (v4f32);
+
+v4f32 __builtin_msa_fexupr_w (v8i16);
+v2f64 __builtin_msa_fexupr_d (v4f32);
+
+v4f32 __builtin_msa_ffint_s_w (v4i32);
+v2f64 __builtin_msa_ffint_s_d (v2i64);
+
+v4f32 __builtin_msa_ffint_u_w (v4u32);
+v2f64 __builtin_msa_ffint_u_d (v2u64);
+
+v4f32 __builtin_msa_ffql_w (v8i16);
+v2f64 __builtin_msa_ffql_d (v4i32);
+
+v4f32 __builtin_msa_ffqr_w (v8i16);
+v2f64 __builtin_msa_ffqr_d (v4i32);
+
+v16i8 __builtin_msa_fill_b (i32);
+v8i16 __builtin_msa_fill_h (i32);
+v4i32 __builtin_msa_fill_w (i32);
+v2i64 __builtin_msa_fill_d (i64);
+
+v4f32 __builtin_msa_flog2_w (v4f32);
+v2f64 __builtin_msa_flog2_d (v2f64);
+
+v4f32 __builtin_msa_fmadd_w (v4f32, v4f32, v4f32);
+v2f64 __builtin_msa_fmadd_d (v2f64, v2f64, v2f64);
+
+v4f32 __builtin_msa_fmax_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmax_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmax_a_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmax_a_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmin_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmin_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmin_a_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmin_a_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fmsub_w (v4f32, v4f32, v4f32);
+v2f64 __builtin_msa_fmsub_d (v2f64, v2f64, v2f64);
+
+v4f32 __builtin_msa_fmul_w (v4f32, v4f32);
+v2f64 __builtin_msa_fmul_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_frint_w (v4f32);
+v2f64 __builtin_msa_frint_d (v2f64);
+
+v4f32 __builtin_msa_frcp_w (v4f32);
+v2f64 __builtin_msa_frcp_d (v2f64);
+
+v4f32 __builtin_msa_frsqrt_w (v4f32);
+v2f64 __builtin_msa_frsqrt_d (v2f64);
+
+v4i32 __builtin_msa_fsaf_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsaf_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fseq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fseq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsle_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsle_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fslt_w (v4f32, v4f32);
+v2i64 __builtin_msa_fslt_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsne_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsne_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsor_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsor_d (v2f64, v2f64);
+
+v4f32 __builtin_msa_fsqrt_w (v4f32);
+v2f64 __builtin_msa_fsqrt_d (v2f64);
+
+v4f32 __builtin_msa_fsub_w (v4f32, v4f32);
+v2f64 __builtin_msa_fsub_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsueq_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsueq_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsule_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsule_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsult_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsult_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsun_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsun_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_fsune_w (v4f32, v4f32);
+v2i64 __builtin_msa_fsune_d (v2f64, v2f64);
+
+v4i32 __builtin_msa_ftint_s_w (v4f32);
+v2i64 __builtin_msa_ftint_s_d (v2f64);
+
+v4u32 __builtin_msa_ftint_u_w (v4f32);
+v2u64 __builtin_msa_ftint_u_d (v2f64);
+
+v8i16 __builtin_msa_ftq_h (v4f32, v4f32);
+v4i32 __builtin_msa_ftq_w (v2f64, v2f64);
+
+v4i32 __builtin_msa_ftrunc_s_w (v4f32);
+v2i64 __builtin_msa_ftrunc_s_d (v2f64);
+
+v4u32 __builtin_msa_ftrunc_u_w (v4f32);
+v2u64 __builtin_msa_ftrunc_u_d (v2f64);
+
+v8i16 __builtin_msa_hadd_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_hadd_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_hadd_s_d (v4i32, v4i32);
+
+v8u16 __builtin_msa_hadd_u_h (v16u8, v16u8);
+v4u32 __builtin_msa_hadd_u_w (v8u16, v8u16);
+v2u64 __builtin_msa_hadd_u_d (v4u32, v4u32);
+
+v8i16 __builtin_msa_hsub_s_h (v16i8, v16i8);
+v4i32 __builtin_msa_hsub_s_w (v8i16, v8i16);
+v2i64 __builtin_msa_hsub_s_d (v4i32, v4i32);
+
+v8i16 __builtin_msa_hsub_u_h (v16u8, v16u8);
+v4i32 __builtin_msa_hsub_u_w (v8u16, v8u16);
+v2i64 __builtin_msa_hsub_u_d (v4u32, v4u32);
+
+v16i8 __builtin_msa_ilvev_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvev_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvev_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvev_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvl_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvl_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvl_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvl_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvod_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvod_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvod_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvod_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_ilvr_b (v16i8, v16i8);
+v8i16 __builtin_msa_ilvr_h (v8i16, v8i16);
+v4i32 __builtin_msa_ilvr_w (v4i32, v4i32);
+v2i64 __builtin_msa_ilvr_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_insert_b (v16i8, imm0_15, i32);
+v8i16 __builtin_msa_insert_h (v8i16, imm0_7, i32);
+v4i32 __builtin_msa_insert_w (v4i32, imm0_3, i32);
+v2i64 __builtin_msa_insert_d (v2i64, imm0_1, i64);
+
+v16i8 __builtin_msa_insve_b (v16i8, imm0_15, v16i8);
+v8i16 __builtin_msa_insve_h (v8i16, imm0_7, v8i16);
+v4i32 __builtin_msa_insve_w (v4i32, imm0_3, v4i32);
+v2i64 __builtin_msa_insve_d (v2i64, imm0_1, v2i64);
+
+v16i8 __builtin_msa_ld_b (void *, imm_n512_511);
+v8i16 __builtin_msa_ld_h (void *, imm_n1024_1022);
+v4i32 __builtin_msa_ld_w (void *, imm_n2048_2044);
+v2i64 __builtin_msa_ld_d (void *, imm_n4096_4088);
+
+v16i8 __builtin_msa_ldi_b (imm_n512_511);
+v8i16 __builtin_msa_ldi_h (imm_n512_511);
+v4i32 __builtin_msa_ldi_w (imm_n512_511);
+v2i64 __builtin_msa_ldi_d (imm_n512_511);
+
+v8i16 __builtin_msa_madd_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_madd_q_w (v4i32, v4i32, v4i32);
+
+v8i16 __builtin_msa_maddr_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_maddr_q_w (v4i32, v4i32, v4i32);
+
+v16i8 __builtin_msa_maddv_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_maddv_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_maddv_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_maddv_d (v2i64, v2i64, v2i64);
+
+v16i8 __builtin_msa_max_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_max_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_max_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_max_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_max_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_max_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_max_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_max_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_max_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_max_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_max_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_max_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_maxi_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_maxi_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_maxi_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_maxi_s_d (v2i64, imm_n16_15);
+
+v16u8 __builtin_msa_maxi_u_b (v16u8, imm0_31);
+v8u16 __builtin_msa_maxi_u_h (v8u16, imm0_31);
+v4u32 __builtin_msa_maxi_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_maxi_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_min_a_b (v16i8, v16i8);
+v8i16 __builtin_msa_min_a_h (v8i16, v8i16);
+v4i32 __builtin_msa_min_a_w (v4i32, v4i32);
+v2i64 __builtin_msa_min_a_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_min_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_min_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_min_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_min_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_min_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_min_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_min_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_min_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_mini_s_b (v16i8, imm_n16_15);
+v8i16 __builtin_msa_mini_s_h (v8i16, imm_n16_15);
+v4i32 __builtin_msa_mini_s_w (v4i32, imm_n16_15);
+v2i64 __builtin_msa_mini_s_d (v2i64, imm_n16_15);
+
+v16u8 __builtin_msa_mini_u_b (v16u8, imm0_31);
+v8u16 __builtin_msa_mini_u_h (v8u16, imm0_31);
+v4u32 __builtin_msa_mini_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_mini_u_d (v2u64, imm0_31);
+
+v16i8 __builtin_msa_mod_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_mod_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_mod_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_mod_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_mod_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_mod_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_mod_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_mod_u_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_move_v_b (v16i8);
+
+v8i16 __builtin_msa_msub_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msub_q_w (v4i32, v4i32, v4i32);
+
+v8i16 __builtin_msa_msubr_q_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msubr_q_w (v4i32, v4i32, v4i32);
+
+v16i8 __builtin_msa_msubv_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_msubv_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_msubv_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_msubv_d (v2i64, v2i64, v2i64);
+
+v8i16 __builtin_msa_mul_q_h (v8i16, v8i16);
+v4i32 __builtin_msa_mul_q_w (v4i32, v4i32);
+
+v8i16 __builtin_msa_mulr_q_h (v8i16, v8i16);
+v4i32 __builtin_msa_mulr_q_w (v4i32, v4i32);
+
+v16i8 __builtin_msa_mulv_b (v16i8, v16i8);
+v8i16 __builtin_msa_mulv_h (v8i16, v8i16);
+v4i32 __builtin_msa_mulv_w (v4i32, v4i32);
+v2i64 __builtin_msa_mulv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_nloc_b (v16i8);
+v8i16 __builtin_msa_nloc_h (v8i16);
+v4i32 __builtin_msa_nloc_w (v4i32);
+v2i64 __builtin_msa_nloc_d (v2i64);
+
+v16i8 __builtin_msa_nlzc_b (v16i8);
+v8i16 __builtin_msa_nlzc_h (v8i16);
+v4i32 __builtin_msa_nlzc_w (v4i32);
+v2i64 __builtin_msa_nlzc_d (v2i64);
+
+v16u8 __builtin_msa_nor_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_nori_b (v16u8, imm0_255);
+
+v16u8 __builtin_msa_or_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_ori_b (v16u8, imm0_255);
+
+v16i8 __builtin_msa_pckev_b (v16i8, v16i8);
+v8i16 __builtin_msa_pckev_h (v8i16, v8i16);
+v4i32 __builtin_msa_pckev_w (v4i32, v4i32);
+v2i64 __builtin_msa_pckev_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_pckod_b (v16i8, v16i8);
+v8i16 __builtin_msa_pckod_h (v8i16, v8i16);
+v4i32 __builtin_msa_pckod_w (v4i32, v4i32);
+v2i64 __builtin_msa_pckod_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_pcnt_b (v16i8);
+v8i16 __builtin_msa_pcnt_h (v8i16);
+v4i32 __builtin_msa_pcnt_w (v4i32);
+v2i64 __builtin_msa_pcnt_d (v2i64);
+
+v16i8 __builtin_msa_sat_s_b (v16i8, imm0_7);
+v8i16 __builtin_msa_sat_s_h (v8i16, imm0_15);
+v4i32 __builtin_msa_sat_s_w (v4i32, imm0_31);
+v2i64 __builtin_msa_sat_s_d (v2i64, imm0_63);
+
+v16u8 __builtin_msa_sat_u_b (v16u8, imm0_7);
+v8u16 __builtin_msa_sat_u_h (v8u16, imm0_15);
+v4u32 __builtin_msa_sat_u_w (v4u32, imm0_31);
+v2u64 __builtin_msa_sat_u_d (v2u64, imm0_63);
+
+v16i8 __builtin_msa_shf_b (v16i8, imm0_255);
+v8i16 __builtin_msa_shf_h (v8i16, imm0_255);
+v4i32 __builtin_msa_shf_w (v4i32, imm0_255);
+
+v16i8 __builtin_msa_sld_b (v16i8, v16i8, i32);
+v8i16 __builtin_msa_sld_h (v8i16, v8i16, i32);
+v4i32 __builtin_msa_sld_w (v4i32, v4i32, i32);
+v2i64 __builtin_msa_sld_d (v2i64, v2i64, i32);
+
+v16i8 __builtin_msa_sldi_b (v16i8, v16i8, imm0_15);
+v8i16 __builtin_msa_sldi_h (v8i16, v8i16, imm0_7);
+v4i32 __builtin_msa_sldi_w (v4i32, v4i32, imm0_3);
+v2i64 __builtin_msa_sldi_d (v2i64, v2i64, imm0_1);
+
+v16i8 __builtin_msa_sll_b (v16i8, v16i8);
+v8i16 __builtin_msa_sll_h (v8i16, v8i16);
+v4i32 __builtin_msa_sll_w (v4i32, v4i32);
+v2i64 __builtin_msa_sll_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_slli_b (v16i8, imm0_7);
+v8i16 __builtin_msa_slli_h (v8i16, imm0_15);
+v4i32 __builtin_msa_slli_w (v4i32, imm0_31);
+v2i64 __builtin_msa_slli_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_splat_b (v16i8, i32);
+v8i16 __builtin_msa_splat_h (v8i16, i32);
+v4i32 __builtin_msa_splat_w (v4i32, i32);
+v2i64 __builtin_msa_splat_d (v2i64, i32);
+
+v16i8 __builtin_msa_splati_b (v16i8, imm0_15);
+v8i16 __builtin_msa_splati_h (v8i16, imm0_7);
+v4i32 __builtin_msa_splati_w (v4i32, imm0_3);
+v2i64 __builtin_msa_splati_d (v2i64, imm0_1);
+
+v16i8 __builtin_msa_sra_b (v16i8, v16i8);
+v8i16 __builtin_msa_sra_h (v8i16, v8i16);
+v4i32 __builtin_msa_sra_w (v4i32, v4i32);
+v2i64 __builtin_msa_sra_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srai_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srai_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srai_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srai_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srar_b (v16i8, v16i8);
+v8i16 __builtin_msa_srar_h (v8i16, v8i16);
+v4i32 __builtin_msa_srar_w (v4i32, v4i32);
+v2i64 __builtin_msa_srar_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srari_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srari_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srari_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srari_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srl_b (v16i8, v16i8);
+v8i16 __builtin_msa_srl_h (v8i16, v8i16);
+v4i32 __builtin_msa_srl_w (v4i32, v4i32);
+v2i64 __builtin_msa_srl_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srli_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srli_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srli_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srli_d (v2i64, imm0_63);
+
+v16i8 __builtin_msa_srlr_b (v16i8, v16i8);
+v8i16 __builtin_msa_srlr_h (v8i16, v8i16);
+v4i32 __builtin_msa_srlr_w (v4i32, v4i32);
+v2i64 __builtin_msa_srlr_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_srlri_b (v16i8, imm0_7);
+v8i16 __builtin_msa_srlri_h (v8i16, imm0_15);
+v4i32 __builtin_msa_srlri_w (v4i32, imm0_31);
+v2i64 __builtin_msa_srlri_d (v2i64, imm0_63);
+
+void __builtin_msa_st_b (v16i8, void *, imm_n512_511);
+void __builtin_msa_st_h (v8i16, void *, imm_n1024_1022);
+void __builtin_msa_st_w (v4i32, void *, imm_n2048_2044);
+void __builtin_msa_st_d (v2i64, void *, imm_n4096_4088);
+
+v16i8 __builtin_msa_subs_s_b (v16i8, v16i8);
+v8i16 __builtin_msa_subs_s_h (v8i16, v8i16);
+v4i32 __builtin_msa_subs_s_w (v4i32, v4i32);
+v2i64 __builtin_msa_subs_s_d (v2i64, v2i64);
+
+v16u8 __builtin_msa_subs_u_b (v16u8, v16u8);
+v8u16 __builtin_msa_subs_u_h (v8u16, v8u16);
+v4u32 __builtin_msa_subs_u_w (v4u32, v4u32);
+v2u64 __builtin_msa_subs_u_d (v2u64, v2u64);
+
+v16u8 __builtin_msa_subsus_u_b (v16u8, v16i8);
+v8u16 __builtin_msa_subsus_u_h (v8u16, v8i16);
+v4u32 __builtin_msa_subsus_u_w (v4u32, v4i32);
+v2u64 __builtin_msa_subsus_u_d (v2u64, v2i64);
+
+v16i8 __builtin_msa_subsuu_s_b (v16u8, v16u8);
+v8i16 __builtin_msa_subsuu_s_h (v8u16, v8u16);
+v4i32 __builtin_msa_subsuu_s_w (v4u32, v4u32);
+v2i64 __builtin_msa_subsuu_s_d (v2u64, v2u64);
+
+v16i8 __builtin_msa_subv_b (v16i8, v16i8);
+v8i16 __builtin_msa_subv_h (v8i16, v8i16);
+v4i32 __builtin_msa_subv_w (v4i32, v4i32);
+v2i64 __builtin_msa_subv_d (v2i64, v2i64);
+
+v16i8 __builtin_msa_subvi_b (v16i8, imm0_31);
+v8i16 __builtin_msa_subvi_h (v8i16, imm0_31);
+v4i32 __builtin_msa_subvi_w (v4i32, imm0_31);
+v2i64 __builtin_msa_subvi_d (v2i64, imm0_31);
+
+v16i8 __builtin_msa_vshf_b (v16i8, v16i8, v16i8);
+v8i16 __builtin_msa_vshf_h (v8i16, v8i16, v8i16);
+v4i32 __builtin_msa_vshf_w (v4i32, v4i32, v4i32);
+v2i64 __builtin_msa_vshf_d (v2i64, v2i64, v2i64);
+
+v16u8 __builtin_msa_xor_v (v16u8, v16u8);
+
+v16u8 __builtin_msa_xori_b (v16u8, imm0_255);
+
+v4f32 __builtin_msa_cast_to_vector_float (f32);
+v2f64 __builtin_msa_cast_to_vector_double (f64);
+f32 __builtin_msa_cast_to_scalar_float (v4f32);
+f64 __builtin_msa_cast_to_scalar_double (v2f64);
+
+i32 __builtin_msa_lsa (i32, i32, imm1_4);
+i64 __builtin_msa_dlsa (i64, i64, imm1_4);
+@end smallexample
+
@menu
* Paired-Single Arithmetic::
* Paired-Single Built-in Functions::
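A minimal usage sketch for the MSA built-in list above, assuming a MIPS target built with @option{-mmsa}, an element count that is a multiple of four, and a vector typedef standing in for the one normally supplied by the MSA header; the function and its parameter names are illustrative only.

/* Multiply-accumulate two int32 buffers four elements at a time using
   only built-ins declared above: acc[i] += a[i] * b[i].  */
typedef int v4i32 __attribute__ ((vector_size (16)));

void
example_madd_w (int *a, int *b, int *acc, int n)
{
  int i;

  for (i = 0; i < n; i += 4)
    {
      v4i32 va = __builtin_msa_ld_w (a + i, 0);
      v4i32 vb = __builtin_msa_ld_w (b + i, 0);
      v4i32 vc = __builtin_msa_ld_w (acc + i, 0);

      vc = __builtin_msa_maddv_w (vc, va, vb);  /* vc += va * vb  */
      __builtin_msa_st_w (vc, acc + i, 0);
    }
}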
diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index 8d188838d4f..9f66a47efec 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -1283,6 +1283,32 @@ ISA for floating-point arithmetics. You can select either @samp{sse} which
enables @option{-msse2} or @samp{avx} which enables @option{-mavx} by default.
This option is only supported on i386 and x86-64 targets.
+@item --with-fp-32=@var{mode}
+On MIPS targets, set the default value for the @option{-mfp} option when using
+the o32 ABI. The possibilities for @var{mode} are:
+@table @code
+@item 32
+Use the o32 FP32 ABI extension, as with the @option{-mfp32} command-line
+option.
+@item xx
+Use the o32 FPXX ABI extension, as with the @option{-mfpxx} command-line
+option.
+@item 64
+Use the o32 FP64 ABI extension, as with the @option{-mfp64} command-line
+option.
+@end table
+In the absence of this configuration option the default is to use the o32
+FP32 ABI extension.
+
+@item --with-odd-spreg-32
+On MIPS targets, set the @option{-modd-spreg} option by default when using
+the o32 ABI.
+
+@item --without-odd-spreg-32
+On MIPS targets, set the @option{-mno-odd-spreg} option by default when using
+the o32 ABI. This is normally used in conjunction with
+@option{--with-fp-32=64} in order to target the o32 FP64A ABI extension.
+
@item --with-nan=@var{encoding}
On MIPS targets, set the default encoding convention to use for the
special not-a-number (NaN) IEEE 754 floating-point data. The
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d16df54350f..f8350c4183c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -777,21 +777,24 @@ Objective-C and Objective-C++ Dialects}.
@emph{MIPS Options}
@gccoptlist{-EL -EB -march=@var{arch} -mtune=@var{arch} @gol
--mips1 -mips2 -mips3 -mips4 -mips32 -mips32r2 @gol
--mips64 -mips64r2 @gol
+-mips1 -mips2 -mips3 -mips4 -mips32 -mips32r2 -mips32r3 -mips32r5 @gol
+-mips32r6 -mips64 -mips64r2 -mips64r3 -mips64r5 -mips64r6 @gol
-mips16 -mno-mips16 -mflip-mips16 @gol
-minterlink-compressed -mno-interlink-compressed @gol
-minterlink-mips16 -mno-interlink-mips16 @gol
-mabi=@var{abi} -mabicalls -mno-abicalls @gol
-mshared -mno-shared -mplt -mno-plt -mxgot -mno-xgot @gol
--mgp32 -mgp64 -mfp32 -mfp64 -mhard-float -msoft-float @gol
+-mgp32 -mgp64 -mfp32 -mfpxx -mfp64 -mhard-float -msoft-float @gol
-mno-float -msingle-float -mdouble-float @gol
+-modd-spreg -mno-odd-spreg @gol
-mabs=@var{mode} -mnan=@var{encoding} @gol
-mdsp -mno-dsp -mdspr2 -mno-dspr2 @gol
-mmcu -mmno-mcu @gol
-meva -mno-eva @gol
-mvirt -mno-virt @gol
+-mxpa -mno-xpa @gol
-mmicromips -mno-micromips @gol
+-mmsa -mno-msa @gol
-mfpu=@var{fpu-type} @gol
-msmartmips -mno-smartmips @gol
-mpaired-single -mno-paired-single -mdmx -mno-mdmx @gol
@@ -1088,7 +1091,7 @@ See S/390 and zSeries Options.
-finstrument-functions-exclude-function-list=@var{sym},@var{sym},@dots{} @gol
-finstrument-functions-exclude-file-list=@var{file},@var{file},@dots{} @gol
-fno-common -fno-ident @gol
--fpcc-struct-return -fpic -fPIC -fpie -fPIE @gol
+-fpcc-struct-return -fpic -fPIC -fpie -fPIE -fno-plt @gol
-fno-jump-tables @gol
-frecord-gcc-switches @gol
-freg-struct-return -fshort-enums @gol
@@ -17387,7 +17390,9 @@ Generate code that runs on @var{arch}, which can be the name of a
generic MIPS ISA, or the name of a particular processor.
The ISA names are:
@samp{mips1}, @samp{mips2}, @samp{mips3}, @samp{mips4},
-@samp{mips32}, @samp{mips32r2}, @samp{mips64} and @samp{mips64r2}.
+@samp{mips32}, @samp{mips32r2}, @samp{mips32r3}, @samp{mips32r5},
+@samp{mips32r6}, @samp{mips64}, @samp{mips64r2}, @samp{mips64r3},
+@samp{mips64r5} and @samp{mips64r6}.
The processor names are:
@samp{4kc}, @samp{4km}, @samp{4kp}, @samp{4ksc},
@samp{4kec}, @samp{4kem}, @samp{4kep}, @samp{4ksd},
@@ -17401,8 +17406,9 @@ The processor names are:
@samp{loongson2e}, @samp{loongson2f}, @samp{loongson3a},
@samp{m4k},
@samp{m14k}, @samp{m14kc}, @samp{m14ke}, @samp{m14kec},
-@samp{octeon}, @samp{octeon+}, @samp{octeon2},
+@samp{octeon}, @samp{octeon+}, @samp{octeon2}, @samp{octeon3},
@samp{orion},
+@samp{p5600},
@samp{r2000}, @samp{r3000}, @samp{r3900}, @samp{r4000}, @samp{r4400},
@samp{r4600}, @samp{r4650}, @samp{r4700}, @samp{r6000}, @samp{r8000},
@samp{rm7000}, @samp{rm9000},
@@ -17485,9 +17491,17 @@ Equivalent to @option{-march=mips4}.
@opindex mips32
Equivalent to @option{-march=mips32}.
-@item -mips32r2
-@opindex mips32r2
-Equivalent to @option{-march=mips32r2}.
+@item -mips32r3
+@opindex mips32r3
+Equivalent to @option{-march=mips32r3}.
+
+@item -mips32r5
+@opindex mips32r5
+Equivalent to @option{-march=mips32r5}.
+
+@item -mips32r6
+@opindex mips32r6
+Equivalent to @option{-march=mips32r6}.
@item -mips64
@opindex mips64
@@ -17497,6 +17511,18 @@ Equivalent to @option{-march=mips64}.
@opindex mips64r2
Equivalent to @option{-march=mips64r2}.
+@item -mips64r3
+@opindex mips64r3
+Equivalent to @option{-march=mips64r3}.
+
+@item -mips64r5
+@opindex mips64r5
+Equivalent to @option{-march=mips64r5}.
+
+@item -mips64r6
+@opindex mips64r6
+Equivalent to @option{-march=mips64r6}.
+
@item -mips16
@itemx -mno-mips16
@opindex mips16
@@ -17557,14 +17583,27 @@ GCC supports a variant of the o32 ABI in which floating-point registers
are 64 rather than 32 bits wide. You can select this combination with
@option{-mabi=32} @option{-mfp64}. This ABI relies on the @code{mthc1}
and @code{mfhc1} instructions and is therefore only supported for
-MIPS32R2 processors.
+MIPS32R2, MIPS32R3 and MIPS32R5 processors.
The register assignments for arguments and return values remain the
same, but each scalar value is passed in a single 64-bit register
rather than a pair of 32-bit registers. For example, scalar
floating-point values are returned in @samp{$f0} only, not a
@samp{$f0}/@samp{$f1} pair. The set of call-saved registers also
-remains the same, but all 64 bits are saved.
+remains the same in that the even-numbered double-precision registers
+are saved.
+
+Two additional variants of the o32 ABI are supported to enable
+a transition from 32-bit to 64-bit registers. These are FPXX
+(@option{-mfpxx}) and FP64A (@option{-mfp64} @option{-mno-odd-spreg}).
+The FPXX extension mandates that all code must execute correctly
+when run using 32-bit or 64-bit registers. The code can be interlinked
+with either FP32 or FP64, but not both.
+The FP64A extension is similar to the FP64 extension but forbids the
+use of odd-numbered single-precision registers. This can be used
+in conjunction with the @code{FRE} mode of FPUs in MIPS32R5
+processors and allows both FP32 and FP64A code to interlink and
+run in the same process without changing FPU modes.
@item -mabicalls
@itemx -mno-abicalls
@@ -17653,6 +17692,10 @@ Assume that floating-point registers are 32 bits wide.
@opindex mfp64
Assume that floating-point registers are 64 bits wide.
+@item -mfpxx
+@opindex mfpxx
+Do not assume the width of floating-point registers.
+
@item -mhard-float
@opindex mhard-float
Use floating-point coprocessor instructions.
@@ -17684,6 +17727,15 @@ operations.
Assume that the floating-point coprocessor supports double-precision
operations. This is the default.
+@item -modd-spreg
+@itemx -mno-odd-spreg
+@opindex modd-spreg
+@opindex mno-odd-spreg
+Enable the use of odd-numbered single-precision floating-point registers
+for the o32 ABI. This is the default for processors that are known to
+support these registers. When using the o32 FPXX ABI, @option{-mno-odd-spreg}
+is set by default.
+
@item -mabs=2008
@itemx -mabs=legacy
@opindex mabs=2008
@@ -17823,6 +17875,12 @@ Use (do not use) the MIPS Enhanced Virtual Addressing instructions.
@opindex mno-virt
Use (do not use) the MIPS Virtualization Application Specific instructions.
+@item -mxpa
+@itemx -mno-xpa
+@opindex mxpa
+@opindex mno-xpa
+Use (do not use) the MIPS eXtended Physical Address (XPA) instructions.
+
@item -mlong64
@opindex mlong64
Force @code{long} types to be 64 bits wide. See @option{-mlong32} for
@@ -22670,6 +22728,24 @@ used during linking.
@code{__pie__} and @code{__PIE__}. The macros have the value 1
for @option{-fpie} and 2 for @option{-fPIE}.
+@item -fno-plt
+@opindex fno-plt
+Do not use the PLT for external function calls in position-independent code.
+Instead, load the callee's address from the GOT at the call site and branch
+to it. This leads to more efficient code by eliminating PLT stubs and
+exposing the GOT load to optimizations. On architectures such as 32-bit x86,
+where PLT stubs expect the GOT pointer in a specific register, this also
+gives the compiler more freedom in register allocation. Lazy binding
+requires the PLT: with @option{-fno-plt} all external symbols are resolved
+at load time.
+
+Alternatively, the @code{noplt} function attribute can be used to avoid the
+PLT only for calls to specific external functions, by marking those
+functions with the attribute.
+
+Additionally, a few targets also use the GOT instead of the PLT for calls
+to functions marked with this attribute even in code that is not
+position-independent.
+
@item -fno-jump-tables
@opindex fno-jump-tables
Do not use jump tables for switch statements even where it would be
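The @code{noplt} attribute documented above can be applied per function when rebuilding everything with @option{-fno-plt} is not wanted; a small sketch, with a made-up external function:

/* Calls to this one external function go through the GOT rather than a
   PLT stub, even though the rest of the file keeps its PLT calls.  */
extern int ext_lookup (int key) __attribute__ ((noplt));

int
wrapper (int key)
{
  return ext_lookup (key);
}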
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 711d655577c..b210655f24c 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -3025,10 +3025,8 @@ operands can be used for microMIPS instructions such as @code{ll} and
equivalent to @code{R}.
@item ZD
-When compiling microMIPS code, this constraint matches an address operand
-that is formed from a base register and a 12-bit offset. These operands
-can be used for microMIPS instructions such as @code{prefetch}. When
-not compiling for microMIPS code, @code{ZD} is equivalent to @code{p}.
+An address suitable for a @code{prefetch} instruction, or for any other
+instruction with the same addressing mode as @code{prefetch}.
@end table
@item Motorola 680x0---@file{config/m68k/constraints.md}
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index bde7cb11640..2f3845ce085 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -9023,6 +9023,13 @@ register in Dwarf. Otherwise, this hook should return @code{NULL_RTX}.
If not defined, the default is to return @code{NULL_RTX}.
@end deftypefn
+@deftypefn {Target Hook} {enum machine_mode} TARGET_DWARF_FRAME_REG_MODE (int @var{regno})
+Given a register, this hook should return the mode which the
+corresponding Dwarf frame register should have. This is normally
+used to return a smaller mode than the raw mode to prevent
+call-clobbered parts of a register from altering the frame register size.
+@end deftypefn
+
@deftypefn {Target Hook} void TARGET_INIT_DWARF_REG_SIZES_EXTRA (tree @var{address})
If some registers are represented in Dwarf-2 unwind information in
multiple pieces, define this hook to fill in information about the
@@ -11317,16 +11324,6 @@ routine for target specific customizations of the system printf
and scanf formatter settings.
@end defmac
-@deftypevr {Target Hook} bool TARGET_RELAXED_ORDERING
-If set to @code{true}, means that the target's memory model does not
-guarantee that loads which do not depend on one another will access
-main memory in the order of the instruction stream; if ordering is
-important, an explicit memory barrier must be used. This is true of
-many recent processors which implement a policy of ``relaxed,''
-``weak,'' or ``release'' memory consistency, such as Alpha, PowerPC,
-and ia64. The default is @code{false}.
-@end deftypevr
-
@deftypefn {Target Hook} {const char *} TARGET_INVALID_ARG_FOR_UNPROTOTYPED_FN (const_tree @var{typelist}, const_tree @var{funcdecl}, const_tree @var{val})
If defined, this macro returns the diagnostic message when it is
illegal to pass argument @var{val} to function @var{funcdecl}
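The @code{TARGET_DWARF_FRAME_REG_MODE} hook documented above is normally overridden in a backend's target file; the following sketch assumes a hypothetical @code{EXAMPLE_VECTOR_REGNO_P} predicate and shows only the usual registration pattern, not any real port.

/* Describe only the low 64 bits of the (hypothetical) vector registers
   in the unwind tables; all other registers keep the default mode.  */
static enum machine_mode
example_dwarf_frame_reg_mode (int regno)
{
  enum machine_mode mode = default_dwarf_frame_reg_mode (regno);

  if (EXAMPLE_VECTOR_REGNO_P (regno))
    mode = DImode;
  return mode;
}

#undef TARGET_DWARF_FRAME_REG_MODE
#define TARGET_DWARF_FRAME_REG_MODE example_dwarf_frame_reg_mode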
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index c95c340fc25..3e9d2fbdca4 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -6748,6 +6748,8 @@ the target supports DWARF 2 frame unwind information.
@hook TARGET_DWARF_REGISTER_SPAN
+@hook TARGET_DWARF_FRAME_REG_MODE
+
@hook TARGET_INIT_DWARF_REG_SIZES_EXTRA
@hook TARGET_ASM_TTYPE
@@ -8408,8 +8410,6 @@ routine for target specific customizations of the system printf
and scanf formatter settings.
@end defmac
-@hook TARGET_RELAXED_ORDERING
-
@hook TARGET_INVALID_ARG_FOR_UNPROTOTYPED_FN
@hook TARGET_INVALID_CONVERSION
diff --git a/gcc/dwarf2cfi.c b/gcc/dwarf2cfi.c
index abcdeb34461..968d7141d31 100644
--- a/gcc/dwarf2cfi.c
+++ b/gcc/dwarf2cfi.c
@@ -271,11 +271,9 @@ expand_builtin_init_dwarf_reg_sizes (tree address)
if (rnum < DWARF_FRAME_REGISTERS)
{
HOST_WIDE_INT offset = rnum * GET_MODE_SIZE (mode);
- enum machine_mode save_mode = reg_raw_mode[i];
HOST_WIDE_INT size;
+ enum machine_mode save_mode = targetm.dwarf_frame_reg_mode (i);
- if (HARD_REGNO_CALL_PART_CLOBBERED (i, save_mode))
- save_mode = choose_hard_reg_mode (i, 1, true);
if (dnum == DWARF_FRAME_RETURN_COLUMN)
{
if (save_mode == VOIDmode)
diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c
index 4d2466daa68..edcb553f11b 100644
--- a/gcc/dwarf2out.c
+++ b/gcc/dwarf2out.c
@@ -21506,8 +21506,14 @@ add_subprog_entry (tree decl, bool is_inlined)
entry->subprog_num = 0;
*slot = entry;
}
- else if (is_inlined)
- (*slot)->is_inlined = true;
+ else if (is_inlined && !(*slot)->is_inlined)
+ {
+ /* If we've already output this subprogram entry as a non-inlined
+ subprogram, make sure it gets output again, so that we include
+ its linkage name. */
+ (*slot)->is_inlined = true;
+ (*slot)->subprog_num = 0;
+ }
return *slot;
}
@@ -21617,7 +21623,10 @@ scan_blocks_for_inlined_calls (tree block, subprog_entry *subprog,
for (i = 0; i < level; i++)
fprintf(stderr, " ");
- fprintf (stderr, "SCAN: block %d, subprog %s", BLOCK_NUMBER (block), dwarf2_name (subprog->decl, 0));
+ fprintf (stderr, "SCAN: [%p] block %d, subprog %s",
+ (void *) block,
+ BLOCK_NUMBER (block),
+ dwarf2_name (subprog->decl, 0));
if (caller != NULL)
{
expanded_location loc = expand_location (caller_loc);
@@ -21670,6 +21679,21 @@ scan_blocks_for_inlined_calls (tree block, subprog_entry *subprog,
subblock != NULL;
subblock = BLOCK_FRAGMENT_CHAIN (subblock))
{
+#ifdef DEBUG_TWO_LEVEL
+ if (level < 6)
+ {
+ unsigned int i;
+
+ for (i = 0; i < level; i++)
+ fprintf(stderr, " ");
+ fprintf (stderr, "SCAN: [%p] block frag %d, origin %d\n",
+ (void *) subblock,
+ BLOCK_NUMBER (subblock),
+ (BLOCK_FRAGMENT_ORIGIN (subblock)
+ ? BLOCK_NUMBER (BLOCK_FRAGMENT_ORIGIN (subblock))
+ : -1));
+ }
+#endif
block_num = BLOCK_NUMBER (subblock);
slot = block_table->find_slot_with_hash (&block_num, (hashval_t) block_num, INSERT);
if (*slot == HTAB_EMPTY_ENTRY)
@@ -21717,6 +21741,7 @@ dwarf2out_begin_function (tree fun)
subprog_entry *subprog;
block_table->empty ();
+ logical_table->empty ();
#ifdef DEBUG_TWO_LEVEL
fprintf (stderr, "Begin function %s\n", dwarf2_name (fun, 0));
#endif
@@ -21798,22 +21823,39 @@ out_subprog_directive (subprog_entry *subprog)
{
tree decl = subprog->decl;
tree decl_name = DECL_NAME (decl);
- const char *name;
+ tree origin = NULL_TREE;
+ const char *name = NULL;
unsigned int file_num = 0;
unsigned int line_num = 0;
if (decl_name == NULL || IDENTIFIER_POINTER (decl_name) == NULL)
return;
- /* For inlined subroutines, use the linkage name. */
- if (subprog->is_inlined && DECL_ASSEMBLER_NAME (decl))
+ origin = decl_ultimate_origin (decl);
+ if (origin == NULL_TREE)
+ origin = decl;
+
+ /* For inlined subroutines, use the linkage name.
+ If -ftwo-level-all-subprogs is set, use the linkage name
+ for all subroutines. */
+ if (subprog->is_inlined || flag_two_level_all_subprogs)
{
- name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
- if (name[0] == '*')
- name++;
+ if (DECL_ASSEMBLER_NAME (origin))
+ {
+ name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (origin));
+ if (name[0] == '*')
+ name++;
+ }
+ else
+ name = dwarf2_name (origin, 0);
}
else
- name = dwarf2_name (decl, 0);
+ {
+ /* To save space, we don't emit the name for non-inlined
+ subroutines, whose linkage names are available from the
+ object file's symbol table. */
+ name = "";
+ }
if (LOCATION_LOCUS (DECL_SOURCE_LOCATION (decl)) != UNKNOWN_LOCATION)
{
@@ -21862,7 +21904,7 @@ out_logical_entry (dw_line_info_table *table, unsigned int file_num,
/* Declare the subprogram if it hasn't already been declared. */
if (block != NULL)
subprog = block->subprog;
- if (subprog != NULL && subprog->subprog_num == 0 && context != NULL)
+ if (subprog != NULL && subprog->subprog_num == 0)
out_subprog_directive (subprog);
if (subprog != NULL)
subprog_num = subprog->subprog_num;
diff --git a/gcc/explow.c b/gcc/explow.c
index 48e91a6444b..4e0aedf6bb2 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -137,7 +137,9 @@ plus_constant (enum machine_mode mode, rtx x, HOST_WIDE_INT c)
{
tem = plus_constant (mode, get_pool_constant (XEXP (x, 0)), c);
tem = force_const_mem (GET_MODE (x), tem);
- if (memory_address_p (GET_MODE (tem), XEXP (tem, 0)))
+ /* Targets may disallow some constants in the constant pool, thus
+ force_const_mem may return NULL_RTX. */
+ if (tem && memory_address_p (GET_MODE (tem), XEXP (tem, 0)))
return tem;
}
break;
diff --git a/gcc/final.c b/gcc/final.c
index f12084a1553..c7efcc5653f 100644
--- a/gcc/final.c
+++ b/gcc/final.c
@@ -2171,12 +2171,12 @@ final_scan_insn (rtx insn, FILE *file, int optimize_p ATTRIBUTE_UNUSED,
suffixing "cold" to the original function's name. */
if (in_cold_section_p)
{
- cold_partition_name
+ cold_function_name
= clone_function_name (current_function_decl, "cold");
#ifdef ASM_DECLARE_FUNCTION_NAME
- ASM_DECLARE_FUNCTION_NAME (asm_out_file,
- IDENTIFIER_POINTER (cold_partition_name),
- current_function_decl);
+ ASM_DECLARE_FUNCTION_NAME (asm_out_file,
+ IDENTIFIER_POINTER (cold_function_name),
+ current_function_decl);
#else
ASM_OUTPUT_LABEL (asm_out_file,
IDENTIFIER_POINTER (cold_partition_name));
diff --git a/gcc/function.h b/gcc/function.h
index 516a73eaef5..ccd5e6681b4 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -607,6 +607,9 @@ struct GTY(()) function {
a string describing the reason for failure. */
const char * GTY((skip)) cannot_be_copied_reason;
+ /* Last assigned dependence info clique. */
+ unsigned short last_clique;
+
/* Collected bit flags. */
/* Number of units of general registers that need saving in stdarg
diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c
index 017bdb7cd2b..bb4b4168e3f 100644
--- a/gcc/ipa-devirt.c
+++ b/gcc/ipa-devirt.c
@@ -537,7 +537,7 @@ dump_type_inheritance_graph (FILE *f)
Lookup this pointer and get its type. */
tree
-method_class_type (tree t)
+method_class_type (const_tree t)
{
tree first_parm_type = TREE_VALUE (TYPE_ARG_TYPES (t));
gcc_assert (TREE_CODE (t) == METHOD_TYPE);
@@ -890,6 +890,31 @@ devirt_node_removal_hook (struct cgraph_node *n, void *d ATTRIBUTE_UNUSED)
free_polymorphic_call_targets_hash ();
}
+/* Return true when TYPE contains a polymorphic type and thus is interesting
+ for devirtualization machinery. */
+
+bool
+contains_polymorphic_type_p (const_tree type)
+{
+ type = TYPE_MAIN_VARIANT (type);
+
+ if (RECORD_OR_UNION_TYPE_P (type))
+ {
+ if (TYPE_BINFO (type)
+ && polymorphic_type_binfo_p (TYPE_BINFO (type)))
+ return true;
+ for (tree fld = TYPE_FIELDS (type); fld; fld = DECL_CHAIN (fld))
+ if (TREE_CODE (fld) == FIELD_DECL
+ && !DECL_ARTIFICIAL (fld)
+ && contains_polymorphic_type_p (TREE_TYPE (fld)))
+ return true;
+ return false;
+ }
+ if (TREE_CODE (type) == ARRAY_TYPE)
+ return contains_polymorphic_type_p (TREE_TYPE (type));
+ return false;
+}
+
/* CONTEXT->OUTER_TYPE is a type of memory object where object of EXPECTED_TYPE
is contained at CONTEXT->OFFSET. Walk the memory representation of
CONTEXT->OUTER_TYPE and find the outermost class type that match
@@ -1052,7 +1077,8 @@ subbinfo_with_vtable_at_offset (tree binfo, unsigned HOST_WIDE_INT offset,
Return false if T does not look like virtual table reference. */
bool
-vtable_pointer_value_to_vtable (tree t, tree *v, unsigned HOST_WIDE_INT *offset)
+vtable_pointer_value_to_vtable (const_tree t, tree *v,
+ unsigned HOST_WIDE_INT *offset)
{
/* We expect &MEM[(void *)&virtual_table + 16B].
We obtain object's BINFO from the context of the virtual table.
@@ -1098,7 +1124,7 @@ vtable_pointer_value_to_vtable (tree t, tree *v, unsigned HOST_WIDE_INT *offset)
instance type. */
tree
-vtable_pointer_value_to_binfo (tree t)
+vtable_pointer_value_to_binfo (const_tree t)
{
tree vtable;
unsigned HOST_WIDE_INT offset;
diff --git a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c
index a512ec5b5bb..904a5219273 100644
--- a/gcc/ipa-inline-analysis.c
+++ b/gcc/ipa-inline-analysis.c
@@ -2169,6 +2169,8 @@ param_change_prob (gimple stmt, int i)
return 0;
if (!bb->frequency)
return REG_BR_PROB_BASE;
+ if (!optimize)
+ return REG_BR_PROB_BASE;
ao_ref_init (&refd, op);
info.stmt = stmt;
info.bb_set = BITMAP_ALLOC (NULL);
diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c
index 2ccbbdbca9a..adfcb0354e2 100644
--- a/gcc/ipa-prop.c
+++ b/gcc/ipa-prop.c
@@ -394,11 +394,10 @@ static void
ipa_set_jf_known_type (struct ipa_jump_func *jfunc, HOST_WIDE_INT offset,
tree base_type, tree component_type)
{
- gcc_assert (TREE_CODE (component_type) == RECORD_TYPE
- && TYPE_BINFO (component_type));
+ gcc_assert (contains_polymorphic_type_p (base_type)
+ && contains_polymorphic_type_p (component_type));
if (!flag_devirtualize)
return;
- gcc_assert (BINFO_VTABLE (TYPE_BINFO (component_type)));
jfunc->type = IPA_JF_KNOWN_TYPE;
jfunc->value.known_type.offset = offset,
jfunc->value.known_type.base_type = base_type;
@@ -485,10 +484,9 @@ ipa_set_ancestor_jf (struct ipa_jump_func *jfunc, HOST_WIDE_INT offset,
{
if (!flag_devirtualize)
type_preserved = false;
- gcc_assert (!type_preserved
- || (TREE_CODE (type) == RECORD_TYPE
- && TYPE_BINFO (type)
- && BINFO_VTABLE (TYPE_BINFO (type))));
+ if (!type_preserved)
+ type = NULL_TREE;
+ gcc_assert (!type_preserved || contains_polymorphic_type_p (type));
jfunc->type = IPA_JF_ANCESTOR;
jfunc->value.ancestor.formal_id = formal_id;
jfunc->value.ancestor.offset = offset;
@@ -689,15 +687,9 @@ detect_type_change (tree arg, tree base, tree comp_type, gimple call,
gcc_checking_assert (DECL_P (arg)
|| TREE_CODE (arg) == MEM_REF
|| handled_component_p (arg));
- /* Const calls cannot call virtual methods through VMT and so type changes do
- not matter. */
- if (!flag_devirtualize || !gimple_vuse (call)
- /* Be sure expected_type is polymorphic. */
- || !comp_type
- || TREE_CODE (comp_type) != RECORD_TYPE
- || !TYPE_BINFO (comp_type)
- || !BINFO_VTABLE (TYPE_BINFO (comp_type)))
- return true;
+
+ if (!flag_devirtualize)
+ return false;
/* C++ methods are not allowed to change THIS pointer unless they
are constructors or destructors. */
@@ -710,7 +702,20 @@ detect_type_change (tree arg, tree base, tree comp_type, gimple call,
&& !DECL_CXX_DESTRUCTOR_P (current_function_decl)
&& (SSA_NAME_VAR (TREE_OPERAND (base, 0))
== DECL_ARGUMENTS (current_function_decl)))
- return false;
+ {
+ gcc_assert (comp_type);
+ return false;
+ }
+
+ /* Const calls cannot call virtual methods through VMT and so type changes do
+ not matter. */
+ if (!flag_devirtualize || !gimple_vuse (call)
+ /* Be sure expected_type is polymorphic. */
+ || !comp_type
+ || TREE_CODE (comp_type) != RECORD_TYPE
+ || !TYPE_BINFO (TYPE_MAIN_VARIANT (comp_type))
+ || !BINFO_VTABLE (TYPE_BINFO (TYPE_MAIN_VARIANT (comp_type))))
+ return true;
ao_ref_init (&ao, arg);
ao.base = base;
@@ -1111,8 +1116,9 @@ compute_complex_assign_jump_func (struct ipa_node_params *info,
index = ipa_get_param_decl_index (info, SSA_NAME_VAR (ssa));
if (index >= 0 && param_type && POINTER_TYPE_P (param_type))
{
- bool type_p = !detect_type_change (op1, base, TREE_TYPE (param_type),
- call, jfunc, offset);
+ bool type_p = (contains_polymorphic_type_p (TREE_TYPE (param_type))
+ && !detect_type_change (op1, base, TREE_TYPE (param_type),
+ call, jfunc, offset));
if (type_p || jfunc->type == IPA_JF_UNKNOWN)
ipa_set_ancestor_jf (jfunc, offset,
type_p ? TREE_TYPE (param_type) : NULL, index,
@@ -1244,7 +1250,8 @@ compute_complex_ancestor_jump_func (struct ipa_node_params *info,
}
bool type_p = false;
- if (param_type && POINTER_TYPE_P (param_type))
+ if (param_type && POINTER_TYPE_P (param_type)
+ && contains_polymorphic_type_p (TREE_TYPE (param_type)))
type_p = !detect_type_change (obj, expr, TREE_TYPE (param_type),
call, jfunc, offset);
if (type_p || jfunc->type == IPA_JF_UNKNOWN)
@@ -1267,12 +1274,10 @@ compute_known_type_jump_func (tree op, struct ipa_jump_func *jfunc,
if (!flag_devirtualize
|| TREE_CODE (op) != ADDR_EXPR
- || TREE_CODE (TREE_TYPE (TREE_TYPE (op))) != RECORD_TYPE
+ || !contains_polymorphic_type_p (TREE_TYPE (TREE_TYPE (op)))
/* Be sure expected_type is polymorphic. */
|| !expected_type
- || TREE_CODE (expected_type) != RECORD_TYPE
- || !TYPE_BINFO (expected_type)
- || !BINFO_VTABLE (TYPE_BINFO (expected_type)))
+ || !contains_polymorphic_type_p (expected_type))
return;
op = TREE_OPERAND (op, 0);
@@ -1280,7 +1285,7 @@ compute_known_type_jump_func (tree op, struct ipa_jump_func *jfunc,
if (!DECL_P (base)
|| max_size == -1
|| max_size != size
- || TREE_CODE (TREE_TYPE (base)) != RECORD_TYPE
+ || !contains_polymorphic_type_p (TREE_TYPE (base))
|| is_global_var (base))
return;
diff --git a/gcc/ipa-utils.h b/gcc/ipa-utils.h
index 4d609096aa3..b1b0d1a8ad9 100644
--- a/gcc/ipa-utils.h
+++ b/gcc/ipa-utils.h
@@ -87,15 +87,16 @@ void dump_possible_polymorphic_call_targets (FILE *, tree, HOST_WIDE_INT,
const ipa_polymorphic_call_context &);
bool possible_polymorphic_call_target_p (tree, HOST_WIDE_INT,
const ipa_polymorphic_call_context &,
- struct cgraph_node *n);
-tree method_class_type (tree);
+ struct cgraph_node *);
+tree method_class_type (const_tree);
tree get_polymorphic_call_info (tree, tree, tree *,
HOST_WIDE_INT *,
ipa_polymorphic_call_context *);
bool get_polymorphic_call_info_from_invariant (ipa_polymorphic_call_context *,
tree, tree, HOST_WIDE_INT);
-tree vtable_pointer_value_to_binfo (tree t);
-bool vtable_pointer_value_to_vtable (tree, tree *, unsigned HOST_WIDE_INT *);
+tree vtable_pointer_value_to_binfo (const_tree);
+bool vtable_pointer_value_to_vtable (const_tree, tree *, unsigned HOST_WIDE_INT *);
+bool contains_polymorphic_type_p (const_tree);
/* Return vector containing possible targets of polymorphic call E.
If FINALP is non-NULL, store true if the list is complette.
diff --git a/gcc/l-ipo.c b/gcc/l-ipo.c
index 7ed78ea3f2c..c9e1f492f16 100644
--- a/gcc/l-ipo.c
+++ b/gcc/l-ipo.c
@@ -414,7 +414,7 @@ pop_module_scope (void)
at_eof = 1;
cgraph_process_same_body_aliases ();
lang_hooks.l_ipo.process_pending_decls (input_location);
- lang_hooks.l_ipo.clear_deferred_fns ();
+ lang_hooks.l_ipo.reset_parsing_state ();
at_eof = 0;
is_last = is_last_module (current_module_id);
@@ -1120,6 +1120,27 @@ cgraph_unify_type_alias_sets (void)
htab_delete (type_hash_tab);
}
+/* Return true if DECL is an artificial function that we do not want
+ to promote and which may not be available in the primary module.
+ The sole exception is currently __tls_init. */
+
+static bool
+decl_artificial_nopromote (tree decl)
+{
+ if (!DECL_ARTIFICIAL (decl))
+ return false;
+
+ /* Handle the __tls_init function specially as we do want to promote it and
+ allow the aux module to be resolved to the version in the primary module.
+ We check if it is prefixed by __tls_init so that we also catch it after
+ promotion, when called from cgraph_is_aux_decl_external. */
+ if (!strncmp (IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)), "__tls_init",
+ 10))
+ return false;
+
+ return true;
+}
+
/* Return true if NODE->decl from an auxiliary module has external
definition (and therefore is not needed for expansion). */
@@ -1144,7 +1165,7 @@ cgraph_is_aux_decl_external (struct cgraph_node *node)
Functions marked artificial (e.g. an implicitly instantiated virtual
destructor) are also not guaranteed to be available in the primary module,
as they are not promoted by process_module_scope_static_func. */
- if (DECL_COMDAT (decl) || DECL_WEAK (decl) || DECL_ARTIFICIAL (decl))
+ if (DECL_COMDAT (decl) || DECL_WEAK (decl) || decl_artificial_nopromote (decl))
return false;
/* virtual functions won't be deleted in the primary module. */
@@ -2022,7 +2043,7 @@ process_module_scope_static_func (struct cgraph_node *cnode)
if (TREE_PUBLIC (decl)
|| !TREE_STATIC (decl)
|| DECL_EXTERNAL (decl)
- || DECL_ARTIFICIAL (decl))
+ || decl_artificial_nopromote (decl))
return;
if (flag_ripa_no_promote_always_inline
diff --git a/gcc/langhooks-def.h b/gcc/langhooks-def.h
index 3cc155fedf3..c084b103d41 100644
--- a/gcc/langhooks-def.h
+++ b/gcc/langhooks-def.h
@@ -214,7 +214,7 @@ extern tree lhd_make_node (enum tree_code);
#define LANG_HOOKS_DUP_LANG_TYPE lhd_do_nothing_t_t
#define LANG_HOOKS_COPY_LANG_TYPE lhd_do_nothing_t_t
#define LANG_HOOKS_PROCESS_PENDING_DECLS lhd_do_nothing_u
-#define LANG_HOOKS_CLEAR_DEFFERED_FNS lhd_do_nothing
+#define LANG_HOOKS_RESET_PARSING_STATE lhd_do_nothing
#define LANG_HOOKS_IS_GENERATED_TYPE lhd_do_nothing_t_return_bool
#define LANG_HOOKS_CMP_LANG_TYPE lhd_do_nothing_t_t_return_int
@@ -231,7 +231,7 @@ extern tree lhd_make_node (enum tree_code);
LANG_HOOKS_DUP_LANG_TYPE, \
LANG_HOOKS_COPY_LANG_TYPE, \
LANG_HOOKS_PROCESS_PENDING_DECLS, \
- LANG_HOOKS_CLEAR_DEFFERED_FNS, \
+ LANG_HOOKS_RESET_PARSING_STATE, \
LANG_HOOKS_IS_GENERATED_TYPE, \
LANG_HOOKS_CMP_LANG_TYPE, \
}
diff --git a/gcc/langhooks.h b/gcc/langhooks.h
index f24bcdd82b0..e89a58b1875 100644
--- a/gcc/langhooks.h
+++ b/gcc/langhooks.h
@@ -275,8 +275,8 @@ struct lang_hooks_for_lipo
/* Process decls after parsing of a source module. */
void (*process_pending_decls) (unsigned);
- /* Clear the list of deferred functions. */
- void (*clear_deferred_fns) (void);
+ /* Reset the parsing state for the next module. */
+ void (*reset_parsing_state) (void);
/* Return true if T is compiler generated. */
bool (*is_compiler_generated_type) (tree t);
diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
index 874696865b3..c55cefecb3a 100644
--- a/gcc/lra-constraints.c
+++ b/gcc/lra-constraints.c
@@ -317,6 +317,118 @@ in_mem_p (int regno)
return get_reg_class (regno) == NO_REGS;
}
+/* Return 1 if ADDR is a valid memory address for mode MODE in address
+ space AS, and check that each pseudo has the proper kind of hard
+ reg. */
+static int
+valid_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
+ rtx addr, addr_space_t as)
+{
+#ifdef GO_IF_LEGITIMATE_ADDRESS
+ lra_assert (ADDR_SPACE_GENERIC_P (as));
+ GO_IF_LEGITIMATE_ADDRESS (mode, addr, win);
+ return 0;
+
+ win:
+ return 1;
+#else
+ return targetm.addr_space.legitimate_address_p (mode, addr, 0, as);
+#endif
+}
+
+namespace {
+ /* Temporarily eliminates registers in an address (for the lifetime of
+ the object). */
+ class address_eliminator {
+ public:
+ address_eliminator (struct address_info *ad);
+ ~address_eliminator ();
+
+ private:
+ struct address_info *m_ad;
+ rtx *m_base_loc;
+ rtx m_base_reg;
+ rtx *m_index_loc;
+ rtx m_index_reg;
+ };
+}
+
+address_eliminator::address_eliminator (struct address_info *ad)
+ : m_ad (ad),
+ m_base_loc (strip_subreg (ad->base_term)),
+ m_base_reg (NULL_RTX),
+ m_index_loc (strip_subreg (ad->index_term)),
+ m_index_reg (NULL_RTX)
+{
+ if (m_base_loc != NULL)
+ {
+ m_base_reg = *m_base_loc;
+ lra_eliminate_reg_if_possible (m_base_loc);
+ if (m_ad->base_term2 != NULL)
+ *m_ad->base_term2 = *m_ad->base_term;
+ }
+ if (m_index_loc != NULL)
+ {
+ m_index_reg = *m_index_loc;
+ lra_eliminate_reg_if_possible (m_index_loc);
+ }
+}
+
+address_eliminator::~address_eliminator ()
+{
+ if (m_base_loc && *m_base_loc != m_base_reg)
+ {
+ *m_base_loc = m_base_reg;
+ if (m_ad->base_term2 != NULL)
+ *m_ad->base_term2 = *m_ad->base_term;
+ }
+ if (m_index_loc && *m_index_loc != m_index_reg)
+ *m_index_loc = m_index_reg;
+}
+
+/* Return true if the eliminated form of AD is a legitimate target address. */
+static bool
+valid_address_p (struct address_info *ad)
+{
+ address_eliminator eliminator (ad);
+ return valid_address_p (ad->mode, *ad->outer, ad->as);
+}
+
+#ifdef EXTRA_CONSTRAINT_STR
+/* Return true if the eliminated form of memory reference OP satisfies
+ extra memory constraint CONSTRAINT. */
+static bool
+satisfies_memory_constraint_p (rtx op, const char *constraint)
+{
+ struct address_info ad;
+
+ decompose_mem_address (&ad, op);
+ address_eliminator eliminator (&ad);
+ return EXTRA_CONSTRAINT_STR (op, *constraint, constraint);
+}
+
+/* Return true if the eliminated form of address AD satisfies extra
+ address constraint CONSTRAINT. */
+static bool
+satisfies_address_constraint_p (struct address_info *ad,
+ const char *constraint)
+{
+ address_eliminator eliminator (ad);
+ return EXTRA_CONSTRAINT_STR (*ad->outer, *constraint, constraint);
+}
+
+/* Return true if the eliminated form of address OP satisfies extra
+ address constraint CONSTRAINT. */
+static bool
+satisfies_address_constraint_p (rtx op, const char *constraint)
+{
+ struct address_info ad;
+
+ decompose_lea_address (&ad, &op);
+ return satisfies_address_constraint_p (&ad, constraint);
+}
+#endif
+
/* Initiate equivalences for LRA. As we keep original equivalences
before any elimination, we need to make copies otherwise any change
in insns might change the equivalences. */
@@ -1959,7 +2071,8 @@ process_alt_operands (int only_alternative)
#ifdef EXTRA_CONSTRAINT_STR
if (EXTRA_MEMORY_CONSTRAINT (c, p))
{
- if (EXTRA_CONSTRAINT_STR (op, c, p))
+ if (MEM_P (op)
+ && satisfies_memory_constraint_p (op, p))
win = true;
else if (spilled_pseudo_p (op))
win = true;
@@ -1978,7 +2091,7 @@ process_alt_operands (int only_alternative)
}
if (EXTRA_ADDRESS_CONSTRAINT (c, p))
{
- if (EXTRA_CONSTRAINT_STR (op, c, p))
+ if (satisfies_address_constraint_p (op, p))
win = true;
/* If we didn't already win, we can reload
@@ -2594,58 +2707,37 @@ process_alt_operands (int only_alternative)
return ok_p;
}
-/* Return 1 if ADDR is a valid memory address for mode MODE in address
- space AS, and check that each pseudo has the proper kind of hard
- reg. */
-static int
-valid_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
- rtx addr, addr_space_t as)
+/* Make reload base reg from address AD.  Return the new address
+   (base register plus displacement) on success, or NULL_RTX on failure.  */
+static rtx
+base_to_reg (struct address_info *ad)
{
-#ifdef GO_IF_LEGITIMATE_ADDRESS
- lra_assert (ADDR_SPACE_GENERIC_P (as));
- GO_IF_LEGITIMATE_ADDRESS (mode, addr, win);
- return 0;
-
- win:
- return 1;
-#else
- return targetm.addr_space.legitimate_address_p (mode, addr, 0, as);
-#endif
-}
-
-/* Return whether address AD is valid. */
+ enum reg_class cl;
+ int code = -1;
+ rtx new_inner = NULL_RTX;
+ rtx new_reg = NULL_RTX;
+ rtx insn;
+ rtx last_insn = get_last_insn ();
-static bool
-valid_address_p (struct address_info *ad)
-{
- /* Some ports do not check displacements for eliminable registers,
- so we replace them temporarily with the elimination target. */
- rtx saved_base_reg = NULL_RTX;
- rtx saved_index_reg = NULL_RTX;
- rtx *base_term = strip_subreg (ad->base_term);
- rtx *index_term = strip_subreg (ad->index_term);
- if (base_term != NULL)
- {
- saved_base_reg = *base_term;
- lra_eliminate_reg_if_possible (base_term);
- if (ad->base_term2 != NULL)
- *ad->base_term2 = *ad->base_term;
- }
- if (index_term != NULL)
- {
- saved_index_reg = *index_term;
- lra_eliminate_reg_if_possible (index_term);
- }
- bool ok_p = valid_address_p (ad->mode, *ad->outer, ad->as);
- if (saved_base_reg != NULL_RTX)
+ lra_assert (ad->base == ad->base_term && ad->disp == ad->disp_term);
+ cl = base_reg_class (ad->mode, ad->as, ad->base_outer_code,
+ get_index_code (ad));
+ new_reg = lra_create_new_reg (GET_MODE (*ad->base_term), NULL_RTX,
+ cl, "base");
+ new_inner = simplify_gen_binary (PLUS, GET_MODE (new_reg), new_reg,
+ ad->disp_term == NULL
+ ? gen_int_mode (0, ad->mode)
+ : *ad->disp_term);
+ if (!valid_address_p (ad->mode, new_inner, ad->as))
+ return NULL_RTX;
+ insn = emit_insn (gen_rtx_SET (ad->mode, new_reg, *ad->base_term));
+ code = recog_memoized (insn);
+ if (code < 0)
{
- *base_term = saved_base_reg;
- if (ad->base_term2 != NULL)
- *ad->base_term2 = *ad->base_term;
+ delete_insns_since (last_insn);
+ return NULL_RTX;
}
- if (saved_index_reg != NULL_RTX)
- *index_term = saved_index_reg;
- return ok_p;
+
+ return new_inner;
}
/* Make reload base reg + disp from address AD. Return the new pseudo. */
@@ -2855,7 +2947,7 @@ process_address_1 (int nop, rtx *before, rtx *after)
EXTRA_CONSTRAINT_STR for the validation. */
if (constraint[0] != 'p'
&& EXTRA_ADDRESS_CONSTRAINT (constraint[0], constraint)
- && EXTRA_CONSTRAINT_STR (op, constraint[0], constraint))
+ && satisfies_address_constraint_p (&ad, constraint))
return change_p;
#endif
@@ -2870,6 +2962,8 @@ process_address_1 (int nop, rtx *before, rtx *after)
3) the address is a frame address with an invalid offset.
+ 4) the address is a frame address with an invalid base.
+
All these cases involve a non-autoinc address, so there is no
point revalidating other types. */
if (ad.autoinc_p || valid_address_p (&ad))
@@ -2951,14 +3045,19 @@ process_address_1 (int nop, rtx *before, rtx *after)
int regno;
enum reg_class cl;
rtx set, insns, last_insn;
+ /* Try to reload the base into a register only if the base is invalid
+ for the address but the offset is valid, case (4) above. */
+ start_sequence ();
+ new_reg = base_to_reg (&ad);
+
/* base + disp => new base, cases (1) and (3) above. */
/* Another option would be to reload the displacement into an
index register. However, postreload has code to optimize
address reloads that have the same base and different
displacements, so reloading into an index register would
not necessarily be a win. */
- start_sequence ();
- new_reg = base_plus_disp_to_reg (&ad);
+ if (new_reg == NULL_RTX)
+ new_reg = base_plus_disp_to_reg (&ad);
insns = get_insns ();
last_insn = get_last_insn ();
/* If we generated at least two insns, try last insn source as
@@ -3581,7 +3680,7 @@ curr_insn_transform (void)
break;
#ifdef EXTRA_CONSTRAINT_STR
if (EXTRA_MEMORY_CONSTRAINT (c, constraint)
- && EXTRA_CONSTRAINT_STR (tem, c, constraint))
+ && satisfies_memory_constraint_p (tem, constraint))
break;
#endif
}
@@ -3699,6 +3798,7 @@ curr_insn_transform (void)
(ira_class_hard_regs[goal_alt[i]][0],
GET_MODE (reg), byte, mode) >= 0)))))
{
+ type = OP_INOUT;
loc = &SUBREG_REG (*loc);
mode = GET_MODE (*loc);
}
diff --git a/gcc/lra-lives.c b/gcc/lra-lives.c
index 8444adee6d8..9e8435e409e 100644
--- a/gcc/lra-lives.c
+++ b/gcc/lra-lives.c
@@ -665,9 +665,9 @@ process_bb_lives (basic_block bb, int &curr_point)
/* Mark early clobber outputs dead. */
for (reg = curr_id->regs; reg != NULL; reg = reg->next)
if (reg->type == OP_OUT && reg->early_clobber && ! reg->subreg_p)
- need_curr_point_incr = mark_regno_dead (reg->regno,
- reg->biggest_mode,
- curr_point);
+ need_curr_point_incr |= mark_regno_dead (reg->regno,
+ reg->biggest_mode,
+ curr_point);
for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next)
if (reg->type == OP_OUT && reg->early_clobber && ! reg->subreg_p)
diff --git a/gcc/opts-global.c b/gcc/opts-global.c
index 425c3c0b28f..e83766034c0 100644
--- a/gcc/opts-global.c
+++ b/gcc/opts-global.c
@@ -299,7 +299,9 @@ lipo_save_cl_args (struct cl_decoded_option *decoded)
the results of processing DECODED_OPTIONS and DECODED_OPTIONS_COUNT
in OPTS and OPTS_SET and using DC for diagnostic state. LANG_MASK
contains has a single bit set representing the current language.
- HANDLERS describes what functions to call for the options. */
+ HANDLERS describes what functions to call for the options.
+ If COMMAND_LINE is true, this is being invoked for file-level command-line
+ options, otherwise for an optimize pragma or function attribute. */
static void
read_cmdline_options (struct gcc_options *opts, struct gcc_options *opts_set,
@@ -308,7 +310,8 @@ read_cmdline_options (struct gcc_options *opts, struct gcc_options *opts_set,
location_t loc,
unsigned int lang_mask,
const struct cl_option_handlers *handlers,
- diagnostic_context *dc)
+ diagnostic_context *dc,
+ bool command_line)
{
unsigned int i;
int force_multi_module = 0;
@@ -341,7 +344,8 @@ read_cmdline_options (struct gcc_options *opts, struct gcc_options *opts_set,
read_cmdline_option (opts, opts_set,
decoded_options + i, loc, lang_mask, handlers,
dc);
- lipo_save_cl_args (decoded_options + i);
+ if (command_line)
+ lipo_save_cl_args (decoded_options + i);
}
}
@@ -393,12 +397,14 @@ set_default_handlers (struct cl_option_handlers *handlers)
/* Parse command line options and set default flag values. Do minimal
options processing. The decoded options are in *DECODED_OPTIONS
and *DECODED_OPTIONS_COUNT; settings go in OPTS, OPTS_SET and DC;
- the options are located at LOC. */
+ the options are located at LOC. If COMMAND_LINE is true, this is
+ being invoked for file-level command-line options, otherwise for
+ an optimize pragma or function attribute. */
void
decode_options (struct gcc_options *opts, struct gcc_options *opts_set,
struct cl_decoded_option *decoded_options,
unsigned int decoded_options_count,
- location_t loc, diagnostic_context *dc)
+ location_t loc, diagnostic_context *dc, bool command_line)
{
struct cl_option_handlers handlers;
@@ -415,7 +421,7 @@ decode_options (struct gcc_options *opts, struct gcc_options *opts_set,
read_cmdline_options (opts, opts_set,
decoded_options, decoded_options_count,
loc, lang_mask,
- &handlers, dc);
+ &handlers, dc, command_line);
finish_options (opts, opts_set, loc);
}
diff --git a/gcc/opts.c b/gcc/opts.c
index ac589412c3b..e999ed02cd0 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -734,13 +734,15 @@ finish_options (struct gcc_options *opts, struct gcc_options *opts_set,
sections of the .o and executable files does not work (currently)
with exception handling. This is because there is no support for
generating unwind info. If opts->x_flag_exceptions is turned on
- we need to turn off the partitioning optimization. */
+ we need to turn off the partitioning optimization.
+ This is also enforced for DWARF2-based unwinding, since the combination
+ could lead to a segfault. */
ui_except = targetm_common.except_unwind_info (opts);
if (opts->x_flag_exceptions
&& opts->x_flag_reorder_blocks_and_partition
- && (ui_except == UI_SJLJ || ui_except >= UI_TARGET))
+ && (ui_except >= UI_SJLJ))
{
if (opts_set->x_flag_reorder_blocks_and_partition)
inform (loc,
diff --git a/gcc/opts.h b/gcc/opts.h
index 790be537514..7caed791e10 100644
--- a/gcc/opts.h
+++ b/gcc/opts.h
@@ -344,7 +344,8 @@ extern void decode_options (struct gcc_options *opts,
struct cl_decoded_option *decoded_options,
unsigned int decoded_options_count,
location_t loc,
- diagnostic_context *dc);
+ diagnostic_context *dc,
+ bool command_line);
extern int option_enabled (int opt_idx, void *opts);
extern bool get_option_state (struct gcc_options *, int,
struct cl_option_state *);
diff --git a/gcc/params.def b/gcc/params.def
index b34013ccc82..3d2c913fd26 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -343,7 +343,7 @@ DEFPARAM(PARAM_MAX_PEEL_BRANCHES,
DEFPARAM(PARAM_MAX_COMPLETELY_PEELED_INSNS,
"max-completely-peeled-insns",
"The maximum number of insns of a completely peeled loop",
- 100, 0, 0)
+ 200, 0, 0)
/* The default maximum number of insns of a peeled loop, with -O2. */
DEFPARAM(PARAM_MAX_DEFAULT_COMPLETELY_PEELED_INSNS,
"max-default-completely-peeled-insns",
@@ -960,7 +960,7 @@ DEFPARAM (PARAM_ICALL_PROMOTE_PERCENT_THRESHOLD,
"icall-promote-target-percent-threshold",
"percentage threshold for direct call promotion"
" of a callee target",
- 33, 0, 100)
+ 30, 0, 100)
DEFPARAM (PARAM_ICALL_PROMOTE_COUNT_THRESHOLD,
"icall-promote-target_count-threshold",
@@ -1111,6 +1111,11 @@ DEFPARAM (PARAM_PROFILE_GENERATE_SAMPLING_PERIOD,
"sampling rate with -fprofile-generate-sampling",
100, 0, 2000000000)
+DEFPARAM (PARAM_LIPO_SAMPLING_PERIOD,
+ "lipo-sampling-period",
+ "sampling rate for lipo direct call and indirect call profile",
+ 79, 0, 2000000000)
+
DEFPARAM (PARAM_PROFILE_VALUES_TIME,
"profile-values-time",
"Enable time profiling when value profiling",
diff --git a/gcc/regcprop.c b/gcc/regcprop.c
index fd5fb1d2bf9..a1c7b9ddbfc 100644
--- a/gcc/regcprop.c
+++ b/gcc/regcprop.c
@@ -1011,7 +1011,6 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd)
unsigned int set_nregs = 0;
unsigned int regno;
rtx exp;
- hard_reg_set_iterator hrsi;
for (exp = CALL_INSN_FUNCTION_USAGE (insn); exp; exp = XEXP (exp, 1))
{
@@ -1030,8 +1029,10 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd)
}
}
- EXECUTE_IF_SET_IN_HARD_REG_SET (regs_invalidated_by_call, 0, regno, hrsi)
- if (regno < set_regno || regno >= set_regno + set_nregs)
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if ((TEST_HARD_REG_BIT (regs_invalidated_by_call, regno)
+ || HARD_REGNO_CALL_PART_CLOBBERED (regno, vd->e[regno].mode))
+ && (regno < set_regno || regno >= set_regno + set_nregs))
kill_value_regno (regno, 1, vd);
/* If SET was seen in CALL_INSN_FUNCTION_USAGE, and SET_SRC
diff --git a/gcc/simplify-got.c b/gcc/simplify-got.c
index 61a8fe89370..7d46b2683b2 100644
--- a/gcc/simplify-got.c
+++ b/gcc/simplify-got.c
@@ -169,7 +169,7 @@ rest_of_handle_simplify_got (void)
/* Since there is no usage of pic_reg now, we can remove it. */
if (use)
- remove_insn (use);
+ delete_insn (use);
targetm.got_access.clear_pic_reg ();
free (got_accesses);
htab_delete (var_table);
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index c64de3178fa..b98e507628d 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -503,6 +503,12 @@ simplify_replace_fn_rtx (rtx x, const_rtx old_rtx,
if (GET_CODE (op0) == HIGH && rtx_equal_p (XEXP (op0, 0), op1))
return op1;
+ /* (lo_sum (high x) (const (plus x ofs))) -> (const (plus x ofs)) */
+ if (GET_CODE (op0) == HIGH && GET_CODE (op1) == CONST
+ && GET_CODE(XEXP (op1, 0)) == PLUS
+ && rtx_equal_p (XEXP (XEXP (op1, 0), 0), XEXP (op0, 0)))
+ return op1;
+
if (op0 == XEXP (x, 0) && op1 == XEXP (x, 1))
return x;
return gen_rtx_LO_SUM (mode, op0, op1);
diff --git a/gcc/system.h b/gcc/system.h
index 42bc509f2cd..40e3cfa07a5 100644
--- a/gcc/system.h
+++ b/gcc/system.h
@@ -928,6 +928,7 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN;
TARGET_HANDLE_PRAGMA_EXTERN_PREFIX \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD \
+ TARGET_RELAXED_ORDERING
/* Arrays that were deleted in favor of a functional interface. */
#pragma GCC poison built_in_decls implicit_built_in_decls
diff --git a/gcc/target.def b/gcc/target.def
index 89c2c637621..5dab85edde3 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3218,6 +3218,17 @@ If not defined, the default is to return @code{NULL_RTX}.",
rtx, (rtx reg),
hook_rtx_rtx_null)
+/* Given a register return the mode of the corresponding DWARF frame
+ register. */
+DEFHOOK
+(dwarf_frame_reg_mode,
+ "Given a register, this hook should return the mode which the\n\
+corresponding Dwarf frame register should have. This is normally\n\
+used to return a smaller mode than the raw mode to prevent\n\
+call-clobbered parts of a register from altering the frame register size.",
+ enum machine_mode, (int regno),
+ default_dwarf_frame_reg_mode)
+
/* If expand_builtin_init_dwarf_reg_sizes needs to fill in table
entries not corresponding directly to registers below
FIRST_PSEUDO_REGISTER, this hook should generate the necessary
@@ -5359,19 +5370,6 @@ for the primary source file, immediately after printing\n\
this to be done. The default is false.",
bool, false)
-/* True if the target is allowed to reorder memory accesses unless
- synchronization is explicitly requested. */
-DEFHOOKPOD
-(relaxed_ordering,
- "If set to @code{true}, means that the target's memory model does not\n\
-guarantee that loads which do not depend on one another will access\n\
-main memory in the order of the instruction stream; if ordering is\n\
-important, an explicit memory barrier must be used. This is true of\n\
-many recent processors which implement a policy of ``relaxed,''\n\
-``weak,'' or ``release'' memory consistency, such as Alpha, PowerPC,\n\
-and ia64. The default is @code{false}.",
- bool, false)
-
/* Returns true if we should generate exception tables for use with the
ARM EABI. The effects the encoding of function exception specifications. */
DEFHOOKPOD
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index ff5c06221fb..4721186e5a8 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1444,6 +1444,19 @@ default_debug_unwind_info (void)
return UI_NONE;
}
+/* Determine the correct mode for a Dwarf frame register that represents
+ register REGNO. */
+
+enum machine_mode
+default_dwarf_frame_reg_mode (int regno)
+{
+ enum machine_mode save_mode = reg_raw_mode[regno];
+
+ if (HARD_REGNO_CALL_PART_CLOBBERED (regno, save_mode))
+ save_mode = choose_hard_reg_mode (regno, 1, true);
+ return save_mode;
+}
+
/* To be used by targets where reg_raw_mode doesn't return the right
mode for registers used in apply_builtin_return and apply_builtin_arg. */
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 516e14a48a5..6a759205c44 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -195,6 +195,7 @@ extern int default_label_align_max_skip (rtx);
extern int default_jump_align_max_skip (rtx);
extern section * default_function_section(tree decl, enum node_frequency freq,
bool startup, bool exit);
+extern enum machine_mode default_dwarf_frame_reg_mode (int);
extern enum machine_mode default_get_reg_raw_mode (int);
extern void *default_get_pch_validity (size_t *);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 225de8f7a3e..4276996d6c6 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,24 @@
+2015-03-26 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ Backport r214254 and related tests from mainline
+ * gcc.target/powerpc/swaps-p8-1.c: New test.
+ * gcc.target/powerpc/swaps-p8-2.c: New test.
+ * gcc.target/powerpc/swaps-p8-3.c: New test.
+ * gcc.target/powerpc/swaps-p8-4.c: New test.
+ * gcc.target/powerpc/swaps-p8-5.c: New test.
+ * gcc.target/powerpc/swaps-p8-6.c: New test.
+ * gcc.target/powerpc/swaps-p8-7.c: New test.
+ * gcc.target/powerpc/swaps-p8-8.c: New test.
+ * gcc.target/powerpc/swaps-p8-9.c: New test.
+ * gcc.target/powerpc/swaps-p8-10.c: New test.
+ * gcc.target/powerpc/swaps-p8-11.c: New test.
+ * gcc.target/powerpc/swaps-p8-12.c: New test.
+ * gcc.target/powerpc/swaps-p8-13.c: New test.
+ * gcc.target/powerpc/swaps-p8-14.c: New test.
+ * gcc.target/powerpc/swaps-p8-15.c: New test.
+ * gcc.target/powerpc/swaps-p8-16.c: New test.
+ * gcc.target/powerpc/swaps-p8-17.c: New test.
+
2015-01-20 Marek Polacek <polacek@redhat.com>
Backport from mainline
diff --git a/gcc/testsuite/g++.dg/abi/aarch64_guard1.C b/gcc/testsuite/g++.dg/abi/aarch64_guard1.C
index ca1778b8730..e2669a89fbf 100644
--- a/gcc/testsuite/g++.dg/abi/aarch64_guard1.C
+++ b/gcc/testsuite/g++.dg/abi/aarch64_guard1.C
@@ -13,5 +13,4 @@ int *foo ()
}
// { dg-final { scan-assembler _ZGVZ3foovE1x,8,8 } }
-// { dg-final { scan-tree-dump "_ZGVZ3foovE1x & 1" "original" } }
-// { dg-final { cleanup-tree-dump "original" } }
+// { dg-final { scan-tree-dump "& 1" "original" } }
diff --git a/gcc/testsuite/g++.dg/ext/mv18.C b/gcc/testsuite/g++.dg/ext/mv18.C
new file mode 100644
index 00000000000..1f024de9b95
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv18.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target pie } */
+/* { dg-options "-O2 -fPIE -pie" } */
+
+#include "mv1.C"
diff --git a/gcc/testsuite/g++.dg/ext/mv19.C b/gcc/testsuite/g++.dg/ext/mv19.C
new file mode 100644
index 00000000000..d1ea788745f
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv19.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target pie } */
+/* { dg-options "-O2 -fPIE -pie -march=x86-64" } */
+
+#include "mv14.C"
diff --git a/gcc/testsuite/g++.dg/ext/mv20.C b/gcc/testsuite/g++.dg/ext/mv20.C
new file mode 100644
index 00000000000..98f7408e1fc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv20.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target pie } */
+/* { dg-options "-O2 -fPIE -pie -march=x86-64" } */
+
+#include "mv15.C"
diff --git a/gcc/testsuite/g++.dg/ext/mv21.C b/gcc/testsuite/g++.dg/ext/mv21.C
new file mode 100644
index 00000000000..9708ad95019
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv21.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target static } */
+/* { dg-options "-O2 -static" } */
+
+#include "mv1.C"
diff --git a/gcc/testsuite/g++.dg/ext/mv22.C b/gcc/testsuite/g++.dg/ext/mv22.C
new file mode 100644
index 00000000000..2550136fdac
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv22.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target static } */
+/* { dg-options "-O2 -static -march=x86-64" } */
+
+#include "mv14.C"
diff --git a/gcc/testsuite/g++.dg/ext/mv23.C b/gcc/testsuite/g++.dg/ext/mv23.C
new file mode 100644
index 00000000000..f00afb01f15
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/mv23.C
@@ -0,0 +1,7 @@
+/* Test case to check if Multiversioning works. */
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target static } */
+/* { dg-options "-O2 -static -march=x86-64" } */
+
+#include "mv15.C"
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C b/gcc/testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C
index b34b937fda0..78fd0c58382 100644
--- a/gcc/testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/indir-call-prof_0.C
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile --param=lipo-sampling-period=1" } */
struct A {
A () {}
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls.h b/gcc/testsuite/g++.dg/tree-prof/lipo/tls.h
new file mode 100644
index 00000000000..5eb102d74ce
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls.h
@@ -0,0 +1,16 @@
+extern int NextId();
+
+class TLSClass {
+ public:
+ TLSClass() {
+ id = NextId();
+ bar = 1;
+ }
+ ~TLSClass() {}
+ int id;
+ int bar;
+};
+extern TLSClass* NextTLSClass();
+extern void *SetTLSClass(TLSClass *a);
+extern TLSClass *GetTLSClass();
+extern thread_local TLSClass* current_tls_;
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls2.h b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2.h
new file mode 100644
index 00000000000..9ba0945c2a6
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2.h
@@ -0,0 +1,15 @@
+extern int NextId();
+
+class TLSClass {
+ public:
+ TLSClass() {
+ id = NextId();
+ bar = 1;
+ }
+ ~TLSClass() {}
+ int id;
+ int bar;
+};
+extern TLSClass* NextTLSClass();
+extern void *SetTLSClass(TLSClass *a);
+extern TLSClass *GetTLSClass();
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_0.C b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_0.C
new file mode 100644
index 00000000000..9ccd5b96b02
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_0.C
@@ -0,0 +1,10 @@
+// { dg-options "-std=c++11 -O2 --param=lipo-sampling-period=1" }
+#include "tls2.h"
+
+static thread_local TLSClass* current_tls_ = NextTLSClass();
+void *SetTLSClass(TLSClass *a) {
+ current_tls_ = a;
+}
+TLSClass *GetTLSClass() {
+ return current_tls_;
+}
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_1.C b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_1.C
new file mode 100644
index 00000000000..43c10852688
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls2_1.C
@@ -0,0 +1,31 @@
+// { dg-options "-std=c++11 -O2 --param=lipo-sampling-period=1" }
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+#include "tls2.h"
+TLSClass* NextTLSClass() {
+ return new TLSClass();
+}
+int NextId() {
+ static int id = 0;
+ return id++;
+}
+static thread_local TLSClass* current_tls2_ = NextTLSClass();
+void *SetTLSClass2(TLSClass *a) {
+ current_tls2_ = a;
+}
+int main() {
+ int i = 0;
+ if (GetTLSClass()->id != i++)
+ abort();
+ TLSClass *A = NextTLSClass();
+ SetTLSClass(A);
+ if (GetTLSClass()->id != i++)
+ abort();
+ if (current_tls2_->id != i++)
+ abort();
+ A = NextTLSClass();
+ SetTLSClass2(A);
+ if (current_tls2_->id != i++)
+ abort();
+}
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls_0.C b/gcc/testsuite/g++.dg/tree-prof/lipo/tls_0.C
new file mode 100644
index 00000000000..f3e5e376e42
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls_0.C
@@ -0,0 +1,10 @@
+// { dg-options "-std=c++11 -O2 --param=lipo-sampling-period=1" }
+#include "tls.h"
+
+thread_local TLSClass* current_tls_ = NextTLSClass();
+void *SetTLSClass(TLSClass *a) {
+ current_tls_ = a;
+}
+TLSClass *GetTLSClass() {
+ return current_tls_;
+}
diff --git a/gcc/testsuite/g++.dg/tree-prof/lipo/tls_1.C b/gcc/testsuite/g++.dg/tree-prof/lipo/tls_1.C
new file mode 100644
index 00000000000..fcf8c210167
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-prof/lipo/tls_1.C
@@ -0,0 +1,38 @@
+// { dg-options "-std=c++11 -O2 --param=lipo-sampling-period=1" }
+#include <stdio.h>
+#include <stdlib.h>
+#include <new>
+#include "tls.h"
+TLSClass* NextTLSClass() {
+ return new TLSClass();
+}
+int NextId() {
+ static int id = 0;
+ return id++;
+}
+void *SetTLSClassHere(TLSClass *a) {
+ current_tls_ = a;
+}
+thread_local TLSClass* current_tls2_ = NextTLSClass();
+void *SetTLSClass2(TLSClass *a) {
+ current_tls2_ = a;
+}
+int main() {
+ int i = 0;
+ if (GetTLSClass()->id != i++)
+ abort();
+ TLSClass *A = NextTLSClass();
+ SetTLSClass(A);
+ if (GetTLSClass()->id != i++)
+ abort();
+ A = NextTLSClass();
+ SetTLSClassHere(A);
+ if (GetTLSClass()->id != i++)
+ abort();
+ if (current_tls2_->id != i++)
+ abort();
+ A = NextTLSClass();
+ SetTLSClass2(A);
+ if (current_tls2_->id != i++)
+ abort();
+}
diff --git a/gcc/testsuite/g++.dg/warn/Warray-bounds-6.C b/gcc/testsuite/g++.dg/warn/Warray-bounds-6.C
new file mode 100644
index 00000000000..f2e5f2f597a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/warn/Warray-bounds-6.C
@@ -0,0 +1,26 @@
+// { dg-do compile }
+// { dg-options "-O3 -Warray-bounds" }
+
+struct type {
+ bool a, b;
+ bool get_b() { return b; }
+};
+
+type stuff[9u];
+
+void bar();
+
+void foo()
+{
+ for(unsigned i = 0u; i < 9u; i++)
+ {
+ if(!stuff[i].a)
+ continue;
+
+ bar();
+
+ for(unsigned j = i + 1u; j < 9u; j++)
+ if(stuff[j].a && stuff[j].get_b()) // { dg-bogus "above array bounds" }
+ return;
+ }
+}
diff --git a/gcc/testsuite/gcc.dg/19277289.c b/gcc/testsuite/gcc.dg/19277289.c
new file mode 100644
index 00000000000..9f7703e37cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/19277289.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -w" } */
+
+struct xktime {
+ int tv;
+};
+typedef struct xktime xktime_t;
+
+__attribute__((always_inline)) xktime_t xktime_set(void)
+{
+ return (xktime_t) { .tv = 0 };
+}
+
+#pragma GCC optimize ("O0")
+
+extern void foo (xktime_t);
+
+void LCD_WaitTE(void)
+{
+ foo (xktime_set());
+}
diff --git a/gcc/testsuite/gcc.dg/Warray-bounds-12.c b/gcc/testsuite/gcc.dg/Warray-bounds-12.c
new file mode 100644
index 00000000000..ef26c6596bf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Warray-bounds-12.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Warray-bounds" } */
+/* { dg-additional-options "-mssse3" { target x86_64-*-* i?86-*-* } } */
+
+void foo(short a[], short m)
+{
+ int i, j;
+ int f1[10];
+ short nc;
+
+ nc = m + 1;
+ if (nc > 3)
+ {
+ for (i = 0; i <= nc; i++)
+ {
+ f1[i] = f1[i] + 1;
+ }
+ }
+
+ for (i = 0, j = m; i < nc; i++, j--)
+ {
+ a[i] = f1[i]; /* { dg-bogus "above array bounds" } */
+ a[j] = i;
+ }
+ return;
+}
diff --git a/gcc/testsuite/gcc.dg/Warray-bounds-13.c b/gcc/testsuite/gcc.dg/Warray-bounds-13.c
new file mode 100644
index 00000000000..7b40a83887d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Warray-bounds-13.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -Warray-bounds" } */
+
+extern char *bar[17];
+
+int foo(int argc, char **argv)
+{
+ int i;
+ int n = 0;
+
+ for (i = 0; i < argc; i++)
+ n++;
+
+ for (i = 0; i < argc; i++)
+ argv[i] = bar[i + n]; /* { dg-bogus "above array bounds" } */
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/memcpy-4.c b/gcc/testsuite/gcc.dg/memcpy-4.c
index cff477a981e..b17b369c5c6 100644
--- a/gcc/testsuite/gcc.dg/memcpy-4.c
+++ b/gcc/testsuite/gcc.dg/memcpy-4.c
@@ -1,14 +1,8 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -fdump-rtl-expand" } */
+/* { dg-options "-O2" } */
-#ifdef __mips
-__attribute__((nomips16))
-#endif
void
f1 (char *p)
{
__builtin_memcpy (p, "12345", 5);
}
-
-/* { dg-final { scan-rtl-dump "mem/u.*mem/u" "expand" { target mips*-*-* } } } */
-/* { dg-final { cleanup-rtl-dump "expand" } } */
diff --git a/gcc/testsuite/gcc.dg/pr64277.c b/gcc/testsuite/gcc.dg/pr64277.c
new file mode 100644
index 00000000000..c6ef33119a7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr64277.c
@@ -0,0 +1,23 @@
+/* PR tree-optimization/64277 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -Wall -Werror -fdump-tree-cunroll-details" } */
+/* { dg-final { scan-tree-dump "loop with 5 iterations completely unrolled" "cunroll" } } */
+/* { dg-final { scan-tree-dump "loop with 6 iterations completely unrolled" "cunroll" } } */
+/* { dg-final { cleanup-tree-dump "cunroll" } } */
+
+int f1[10];
+void test1 (short a[], short m, unsigned short l)
+{
+ int i = l;
+ for (i = i + 5; i < m; i++)
+ f1[i] = a[i]++;
+}
+
+void test2 (short a[], short m, short l)
+{
+ int i;
+ if (m > 5)
+ m = 5;
+ for (i = m; i > l; i--)
+ f1[i] = a[i]++;
+}
diff --git a/gcc/testsuite/gcc.dg/torture/mips-hilo-2.c b/gcc/testsuite/gcc.dg/torture/mips-hilo-2.c
index dbe949307df..78f7710d67b 100644
--- a/gcc/testsuite/gcc.dg/torture/mips-hilo-2.c
+++ b/gcc/testsuite/gcc.dg/torture/mips-hilo-2.c
@@ -5,6 +5,7 @@
extern void abort (void);
extern void exit (int);
+#if __mips_isa_rev <= 5
unsigned int g;
unsigned __attribute__ ((nomips16)) long long f (unsigned int x)
@@ -15,13 +16,16 @@ unsigned __attribute__ ((nomips16)) long long f (unsigned int x)
asm ("mflo\t%0" : "=r" (g) : "l" (u.parts[1]));
return u.ll;
}
+#endif
int __attribute__ ((nomips16)) main ()
{
+#if __mips_isa_rev <= 5
union { unsigned long long ll; unsigned int parts[2]; } u;
u.ll = f (0x12345678);
if (g != u.parts[1])
abort ();
+#endif
exit (0);
}
diff --git a/gcc/testsuite/gcc.dg/torture/pr19683-1.c b/gcc/testsuite/gcc.dg/torture/pr19683-1.c
index 05bf174183b..aa7205f7c6f 100644
--- a/gcc/testsuite/gcc.dg/torture/pr19683-1.c
+++ b/gcc/testsuite/gcc.dg/torture/pr19683-1.c
@@ -14,6 +14,7 @@ extern void exit (int);
#define IN(X) unsigned int x##X = ptr[0]
#define OUT(X) ptr[0] = x##X
+#if __mips_isa_rev <= 5
union u { unsigned long long ll; unsigned int i[2]; };
unsigned int __attribute__ ((nomips16))
@@ -28,15 +29,18 @@ foo (volatile unsigned int *ptr)
asm ("#" : "=l" (result) : "l" (u.i[1]));
return result;
}
+#endif
int __attribute__ ((nomips16))
main (void)
{
+#if __mips_isa_rev <= 5
unsigned int array[] = { 1000 * 1000 * 1000 };
union u u;
u.ll = (unsigned long long) array[0] * array[0];
if (foo (array) != u.i[1])
abort ();
+#endif
exit (0);
}
diff --git a/gcc/testsuite/gcc.dg/tree-prof/cold_partition_label.c b/gcc/testsuite/gcc.dg/tree-prof/cold_partition_label.c
index a03aad7f6d8..643707f75f8 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/cold_partition_label.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/cold_partition_label.c
@@ -35,4 +35,6 @@ main (int argc, char *argv[])
return 0;
}
+/* { dg-final-use { scan-assembler "foo\[._\]+cold\[\._\]+0" } } */
+/* { dg-final-use { scan-assembler "size\[ \ta-zA-Z0-0\]+foo\[._\]+cold\[\._\]+0" } } */
/* { dg-final-use { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c b/gcc/testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c
index ff103a8c561..a5aeffb17da 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/lipo/ic-misattribution-1_0.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-ipa-profile" } */
+/* { dg-options "-O2 -fdump-ipa-profile --param=lipo-sampling-period=1" } */
extern void callee (void);
extern void caller (void (*func) (void));
diff --git a/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c b/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c
index 29a216c690b..8d78bb58940 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof-single_0.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile --param=lipo-sampling-period=1" } */
static int a1 (void)
{
diff --git a/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c b/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c
index 6bd048df680..3b681b2e566 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/lipo/indir-call-prof_0.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile" } */
+/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile --param=lipo-sampling-period=3" } */
extern void setp (int (**pp) (void), int i);
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/restrict-5.c b/gcc/testsuite/gcc.dg/tree-ssa/restrict-5.c
new file mode 100644
index 00000000000..d6c240aae03
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/restrict-5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fno-strict-aliasing -fdump-tree-lim1-details" } */
+
+static inline __attribute__((always_inline))
+void f(int * __restrict__ r,
+ int a[__restrict__ 16][16],
+ int b[__restrict__ 16][16],
+ int i, int j)
+{
+ int x;
+ *r = 0;
+ for (x = 1; x < 16; ++x)
+ *r = *r + a[i][x] * b[x][j];
+}
+
+void g(int *r, int a[16][16], int b[16][16], int i, int j)
+{
+ f (r, a, b, i ,j);
+}
+
+/* We should apply store motion to the store to *r. */
+
+/* { dg-final { scan-tree-dump "Executing store motion of \\\*r" "lim1" } } */
+/* { dg-final { cleanup-tree-dump "lim1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c
new file mode 100644
index 00000000000..6e3cb52b85d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
+
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-26.c b/gcc/testsuite/gcc.dg/vect/slp-26.c
index 09a1ecd9c42..2024af947cb 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-26.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-26.c
@@ -46,7 +46,9 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target !mips*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target mips*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target !mips*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target mips*-*-* } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tree-vect.h b/gcc/testsuite/gcc.dg/vect/tree-vect.h
index ed59d7976b9..600ef6c6712 100644
--- a/gcc/testsuite/gcc.dg/vect/tree-vect.h
+++ b/gcc/testsuite/gcc.dg/vect/tree-vect.h
@@ -66,6 +66,8 @@ check_vect (void)
if (a != 1)
exit (0);
}
+#elif defined(__mips)
+ asm volatile ("or.v $w0,$w0,$w0");
#endif
signal (SIGILL, SIG_DFL);
}
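The MSA line added above slots into the testsuite's usual runtime probe: check_vect installs a SIGILL handler, executes one instruction from the vector ISA under test, and exits quietly if the CPU traps. The sketch below is a standalone, simplified rendition of that pattern; it uses an empty asm so it runs anywhere, whereas the real header substitutes a target-specific instruction such as the or.v above.

/* Simplified, self-contained sketch of the SIGILL-probe idiom (an
   approximation of check_vect, not a copy of it).  */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void
sig_ill_handler (int sig)
{
  (void) sig;
  exit (0);                     /* ISA not supported: pass trivially */
}

int
main (void)
{
  signal (SIGILL, sig_ill_handler);
  __asm__ volatile ("");        /* stand-in for the ISA-specific probe */
  signal (SIGILL, SIG_DFL);
  printf ("vector ISA assumed available\n");
  return 0;
}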
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index e8d866b991c..207eb8f691e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -24,6 +24,10 @@ load_lib clearcap.exp
global DEFAULT_VECTCFLAGS
set DEFAULT_VECTCFLAGS ""
+# Set up additional flags for tests that require multiple runs.
+global MULTI_VECTCFLAGS
+set MULTI_VECTCFLAGS ""
+
# If the target system supports vector instructions, the default action
# for a test is 'run', otherwise it's 'compile'. Save current default.
# Executing vector instructions on a system without hardware vector support
@@ -39,6 +43,20 @@ if ![check_vect_support_and_set_flags] {
return
}
+proc dg-multi-runtest { testcases flags default-extra-flags } {
+ global MULTI_VECTCFLAGS
+ set DEFAULT_VECTCFLAGS ""
+
+ if { [llength $MULTI_VECTCFLAGS] > 0 } {
+ foreach extra_flags $MULTI_VECTCFLAGS {
+ set new_flags [string trim "$extra_flags $flags"]
+ dg-runtest $testcases $new_flags ${default-extra-flags}
+ }
+ } else {
+ dg-runtest $testcases $flags ${default-extra-flags}
+ }
+}
+
# These flags are used for all targets.
lappend DEFAULT_VECTCFLAGS "-ftree-vectorize" "-fno-vect-cost-model" "-fno-common"
@@ -62,12 +80,12 @@ lappend O_VECTCFLAGS "-fdump-tree-vect-details"
lappend DEFAULT_VECTCFLAGS "-O2"
# Tests that should be run without generating dump info
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/nodump-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/nodump-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# "-O -fdump-tree-veclower2"
lappend VEC_FLAGS "-O" "-fdump-tree-veclower2"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vec-scal-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/vec-scal-*.\[cS\]]] \
"" $VEC_FLAGS
set VECT_SLP_CFLAGS $DEFAULT_VECTCFLAGS
@@ -81,16 +99,17 @@ if { [check_effective_target_lto] } {
lappend VECT_ADDITIONAL_FLAGS "-flto -ffat-lto-objects"
}
foreach flags $VECT_ADDITIONAL_FLAGS {
- dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \
- $flags $DEFAULT_VECTCFLAGS
- dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]] \
- $flags $DEFAULT_VECTCFLAGS
- dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/slp-*.\[cS\]]] \
+ dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \
$flags $DEFAULT_VECTCFLAGS
- dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/bb-slp*.\[cS\]]] \
+ dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]] \
+ $flags $DEFAULT_VECTCFLAGS
+ dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/slp-*.\[cS\]]] \
+ $flags $DEFAULT_VECTCFLAGS
+ dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/bb-slp*.\[cS\]]] \
$flags $VECT_SLP_CFLAGS
}
+
#### Tests with special options
global SAVED_DEFAULT_VECTCFLAGS
set SAVED_DEFAULT_VECTCFLAGS $DEFAULT_VECTCFLAGS
@@ -99,43 +118,43 @@ set SAVED_VECT_SLP_CFLAGS $VECT_SLP_CFLAGS
# --param vect-max-version-for-alias-checks=0 tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "--param" "vect-max-version-for-alias-checks=0"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -ffast-math tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ffast-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-\[ipsv\]*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-\[ipsv\]*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -ffast-math SLP tests
set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
lappend VECT_SLP_CFLAGS "-ffast-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-bb-slp-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-bb-slp-*.\[cS\]]] \
"" $VECT_SLP_CFLAGS
# -fno-fast-math tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-fast-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-fast-math-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-fast-math-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-math-errno tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-math-errno"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-math-errno-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-math-errno-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fwrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fwrapv"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -ftrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ftrapv"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fdump-tree-dceloop-details tests
@@ -147,98 +166,98 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dump-tree-dceloop-*.\[cS\]]]
# -fno-tree-dce tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-dce"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dce-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dce-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fsection-anchors tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fsection-anchors"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/section-anchors-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/section-anchors-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# alignment-sensitive -fsection-anchors tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fsection-anchors" \
"-fdump-ipa-increase_alignment-details"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/aligned-section-anchors-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/aligned-section-anchors-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-section-anchors tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-section-anchors"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-section-anchors-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-section-anchors-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -funswitch-loops tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-funswitch-loops"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/unswitch-loops-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/unswitch-loops-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-trapping-math tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-trapping-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-trapping-math-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-trapping-math-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-scev-cprop
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-vect-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-vect-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-scev-cprop
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-pr*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-pr*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-scev-cprop
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-outer-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-outer-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-scev-cprop -fno-tree-reassoc
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" "-fno-tree-reassoc"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-noreassoc-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-noreassoc-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-scev-cprop
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-slp-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-slp-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-dominator-opts
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-dominator-opts"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dom-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dom-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-pre
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-pre"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-pre-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-pre-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# With -Os
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-Os"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/Os-vect-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/Os-vect-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# With --param ggc-min-expand=0 --param ggc-min-heapsize=0
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "--param" "ggc-min-expand=0" "--param" "ggc-min-heapsize=0"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/ggc-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/ggc-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -ftree-loop-if-convert-stores
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ftree-loop-if-convert-stores"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/if-cvt-stores-vect-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/if-cvt-stores-vect-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# With -O3.
@@ -248,39 +267,39 @@ lappend DEFAULT_VECTCFLAGS "-O3" "-fno-ipa-cp-clone"
if [istarget "spu-*-*"] {
lappend DEFAULT_VECTCFLAGS "-funroll-loops"
}
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/O3-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/O3-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# With -O1
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/O1-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/O1-*.\[cS\]]] \
"" $O1_VECTCFLAGS
# With -O
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/O-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/O-*.\[cS\]]] \
"" $O_VECTCFLAGS
# -fno-tree-reassoc
set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
lappend VECT_SLP_CFLAGS "-fno-tree-reassoc"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-reassoc-bb-slp-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-reassoc-bb-slp-*.\[cS\]]] \
"" $VECT_SLP_CFLAGS
# -fno-tree-fre
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-fre"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-fre-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-fre-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-fre -fno-tree-pre
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-fre" "-fno-tree-pre"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-fre-pre*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-fre-pre*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-tree-sra
set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
lappend VECT_SLP_CFLAGS "-fno-tree-sra"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-sra-bb-slp-*.\[cS\]]] \
+dg-multi-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-sra-bb-slp-*.\[cS\]]] \
"" $VECT_SLP_CFLAGS
diff --git a/gcc/testsuite/gcc.target/aarch64/sisd-shft-neg_1.c b/gcc/testsuite/gcc.target/aarch64/sisd-shft-neg_1.c
new file mode 100644
index 00000000000..c091657cb57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sisd-shft-neg_1.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+
+extern void abort (void);
+
+#define force_simd_si(v) asm volatile ("mov %s0, %1.s[0]" :"=w" (v) :"w" (v) :)
+
+unsigned int
+shft_add (unsigned int a, unsigned int b)
+{
+ unsigned int c;
+
+ force_simd_si (a);
+ force_simd_si (b);
+ c = a >> b;
+ force_simd_si (c);
+
+ return c + b;
+}
+
+int
+main (void)
+{
+ unsigned int i = 0;
+ unsigned int a = 0xdeadbeef;
+
+ for (i = 0; i < 32; i++)
+ {
+ unsigned int exp = (a / (1 << i) + i);
+ unsigned int got = shft_add (a, i);
+
+ if (exp != got)
+ abort ();
+ }
+
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi-2.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi-2.c
new file mode 100644
index 00000000000..34579d52695
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi-2.c
@@ -0,0 +1,67 @@
+/* PR target/65368 */
+/* { dg-do assemble { target bmi2 } } */
+/* { dg-options "-O2 -mbmi2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+unsigned int a;
+unsigned long long b;
+
+#define A __attribute__((noinline, noclone))
+
+A unsigned int f1 (void) { return _bzhi_u32 (a, 0); }
+A unsigned int f2 (unsigned int x) { return _bzhi_u32 (x, 0); }
+A unsigned int f3 (void) { return _bzhi_u32 (a, 5); }
+A unsigned int f4 (unsigned int x) { return _bzhi_u32 (x, 5); }
+A unsigned int f5 (void) { return _bzhi_u32 (a, 31); }
+A unsigned int f6 (unsigned int x) { return _bzhi_u32 (x, 31); }
+A unsigned int f7 (void) { return _bzhi_u32 (a, 32); }
+A unsigned int f8 (unsigned int x) { return _bzhi_u32 (x, 32); }
+A unsigned int f9 (void) { return _bzhi_u32 (a, 37); }
+A unsigned int f10 (unsigned int x) { return _bzhi_u32 (x, 37); }
+A unsigned int f11 (void) { return _bzhi_u32 (a, 257); }
+A unsigned int f12 (unsigned int x) { return _bzhi_u32 (x, 257); }
+A unsigned int f13 (void) { return _bzhi_u32 (a, 289); }
+A unsigned int f14 (unsigned int x) { return _bzhi_u32 (x, 289); }
+#ifdef __x86_64__
+A unsigned long long f21 (void) { return _bzhi_u64 (b, 0); }
+A unsigned long long f22 (unsigned long long x) { return _bzhi_u64 (x, 0); }
+A unsigned long long f23 (void) { return _bzhi_u64 (b, 5); }
+A unsigned long long f24 (unsigned long long x) { return _bzhi_u64 (x, 5); }
+A unsigned long long f25 (void) { return _bzhi_u64 (b, 63); }
+A unsigned long long f26 (unsigned long long x) { return _bzhi_u64 (x, 63); }
+A unsigned long long f27 (void) { return _bzhi_u64 (b, 64); }
+A unsigned long long f28 (unsigned long long x) { return _bzhi_u64 (x, 64); }
+A unsigned long long f29 (void) { return _bzhi_u64 (b, 69); }
+A unsigned long long f30 (unsigned long long x) { return _bzhi_u64 (x, 69); }
+A unsigned long long f31 (void) { return _bzhi_u64 (b, 257); }
+A unsigned long long f32 (unsigned long long x) { return _bzhi_u64 (x, 257); }
+A unsigned long long f33 (void) { return _bzhi_u64 (b, 321); }
+A unsigned long long f34 (unsigned long long x) { return _bzhi_u64 (x, 321); }
+#endif
+
+static void
+bmi2_test ()
+{
+ a = -1U;
+ b = -1ULL;
+ if (f1 () != 0 || f2 (-1U) != 0
+ || f3 () != 0x1f || f4 (-1U) != 0x1f
+ || f5 () != 0x7fffffffU || f6 (-1U) != 0x7fffffffU
+ || f7 () != -1U || f8 (-1U) != -1U
+ || f9 () != -1U || f10 (-1U) != -1U
+ || f11 () != 1 || f12 (-1U) != 1
+ || f13 () != -1U || f14 (-1U) != -1U)
+ abort ();
+#ifdef __x86_64__
+ if (f21 () != 0 || f22 (-1ULL) != 0
+ || f23 () != 0x1f || f24 (-1ULL) != 0x1f
+ || f25 () != 0x7fffffffffffffffULL || f26 (-1ULL) != 0x7fffffffffffffffULL
+ || f27 () != -1ULL || f28 (-1ULL) != -1ULL
+ || f29 () != -1ULL || f30 (-1ULL) != -1ULL
+ || f31 () != 1 || f32 (-1ULL) != 1
+ || f33 () != -1ULL || f34 (-1ULL) != -1ULL)
+ abort ();
+#endif
+}
diff --git a/gcc/testsuite/gcc.target/i386/noplt-1.c b/gcc/testsuite/gcc.target/i386/noplt-1.c
new file mode 100644
index 00000000000..d9e5d6ea7ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noplt-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target x86_64-*-linux* } } */
+/* { dg-options "-fno-pic" } */
+
+__attribute__ ((noplt))
+void foo();
+
+int main()
+{
+ foo();
+ return 0;
+}
+
+/* { dg-final { scan-assembler "call\[ \t\]\\*.*foo.*@GOTPCREL\\(%rip\\)" } } */
diff --git a/gcc/testsuite/gcc.target/i386/noplt-2.c b/gcc/testsuite/gcc.target/i386/noplt-2.c
new file mode 100644
index 00000000000..4df0618b679
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noplt-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target x86_64-*-linux* } } */
+/* { dg-options "-O2 -fno-pic" } */
+
+
+__attribute__ ((noplt))
+int foo();
+
+int main()
+{
+ return foo();
+}
+
+/* { dg-final { scan-assembler "jmp\[ \t\]\\*.*foo.*@GOTPCREL\\(%rip\\)" } } */
diff --git a/gcc/testsuite/gcc.target/i386/noplt-3.c b/gcc/testsuite/gcc.target/i386/noplt-3.c
new file mode 100644
index 00000000000..e2a6f938649
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noplt-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target x86_64-*-linux* } } */
+/* { dg-options "-fno-pic -fno-plt" } */
+
+void foo();
+
+int main()
+{
+ foo();
+ return 0;
+}
+
+/* { dg-final { scan-assembler "call\[ \t\]\\*.*foo.*@GOTPCREL\\(%rip\\)" } } */
diff --git a/gcc/testsuite/gcc.target/i386/noplt-4.c b/gcc/testsuite/gcc.target/i386/noplt-4.c
new file mode 100644
index 00000000000..d9039dd77c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noplt-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target x86_64-*-linux* } } */
+/* { dg-options "-O2 -fno-pic -fno-plt" } */
+
+int foo();
+
+int main()
+{
+ return foo();
+}
+
+/* { dg-final { scan-assembler "jmp\[ \t\]\\*.*foo.*@GOTPCREL\\(%rip\\)" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc/testsuite/gcc.target/i386/pr52252-atom.c
new file mode 100644
index 00000000000..715b4594382
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr52252-atom.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-assembler "palignr" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc/testsuite/gcc.target/i386/pr52252-core.c
new file mode 100644
index 00000000000..ac857a5fe7e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr52252-core.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ byte in0 = in[0];
+ byte in1 = in[1];
+ byte in2 = in[2];
+ byte out0, out1, out2, out3;
+ out0 = in0 + in1;
+ out1 = in0 + in2;
+ out2 = in1 + in2;
+ out3 = in0 + in1 + in2;
+ out[0] = out0;
+ out[1] = out1;
+ out[2] = out2;
+ out[3] = out3;
+ in += 3;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-assembler "pshufb" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc/testsuite/gcc.target/i386/pr61403.c
new file mode 100644
index 00000000000..84cc5c5c80a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr61403.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */
+
+#include <math.h>
+
+struct XYZ
+{
+ float x;
+ float y;
+ float z;
+};
+
+void
+norm (struct XYZ *in, struct XYZ *out, int size)
+{
+ int i;
+ for (i = 0; i < size; ++i)
+ {
+ float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z);
+ out[i].x = in[i].x / n;
+ out[i].y = in[i].y / n;
+ out[i].z = in[i].z / n;
+ }
+}
+
+/* { dg-final { scan-assembler "blend" } } */
diff --git a/gcc/testsuite/gcc.target/mips/20140928.c b/gcc/testsuite/gcc.target/mips/20140928.c
new file mode 100644
index 00000000000..1b55bdd357c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/20140928.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+
+NOMIPS16 int NoBarrier_AtomicIncrement(volatile int* ptr, int increment) {
+ int temp, temp2;
+ __asm__ __volatile__(".set push\n"
+ ".set noreorder\n"
+ "1:\n"
+ "ll %0, 0(%3)\n"
+ "addu %1, %0, %2\n"
+ "sc %1, 0(%3)\n"
+ "beqz %1, 1b\n"
+ "nop\n"
+ "addu %1, %0, %2\n"
+ ".set pop\n"
+ : "=&r" (temp), "=&r" (temp2)
+ : "Ir" (increment), "r" (ptr)
+ : "memory");
+
+ return temp2;
+}
diff --git a/gcc/testsuite/gcc.target/mips/args-1.c b/gcc/testsuite/gcc.target/mips/args-1.c
index 3a132deaf3b..643df24263c 100644
--- a/gcc/testsuite/gcc.target/mips/args-1.c
+++ b/gcc/testsuite/gcc.target/mips/args-1.c
@@ -5,7 +5,7 @@
const char *compiled_for = _MIPS_ARCH;
const char *optimized_for = _MIPS_TUNE;
-#if __mips_fpr != 32 && __mips_fpr != 64
+#if __mips_fpr != 32 && __mips_fpr != 64 && __mips_fpr != 0
#error Bad __mips_fpr
#endif
diff --git a/gcc/testsuite/gcc.target/mips/args-3.c b/gcc/testsuite/gcc.target/mips/args-3.c
index 6a79ce6745e..5eddabf8371 100644
--- a/gcc/testsuite/gcc.target/mips/args-3.c
+++ b/gcc/testsuite/gcc.target/mips/args-3.c
@@ -24,7 +24,7 @@ int foo (float inf, int64 in64, int32 in32)
abort ();
#endif
-#if (__mips == 4 || __mips == 32 || __mips == 64) && !defined (__mips16)
+#if (__mips == 4 || ((__mips == 32 || __mips == 64) && __mips_isa_rev < 6)) && !defined (__mips16)
__asm__ ("move %0,%.\n\tmovn %0,%1,%2"
: "=&r" (res32) : "r" (in32), "r" (in64 != 0));
if (res32 != 60)
diff --git a/gcc/testsuite/gcc.target/mips/asm-1.c b/gcc/testsuite/gcc.target/mips/asm-1.c
index 8df2689469e..2408b250009 100644
--- a/gcc/testsuite/gcc.target/mips/asm-1.c
+++ b/gcc/testsuite/gcc.target/mips/asm-1.c
@@ -2,6 +2,8 @@
of the call. */
/* { dg-do assemble } */
+extern void bar (void);
+
NOMIPS16 int foo (int n)
{
register int k asm ("$16") = n;
diff --git a/gcc/testsuite/gcc.target/mips/branch-10.c b/gcc/testsuite/gcc.target/mips/branch-10.c
index e2b1b5f6baa..6a4d9209b61 100644
--- a/gcc/testsuite/gcc.target/mips/branch-10.c
+++ b/gcc/testsuite/gcc.target/mips/branch-10.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=n32" } */
/* { dg-final { scan-assembler-not "(\\\$28|%gp_rel|%got)" } } */
-/* { dg-final { scan-assembler-not "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler-not "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-11.c b/gcc/testsuite/gcc.target/mips/branch-11.c
index 962eb1b5d66..0333404cf8b 100644
--- a/gcc/testsuite/gcc.target/mips/branch-11.c
+++ b/gcc/testsuite/gcc.target/mips/branch-11.c
@@ -4,7 +4,7 @@
/* { dg-final { scan-assembler "\taddiu\t\\\$28,\\\$28,%lo\\(%neg\\(%gp_rel\\(foo\\)\\)\\)\n" } } */
/* { dg-final { scan-assembler "\tlw\t\\\$1,%got_page\\(\[^)\]*\\)\\(\\\$28\\)\n" } } */
/* { dg-final { scan-assembler "\taddiu\t\\\$1,\\\$1,%got_ofst\\(\[^)\]*\\)\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-12.c b/gcc/testsuite/gcc.target/mips/branch-12.c
index 4aef160ade8..c58316dcdfc 100644
--- a/gcc/testsuite/gcc.target/mips/branch-12.c
+++ b/gcc/testsuite/gcc.target/mips/branch-12.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=64" } */
/* { dg-final { scan-assembler-not "(\\\$28|%gp_rel|%got)" } } */
-/* { dg-final { scan-assembler-not "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler-not "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-13.c b/gcc/testsuite/gcc.target/mips/branch-13.c
index 8a6fb049f24..7c8bc1a1d11 100644
--- a/gcc/testsuite/gcc.target/mips/branch-13.c
+++ b/gcc/testsuite/gcc.target/mips/branch-13.c
@@ -4,7 +4,7 @@
/* { dg-final { scan-assembler "\tdaddiu\t\\\$28,\\\$28,%lo\\(%neg\\(%gp_rel\\(foo\\)\\)\\)\n" } } */
/* { dg-final { scan-assembler "\tld\t\\\$1,%got_page\\(\[^)\]*\\)\\(\\\$28\\)\n" } } */
/* { dg-final { scan-assembler "\tdaddiu\t\\\$1,\\\$1,%got_ofst\\(\[^)\]*\\)\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-3.c b/gcc/testsuite/gcc.target/mips/branch-3.c
index 5fcfece3e9a..2eb20e57470 100644
--- a/gcc/testsuite/gcc.target/mips/branch-3.c
+++ b/gcc/testsuite/gcc.target/mips/branch-3.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=32" } */
/* { dg-final { scan-assembler "\t\\.cpload\t\\\$25\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
/* { dg-final { scan-assembler-not "\\.cprestore" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-4.c b/gcc/testsuite/gcc.target/mips/branch-4.c
index 31e4909e58f..df82c5d9ddd 100644
--- a/gcc/testsuite/gcc.target/mips/branch-4.c
+++ b/gcc/testsuite/gcc.target/mips/branch-4.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=n32" } */
/* { dg-final { scan-assembler-not "(\\\$25|\\\$28|%gp_rel|%got)" } } */
-/* { dg-final { scan-assembler-not "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler-not "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-5.c b/gcc/testsuite/gcc.target/mips/branch-5.c
index 1e9c120c834..3e7f530cfac 100644
--- a/gcc/testsuite/gcc.target/mips/branch-5.c
+++ b/gcc/testsuite/gcc.target/mips/branch-5.c
@@ -1,7 +1,7 @@
/* { dg-options "-mshared -mabi=n32" } */
/* { dg-final { scan-assembler "\taddiu\t\\\$3,\\\$3,%lo\\(%neg\\(%gp_rel\\(foo\\)\\)\\)\n" } } */
/* { dg-final { scan-assembler "\tlw\t\\\$1,%got_page\\(\[^)\]*\\)\\(\\\$3\\)\\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
/* { dg-final { scan-assembler-not "\\\$28" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-6.c b/gcc/testsuite/gcc.target/mips/branch-6.c
index 77e0340eb2e..1bccd1eb67c 100644
--- a/gcc/testsuite/gcc.target/mips/branch-6.c
+++ b/gcc/testsuite/gcc.target/mips/branch-6.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=64" } */
/* { dg-final { scan-assembler-not "(\\\$25|\\\$28|%gp_rel|%got)" } } */
-/* { dg-final { scan-assembler-not "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler-not "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-7.c b/gcc/testsuite/gcc.target/mips/branch-7.c
index 8ad6808c8df..bb55a2558ce 100644
--- a/gcc/testsuite/gcc.target/mips/branch-7.c
+++ b/gcc/testsuite/gcc.target/mips/branch-7.c
@@ -1,7 +1,7 @@
/* { dg-options "-mshared -mabi=64" } */
/* { dg-final { scan-assembler "\tdaddiu\t\\\$3,\\\$3,%lo\\(%neg\\(%gp_rel\\(foo\\)\\)\\)\n" } } */
/* { dg-final { scan-assembler "\tld\t\\\$1,%got_page\\(\[^)\]*\\)\\(\\\$3\\)\\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
/* { dg-final { scan-assembler-not "\\\$28" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-8.c b/gcc/testsuite/gcc.target/mips/branch-8.c
index ba5f954378c..c9b2c3ec655 100644
--- a/gcc/testsuite/gcc.target/mips/branch-8.c
+++ b/gcc/testsuite/gcc.target/mips/branch-8.c
@@ -1,6 +1,6 @@
/* { dg-options "-mshared -mabi=32" } */
/* { dg-final { scan-assembler-not "(\\\$28|cpload|cprestore)" } } */
-/* { dg-final { scan-assembler-not "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler-not "\tjrc?\t\\\$1\n" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-9.c b/gcc/testsuite/gcc.target/mips/branch-9.c
index cad1c003c3e..039d4b76e40 100644
--- a/gcc/testsuite/gcc.target/mips/branch-9.c
+++ b/gcc/testsuite/gcc.target/mips/branch-9.c
@@ -4,7 +4,7 @@
/* { dg-final { scan-assembler "\tlw\t\\\$1,16\\(\\\$(fp|sp)\\)\n" } } */
/* { dg-final { scan-assembler "\tlw\t\\\$1,%got\\(\[^)\]*\\)\\(\\\$1\\)\n" } } */
/* { dg-final { scan-assembler "\taddiu\t\\\$1,\\\$1,%lo\\(\[^)\]*\\)\n" } } */
-/* { dg-final { scan-assembler "\tjr\t\\\$1\n" } } */
+/* { dg-final { scan-assembler "\tjrc?\t\\\$1\n" } } */
/* { dg-final { scan-assembler-not "\\\$28" } } */
#include "branch-helper.h"
diff --git a/gcc/testsuite/gcc.target/mips/branch-cost-1.c b/gcc/testsuite/gcc.target/mips/branch-cost-1.c
index f72f2acfb3a..61c3029dd77 100644
--- a/gcc/testsuite/gcc.target/mips/branch-cost-1.c
+++ b/gcc/testsuite/gcc.target/mips/branch-cost-1.c
@@ -6,4 +6,4 @@ foo (int x, int y, int z, int k)
return x == k ? x + y : z - x;
}
/* { dg-final { scan-assembler-not "\t(movz|movn)\t" } } */
-/* { dg-final { scan-assembler "\t(bne|beq)\t" } } */
+/* { dg-final { scan-assembler "\t(bnec?|beqc?)\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/branch-cost-2.c b/gcc/testsuite/gcc.target/mips/branch-cost-2.c
index 3b2c4a13e5e..5a422ae29b4 100644
--- a/gcc/testsuite/gcc.target/mips/branch-cost-2.c
+++ b/gcc/testsuite/gcc.target/mips/branch-cost-2.c
@@ -1,4 +1,4 @@
-/* { dg-options "-mbranch-cost=10 isa>=4" } */
+/* { dg-options "-mbranch-cost=10 (HAS_MOVN)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
NOMIPS16 int
foo (int x, int y, int z, int k)
diff --git a/gcc/testsuite/gcc.target/mips/call-1.c b/gcc/testsuite/gcc.target/mips/call-1.c
index e4b7acefaf7..46a2536754b 100644
--- a/gcc/testsuite/gcc.target/mips/call-1.c
+++ b/gcc/testsuite/gcc.target/mips/call-1.c
@@ -1,17 +1,18 @@
/* { dg-options "-mrelax-pic-calls -mshared -foptimize-sibling-calls -mabi=32" } */
/* { dg-skip-if "requires -foptimize-sibling-calls" { *-*-* } { "-O0" } { "" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalrs?\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalrs?\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalrs?\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjr\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalrc?s?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalrc?s?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalrc?s?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjrc?\t" } } */
__attribute__ ((noinline)) static void staticfunc () { asm (""); }
int normal ();
void normal2 ();
+int
NOMIPS16 f (int *p)
{
*p = normal ();
@@ -22,6 +23,7 @@ NOMIPS16 f (int *p)
int tail ();
+int
NOMIPS16 h ()
{
return tail ();
diff --git a/gcc/testsuite/gcc.target/mips/call-2.c b/gcc/testsuite/gcc.target/mips/call-2.c
index c2fc8eaad12..175933cbe77 100644
--- a/gcc/testsuite/gcc.target/mips/call-2.c
+++ b/gcc/testsuite/gcc.target/mips/call-2.c
@@ -1,7 +1,10 @@
/* See through some simple data-flow. */
/* { dg-options "-mrelax-pic-calls" } */
-/* { dg-final { scan-assembler-times "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalrs?\t" 2 } } */
+/* { dg-final { scan-assembler-times "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalrc?s?\t" 2 } } */
+extern void g (void);
+
+int
NOMIPS16 f ()
{
g ();
diff --git a/gcc/testsuite/gcc.target/mips/call-3.c b/gcc/testsuite/gcc.target/mips/call-3.c
index 37609088df0..08cf336a424 100644
--- a/gcc/testsuite/gcc.target/mips/call-3.c
+++ b/gcc/testsuite/gcc.target/mips/call-3.c
@@ -1,9 +1,10 @@
/* { dg-options "-mrelax-pic-calls -mno-shared" } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalrs?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalrc?s?\t" } } */
/* { dg-require-visibility "" } */
__attribute__ ((visibility ("hidden"))) void g ();
+int
NOMIPS16 f ()
{
g ();
diff --git a/gcc/testsuite/gcc.target/mips/call-4.c b/gcc/testsuite/gcc.target/mips/call-4.c
index 049e33882fa..bf357c7a5b0 100644
--- a/gcc/testsuite/gcc.target/mips/call-4.c
+++ b/gcc/testsuite/gcc.target/mips/call-4.c
@@ -1,7 +1,10 @@
/* See through some simple data-flow. */
/* { dg-options "-mrelax-pic-calls" } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalr\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,g\n1:\tjalrc?\t" } } */
+extern void g (void);
+
+int
NOMIPS16 f (int i)
{
while (i--)
diff --git a/gcc/testsuite/gcc.target/mips/call-5.c b/gcc/testsuite/gcc.target/mips/call-5.c
index 2e58178ba94..f6ebae9db79 100644
--- a/gcc/testsuite/gcc.target/mips/call-5.c
+++ b/gcc/testsuite/gcc.target/mips/call-5.c
@@ -2,18 +2,19 @@
in this case (PR target/57260). */
/* { dg-options "-mrelax-pic-calls -mshared -foptimize-sibling-calls -mabi=n32" } */
/* { dg-skip-if "requires -foptimize-sibling-calls" { *-*-* } { "-O0" } { "" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjr\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjrc?\t" } } */
__attribute__ ((noinline)) static void staticfunc () { asm (""); }
int normal ();
void normal2 ();
+int
NOMIPS16 f (int *p)
{
*p = normal ();
@@ -24,6 +25,7 @@ NOMIPS16 f (int *p)
int tail ();
+int
NOMIPS16 h ()
{
return tail ();
diff --git a/gcc/testsuite/gcc.target/mips/call-6.c b/gcc/testsuite/gcc.target/mips/call-6.c
index 86f3dc4a8fe..00f4a1ef353 100644
--- a/gcc/testsuite/gcc.target/mips/call-6.c
+++ b/gcc/testsuite/gcc.target/mips/call-6.c
@@ -1,18 +1,19 @@
/* Like call-5.c, but for n64. */
/* { dg-options "-mrelax-pic-calls -mshared -foptimize-sibling-calls -mabi=64" } */
/* { dg-skip-if "requires -foptimize-sibling-calls" { *-*-* } { "-O0" } { "" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjalr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjr\t" } } */
-/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjr\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,normal2\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,staticfunc\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail2\n1:\tjalrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail3\n1:\tjrc?\t" } } */
+/* { dg-final { scan-assembler "\\.reloc\t1f,R_MIPS_JALR,tail4\n1:\tjrc?\t" } } */
__attribute__ ((noinline)) static void staticfunc () { asm (""); }
int normal ();
void normal2 ();
+int
NOMIPS16 f (int *p)
{
*p = normal ();
@@ -23,6 +24,7 @@ NOMIPS16 f (int *p)
int tail ();
+int
NOMIPS16 h ()
{
return tail ();
diff --git a/gcc/testsuite/gcc.target/mips/call-clobbered-1.c b/gcc/testsuite/gcc.target/mips/call-clobbered-1.c
new file mode 100644
index 00000000000..77294aa3c2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-clobbered-1.c
@@ -0,0 +1,21 @@
+/* Check that we handle call-clobbered FPRs correctly. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "isa>=2 -mabi=32 -mhard-float -ffixed-f0 -ffixed-f1 -ffixed-f2 -ffixed-f3 -ffixed-f4 -ffixed-f5 -ffixed-f6 -ffixed-f7 -ffixed-f8 -ffixed-f9 -ffixed-f10 -ffixed-f11 -ffixed-f12 -ffixed-f13 -ffixed-f14 -ffixed-f15 -ffixed-f16 -ffixed-f17 -ffixed-f18 -ffixed-f19" } */
+
+void bar (void);
+double a;
+double
+foo ()
+{
+ double b = a + 1.0;
+ bar();
+ return b;
+}
+/* { dg-final { scan-assembler-not "lwc1" } } */
+/* { dg-final { scan-assembler-not "swc1" } } */
+/* { dg-final { scan-assembler-times "sdc1" 2 } } */
+/* { dg-final { scan-assembler-times "ldc1" 4 } } */
+/* { dg-final { scan-assembler-not "mtc" } } */
+/* { dg-final { scan-assembler-not "mfc" } } */
+/* { dg-final { scan-assembler-not "mthc" } } */
+/* { dg-final { scan-assembler-not "mfhc" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-clobbered-2.c b/gcc/testsuite/gcc.target/mips/call-clobbered-2.c
new file mode 100644
index 00000000000..5f9a47208c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-clobbered-2.c
@@ -0,0 +1,21 @@
+/* Check that we handle call-clobbered FPRs correctly. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -modd-spreg -mfp32 -ffixed-f0 -ffixed-f1 -ffixed-f2 -ffixed-f3 -ffixed-f4 -ffixed-f5 -ffixed-f6 -ffixed-f7 -ffixed-f8 -ffixed-f9 -ffixed-f10 -ffixed-f11 -ffixed-f12 -ffixed-f13 -ffixed-f14 -ffixed-f15 -ffixed-f16 -ffixed-f17 -ffixed-f18 -ffixed-f19 -ffixed-f20 -ffixed-f22 -ffixed-f24 -ffixed-f26 -ffixed-f28 -ffixed-f30" } */
+
+void bar (void);
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ bar();
+ return b;
+}
+/* { dg-final { scan-assembler-times "lwc1" 4 } } */
+/* { dg-final { scan-assembler-times "swc1" 2 } } */
+/* { dg-final { scan-assembler-not "mtc" } } */
+/* { dg-final { scan-assembler-not "mfc" } } */
+/* { dg-final { scan-assembler-not "mthc" } } */
+/* { dg-final { scan-assembler-not "mfhc" } } */
+/* { dg-final { scan-assembler-not "sdc1" } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-clobbered-3.c b/gcc/testsuite/gcc.target/mips/call-clobbered-3.c
new file mode 100644
index 00000000000..fce4d991245
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-clobbered-3.c
@@ -0,0 +1,23 @@
+/* Check that we handle call-clobbered FPRs correctly. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* Refer to call-clobbered-4.c to see the expected output from -Os builds. */
+/* { dg-skip-if "uses callee-saved GPR" { *-*-* } { "-Os" } { "" } } */
+/* { dg-options "-mabi=32 -modd-spreg -mfpxx -ffixed-f0 -ffixed-f1 -ffixed-f2 -ffixed-f3 -ffixed-f4 -ffixed-f5 -ffixed-f6 -ffixed-f7 -ffixed-f8 -ffixed-f9 -ffixed-f10 -ffixed-f11 -ffixed-f12 -ffixed-f13 -ffixed-f14 -ffixed-f15 -ffixed-f16 -ffixed-f17 -ffixed-f18 -ffixed-f19 -ffixed-f20 -ffixed-f22 -ffixed-f24 -ffixed-f26 -ffixed-f28 -ffixed-f30" } */
+
+void bar (void);
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ bar();
+ return b;
+}
+/* { dg-final { scan-assembler-times "lwc1" 5 } } */
+/* { dg-final { scan-assembler-times "swc1" 3 } } */
+/* { dg-final { scan-assembler-not "mtc" } } */
+/* { dg-final { scan-assembler-not "mfc" } } */
+/* { dg-final { scan-assembler-not "mthc" } } */
+/* { dg-final { scan-assembler-not "mfhc" } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
+/* { dg-final { scan-assembler-not "sdc1" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-clobbered-4.c b/gcc/testsuite/gcc.target/mips/call-clobbered-4.c
new file mode 100644
index 00000000000..51498b8fa28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-clobbered-4.c
@@ -0,0 +1,23 @@
+/* Check that we handle call-clobbered FPRs correctly.
+ This test differs from call-clobbered-3.c because when optimising for size
+ a callee-saved GPR is used for 'b' to cross the call. */
+/* { dg-skip-if "code quality test" { *-*-* } { "*" } { "-Os" } } */
+/* { dg-options "-mabi=32 -modd-spreg -mfpxx -ffixed-f0 -ffixed-f1 -ffixed-f2 -ffixed-f3 -ffixed-f4 -ffixed-f5 -ffixed-f6 -ffixed-f7 -ffixed-f8 -ffixed-f9 -ffixed-f10 -ffixed-f11 -ffixed-f12 -ffixed-f13 -ffixed-f14 -ffixed-f15 -ffixed-f16 -ffixed-f17 -ffixed-f18 -ffixed-f19 -ffixed-f20 -ffixed-f22 -ffixed-f24 -ffixed-f26 -ffixed-f28 -ffixed-f30" } */
+
+void bar (void);
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ bar();
+ return b;
+}
+/* { dg-final { scan-assembler-times "lwc1" 4 } } */
+/* { dg-final { scan-assembler-times "swc1" 2 } } */
+/* { dg-final { scan-assembler-times "mtc" 1 } } */
+/* { dg-final { scan-assembler-times "mfc" 1 } } */
+/* { dg-final { scan-assembler-not "mthc" } } */
+/* { dg-final { scan-assembler-not "mfhc" } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
+/* { dg-final { scan-assembler-not "sdc1" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-clobbered-5.c b/gcc/testsuite/gcc.target/mips/call-clobbered-5.c
new file mode 100644
index 00000000000..c7cd7cac7dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-clobbered-5.c
@@ -0,0 +1,21 @@
+/* Check that we handle call-clobbered FPRs correctly. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -mfp64 -ffixed-f0 -ffixed-f1 -ffixed-f2 -ffixed-f3 -ffixed-f4 -ffixed-f5 -ffixed-f6 -ffixed-f7 -ffixed-f8 -ffixed-f9 -ffixed-f10 -ffixed-f11 -ffixed-f12 -ffixed-f13 -ffixed-f14 -ffixed-f15 -ffixed-f16 -ffixed-f17 -ffixed-f18 -ffixed-f19 -ffixed-f20 -ffixed-f22 -ffixed-f24 -ffixed-f26 -ffixed-f28 -ffixed-f30" } */
+
+void bar (void);
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ bar();
+ return b;
+}
+/* { dg-final { scan-assembler-times "lwc1" 3 } } */
+/* { dg-final { scan-assembler-times "swc1" 1 } } */
+/* { dg-final { scan-assembler-not "sdc1" } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
+/* { dg-final { scan-assembler-not "mtc" } } */
+/* { dg-final { scan-assembler-not "mfc" } } */
+/* { dg-final { scan-assembler-not "mthc" } } */
+/* { dg-final { scan-assembler-not "mfhc" } } */
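All five call-clobbered-* tests above compile essentially the same small function with most FPRs removed from allocation via -ffixed-f*; the per-test scan-assembler counts then pin down how the value that is live across the call survives under each FP option combination. A hypothetical annotated sketch of the shared body (the real tests add the long -ffixed-f* list on the dg-options line):

/* Hypothetical sketch of the body shared by call-clobbered-1.c .. -5.c.  */
void bar (void);
float a;                /* call-clobbered-1.c uses double instead */

float
foo (void)
{
  float b = a + 1.0f;   /* value computed before the call ...              */
  bar ();               /* ... must survive a call that clobbers FPRs ...  */
  return b;             /* ... so the scans check how it is saved/restored:
                           swc1/lwc1, sdc1/ldc1, or mfc1/mtc1 at -Os.      */
}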
diff --git a/gcc/testsuite/gcc.target/mips/call-saved-4.c b/gcc/testsuite/gcc.target/mips/call-saved-4.c
new file mode 100644
index 00000000000..846ea321e7c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-saved-4.c
@@ -0,0 +1,32 @@
+/* Check that we save the correct call-saved GPRs and FPRs. */
+/* { dg-options "(HAS_LDC) -mabi=32 -mfp32" } */
+
+void bar (void);
+
+void
+foo (int x)
+{
+ __builtin_unwind_init ();
+ __builtin_eh_return (x, bar);
+}
+/* { dg-final { scan-assembler "\\\$16" } } */
+/* { dg-final { scan-assembler "\\\$17" } } */
+/* { dg-final { scan-assembler "\\\$18" } } */
+/* { dg-final { scan-assembler "\\\$19" } } */
+/* { dg-final { scan-assembler "\\\$20" } } */
+/* { dg-final { scan-assembler "\\\$21" } } */
+/* { dg-final { scan-assembler "\\\$22" } } */
+/* { dg-final { scan-assembler "\\\$23" } } */
+/* { dg-final { scan-assembler "\\\$(30|fp)" } } */
+/* { dg-final { scan-assembler "\\\$f20" } } */
+/* { dg-final { scan-assembler "\\\$f22" } } */
+/* { dg-final { scan-assembler "\\\$f24" } } */
+/* { dg-final { scan-assembler "\\\$f26" } } */
+/* { dg-final { scan-assembler "\\\$f28" } } */
+/* { dg-final { scan-assembler "\\\$f30" } } */
+/* { dg-final { scan-assembler-not "\\\$f21" } } */
+/* { dg-final { scan-assembler-not "\\\$f23" } } */
+/* { dg-final { scan-assembler-not "\\\$f25" } } */
+/* { dg-final { scan-assembler-not "\\\$f27" } } */
+/* { dg-final { scan-assembler-not "\\\$f29" } } */
+/* { dg-final { scan-assembler-not "\\\$f31" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-saved-5.c b/gcc/testsuite/gcc.target/mips/call-saved-5.c
new file mode 100644
index 00000000000..2937b316dc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-saved-5.c
@@ -0,0 +1,32 @@
+/* Check that we save the correct call-saved GPRs and FPRs. */
+/* { dg-options "-mabi=32 -mfpxx" } */
+
+void bar (void);
+
+void
+foo (int x)
+{
+ __builtin_unwind_init ();
+ __builtin_eh_return (x, bar);
+}
+/* { dg-final { scan-assembler "\\\$16" } } */
+/* { dg-final { scan-assembler "\\\$17" } } */
+/* { dg-final { scan-assembler "\\\$18" } } */
+/* { dg-final { scan-assembler "\\\$19" } } */
+/* { dg-final { scan-assembler "\\\$20" } } */
+/* { dg-final { scan-assembler "\\\$21" } } */
+/* { dg-final { scan-assembler "\\\$22" } } */
+/* { dg-final { scan-assembler "\\\$23" } } */
+/* { dg-final { scan-assembler "\\\$(30|fp)" } } */
+/* { dg-final { scan-assembler "\\\$f20" } } */
+/* { dg-final { scan-assembler "\\\$f22" } } */
+/* { dg-final { scan-assembler "\\\$f24" } } */
+/* { dg-final { scan-assembler "\\\$f26" } } */
+/* { dg-final { scan-assembler "\\\$f28" } } */
+/* { dg-final { scan-assembler "\\\$f30" } } */
+/* { dg-final { scan-assembler-not "\\\$f21" } } */
+/* { dg-final { scan-assembler-not "\\\$f23" } } */
+/* { dg-final { scan-assembler-not "\\\$f25" } } */
+/* { dg-final { scan-assembler-not "\\\$f27" } } */
+/* { dg-final { scan-assembler-not "\\\$f29" } } */
+/* { dg-final { scan-assembler-not "\\\$f31" } } */
diff --git a/gcc/testsuite/gcc.target/mips/call-saved-6.c b/gcc/testsuite/gcc.target/mips/call-saved-6.c
new file mode 100644
index 00000000000..0d1a4c845c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/call-saved-6.c
@@ -0,0 +1,32 @@
+/* Check that we save the correct call-saved GPRs and FPRs. */
+/* { dg-options "-mabi=32 -mfp64" } */
+
+void bar (void);
+
+void
+foo (int x)
+{
+ __builtin_unwind_init ();
+ __builtin_eh_return (x, bar);
+}
+/* { dg-final { scan-assembler "\\\$16" } } */
+/* { dg-final { scan-assembler "\\\$17" } } */
+/* { dg-final { scan-assembler "\\\$18" } } */
+/* { dg-final { scan-assembler "\\\$19" } } */
+/* { dg-final { scan-assembler "\\\$20" } } */
+/* { dg-final { scan-assembler "\\\$21" } } */
+/* { dg-final { scan-assembler "\\\$22" } } */
+/* { dg-final { scan-assembler "\\\$23" } } */
+/* { dg-final { scan-assembler "\\\$(30|fp)" } } */
+/* { dg-final { scan-assembler "\\\$f20" } } */
+/* { dg-final { scan-assembler "\\\$f22" } } */
+/* { dg-final { scan-assembler "\\\$f24" } } */
+/* { dg-final { scan-assembler "\\\$f26" } } */
+/* { dg-final { scan-assembler "\\\$f28" } } */
+/* { dg-final { scan-assembler "\\\$f30" } } */
+/* { dg-final { scan-assembler-not "\\\$f21" } } */
+/* { dg-final { scan-assembler-not "\\\$f23" } } */
+/* { dg-final { scan-assembler-not "\\\$f25" } } */
+/* { dg-final { scan-assembler-not "\\\$f27" } } */
+/* { dg-final { scan-assembler-not "\\\$f29" } } */
+/* { dg-final { scan-assembler-not "\\\$f31" } } */
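The call-saved-* tests rely on a pair of builtins to make every call-saved register live: __builtin_unwind_init forces the whole call-saved set to be stored to the frame, and __builtin_eh_return keeps it live until the modified return, so the scans can check exactly which GPRs and FPRs each o32 FP variant treats as call-saved (and, via the scan-assembler-not lines, that the odd-numbered FPRs are not). A minimal hypothetical sketch of that mechanism:

/* Hypothetical sketch (not part of the patch): force every call-saved
   register to be saved and restored.  */
void handler (void);

void
force_full_save (int offset)
{
  /* Spill the complete call-saved register set to the stack frame.  */
  __builtin_unwind_init ();
  /* Adjust the stack by OFFSET and transfer to HANDLER instead of
     returning normally, keeping the saved registers live throughout.  */
  __builtin_eh_return (offset, handler);
}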
diff --git a/gcc/testsuite/gcc.target/mips/code-readable-1.c b/gcc/testsuite/gcc.target/mips/code-readable-1.c
index b3e864df6fb..71e7114ffcc 100644
--- a/gcc/testsuite/gcc.target/mips/code-readable-1.c
+++ b/gcc/testsuite/gcc.target/mips/code-readable-1.c
@@ -14,7 +14,7 @@ volatile int x10;
volatile int x11;
MIPS16 int
-foo (int i, volatile *x)
+foo (int i, volatile int *x)
{
switch (i)
{
diff --git a/gcc/testsuite/gcc.target/mips/code-readable-2.c b/gcc/testsuite/gcc.target/mips/code-readable-2.c
index 3d325049d59..1994bebcebf 100644
--- a/gcc/testsuite/gcc.target/mips/code-readable-2.c
+++ b/gcc/testsuite/gcc.target/mips/code-readable-2.c
@@ -13,7 +13,7 @@ volatile int x10;
volatile int x11;
MIPS16 int
-foo (int i, volatile *x)
+foo (int i, volatile int *x)
{
switch (i)
{
diff --git a/gcc/testsuite/gcc.target/mips/code-readable-3.c b/gcc/testsuite/gcc.target/mips/code-readable-3.c
index aaf18749374..c19e80aafa3 100644
--- a/gcc/testsuite/gcc.target/mips/code-readable-3.c
+++ b/gcc/testsuite/gcc.target/mips/code-readable-3.c
@@ -13,7 +13,7 @@ volatile int x10;
volatile int x11;
MIPS16 int
-foo (int i, volatile *x)
+foo (int i, volatile int *x)
{
switch (i)
{
diff --git a/gcc/testsuite/gcc.target/mips/code-readable-4.c b/gcc/testsuite/gcc.target/mips/code-readable-4.c
index 4db89f87466..beb9248de75 100644
--- a/gcc/testsuite/gcc.target/mips/code-readable-4.c
+++ b/gcc/testsuite/gcc.target/mips/code-readable-4.c
@@ -14,7 +14,7 @@ volatile int x10;
volatile int x11;
MIPS16 int
-foo (int i, volatile *x)
+foo (int i, volatile int *x)
{
switch (i)
{
diff --git a/gcc/testsuite/gcc.target/mips/const-anchor-1.c b/gcc/testsuite/gcc.target/mips/const-anchor-1.c
index a5f01e4ec1a..0d86bab8ca0 100644
--- a/gcc/testsuite/gcc.target/mips/const-anchor-1.c
+++ b/gcc/testsuite/gcc.target/mips/const-anchor-1.c
@@ -4,6 +4,8 @@
/* { dg-final { scan-assembler-not "0x12330000|305332224" } } */
/* { dg-final { scan-assembler "\td?addiu\t\\\$5,\\\$\[0-9\]*,-1" } } */
+extern void g (int, int);
+
NOMIPS16 void f ()
{
g (0x12340001, 0x1233ffff);
diff --git a/gcc/testsuite/gcc.target/mips/const-anchor-2.c b/gcc/testsuite/gcc.target/mips/const-anchor-2.c
index 8dad5a70b27..65fcb0be252 100644
--- a/gcc/testsuite/gcc.target/mips/const-anchor-2.c
+++ b/gcc/testsuite/gcc.target/mips/const-anchor-2.c
@@ -3,6 +3,8 @@
/* { dg-final { scan-assembler-not "0x300000|196608" } } */
/* { dg-final { scan-assembler "\td?addiu\t\\\$5,\\\$\[0-9\]*,32763" } } */
+extern void g (int, int);
+
NOMIPS16 void f ()
{
g (0x28006, 0x30001);
diff --git a/gcc/testsuite/gcc.target/mips/dmult-1.c b/gcc/testsuite/gcc.target/mips/dmult-1.c
index f8c0b8b44f1..92573168d77 100644
--- a/gcc/testsuite/gcc.target/mips/dmult-1.c
+++ b/gcc/testsuite/gcc.target/mips/dmult-1.c
@@ -1,4 +1,4 @@
-/* { dg-options "forbid_cpu=octeon.* -mgp64" } */
+/* { dg-options "isa_rev<=5 forbid_cpu=octeon.* -mgp64" } */
/* { dg-final { scan-assembler "\tdmult\t" } } */
/* { dg-final { scan-assembler "\tmflo\t" } } */
/* { dg-final { scan-assembler-not "\tdmul\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/dsp-lhx.c b/gcc/testsuite/gcc.target/mips/dsp-lhx.c
index 8fa20a09090..ebfe154e8e3 100644
--- a/gcc/testsuite/gcc.target/mips/dsp-lhx.c
+++ b/gcc/testsuite/gcc.target/mips/dsp-lhx.c
@@ -1,6 +1,6 @@
/* Test MIPS32 DSP LHX instruction */
/* { dg-do compile } */
-/* { dg-options "-mgp32 -mdsp" } */
+/* { dg-options "-mgp32 -mdsp (!HAS_LSA)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tlhx\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/dsp-lsa.c b/gcc/testsuite/gcc.target/mips/dsp-lsa.c
new file mode 100644
index 00000000000..9aec977dc0e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/dsp-lsa.c
@@ -0,0 +1,11 @@
+/* Test MIPS32 DSP instructions - should use LSA instead of LHX */
+/* { dg-do compile } */
+/* { dg-options "-mgp32 -mdsp (HAS_LSA)" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+
+/* { dg-final { scan-assembler "\tlsa\t" } } */
+
+NOMIPS16 signed short test (signed short *a, int index)
+{
+ return a[index];
+}
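dsp-lhx.c and dsp-lsa.c split what used to be a single expectation: when LSA is available, the indexed halfword access is no longer expected to match the DSP LHX pattern, because LSA already folds the scale-and-add address arithmetic into one instruction. A hypothetical C illustration of the address computation involved:

/* Hypothetical illustration: the address arithmetic behind a[index].
   LSA (and DLSA for 64-bit pointers, see mips64-lsa.c below) combines
   the shift and the add into a single instruction, after which an
   ordinary load finishes the access.  */
short
load_indexed (short *a, int index)
{
  /* &a[index] == (short *) ((char *) a + ((long) index << 1)) */
  return a[index];
}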
diff --git a/gcc/testsuite/gcc.target/mips/dspr2-MULT.c b/gcc/testsuite/gcc.target/mips/dspr2-MULT.c
index b668e0c6b51..fdf25a70759 100644
--- a/gcc/testsuite/gcc.target/mips/dspr2-MULT.c
+++ b/gcc/testsuite/gcc.target/mips/dspr2-MULT.c
@@ -6,7 +6,7 @@
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* See PR target/51729 for the reason behind the XFAILs. */
-/* { dg-final { scan-assembler "\tmult\t" } } */
+/* { dg-final { scan-assembler "\tmult?\t" } } */
/* { dg-final { scan-assembler "\\\$ac1" { xfail *-*-* } } } */
/* { dg-final { scan-assembler "\\\$ac2" { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/mips/dspr2-MULTU.c b/gcc/testsuite/gcc.target/mips/dspr2-MULTU.c
index 886e4ca8816..293e34feed2 100644
--- a/gcc/testsuite/gcc.target/mips/dspr2-MULTU.c
+++ b/gcc/testsuite/gcc.target/mips/dspr2-MULTU.c
@@ -5,7 +5,7 @@
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* See PR target/51729 for the reason behind the XFAILs. */
-/* { dg-final { scan-assembler "\tmultu\t" } } */
+/* { dg-final { scan-assembler "\t(multu|muhu)\t" } } */
/* { dg-final { scan-assembler "\\\$ac1" { xfail *-*-* } } } */
/* { dg-final { scan-assembler "\\\$ac2" { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/mips/fpcmp-1.c b/gcc/testsuite/gcc.target/mips/fpcmp-1.c
index c0594ff3562..03c2f792612 100644
--- a/gcc/testsuite/gcc.target/mips/fpcmp-1.c
+++ b/gcc/testsuite/gcc.target/mips/fpcmp-1.c
@@ -1,5 +1,5 @@
/* We used to use c.lt.fmt instead of c.ule.fmt here. */
-/* { dg-options "-mhard-float" } */
+/* { dg-options "isa_rev<=5 -mhard-float" } */
NOMIPS16 int f1 (float x, float y) { return __builtin_isless (x, y); }
NOMIPS16 int f2 (double x, double y) { return __builtin_isless (x, y); }
/* { dg-final { scan-assembler "\tc\\.ule\\.s\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/fpcmp-2.c b/gcc/testsuite/gcc.target/mips/fpcmp-2.c
index 23d5cb0c4ca..6936b9009d3 100644
--- a/gcc/testsuite/gcc.target/mips/fpcmp-2.c
+++ b/gcc/testsuite/gcc.target/mips/fpcmp-2.c
@@ -1,5 +1,5 @@
/* We used to use c.le.fmt instead of c.ult.fmt here. */
-/* { dg-options "-mhard-float" } */
+/* { dg-options "isa_rev<=5 -mhard-float" } */
NOMIPS16 int f1 (float x, float y) { return __builtin_islessequal (x, y); }
NOMIPS16 int f2 (double x, double y) { return __builtin_islessequal (x, y); }
/* { dg-final { scan-assembler "\tc\\.ult\\.s\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/interrupt_handler.c b/gcc/testsuite/gcc.target/mips/interrupt_handler.c
index 073c772ae40..5058d998e07 100644
--- a/gcc/testsuite/gcc.target/mips/interrupt_handler.c
+++ b/gcc/testsuite/gcc.target/mips/interrupt_handler.c
@@ -3,6 +3,7 @@
/* { dg-options "-mips32r2 -msoft-float" } */
void f () { }
+extern void t (void);
NOMIPS16 void __attribute__ ((interrupt)) v0 () { }
NOMIPS16 void __attribute__ ((interrupt, use_shadow_register_set)) v1 () { }
diff --git a/gcc/testsuite/gcc.target/mips/lazy-binding-1.c b/gcc/testsuite/gcc.target/mips/lazy-binding-1.c
index a30594840dd..a112781a99e 100644
--- a/gcc/testsuite/gcc.target/mips/lazy-binding-1.c
+++ b/gcc/testsuite/gcc.target/mips/lazy-binding-1.c
@@ -19,6 +19,6 @@ foo (int n)
/* There should be exactly five uses of $25: one to set up $gp, two to
load the address of bar (), and two to call it. */
/* { dg-final { scan-assembler-times "\tl.\t\\\$25,%call16\\\(bar\\\)" 2 } } */
-/* { dg-final { scan-assembler-times "\tjalrs?\t\\\$25" 2 } } */
+/* { dg-final { scan-assembler-times "\tjalrc?s?\t\\\$25" 2 } } */
/* { dg-final { scan-assembler "(\\\$28,|\t.cpload\t)\\\$25" } } */
/* { dg-final { scan-assembler-times "\\\$25" 5 } } */
diff --git a/gcc/testsuite/gcc.target/mips/madd-3.c b/gcc/testsuite/gcc.target/mips/madd-3.c
index 29f4c9b3768..b0771ad9920 100644
--- a/gcc/testsuite/gcc.target/mips/madd-3.c
+++ b/gcc/testsuite/gcc.target/mips/madd-3.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "isa_rev>=1 -mgp32" } */
+/* { dg-options "(HAS_MADD) -mgp32" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-times "\tmadd\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/mips/madd-9.c b/gcc/testsuite/gcc.target/mips/madd-9.c
index 28681a91002..acafc7a2be5 100644
--- a/gcc/testsuite/gcc.target/mips/madd-9.c
+++ b/gcc/testsuite/gcc.target/mips/madd-9.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "isa_rev>=1 -mgp32 -mtune=4kc" } */
+/* { dg-options "(HAS_MADD) -mgp32 -mtune=4kc" } */
/* References to X within the loop need to have a higher frequency than
references to X outside the loop, otherwise there is no reason
to prefer multiply/accumulator registers over GPRs. */
diff --git a/gcc/testsuite/gcc.target/mips/maddu-3.c b/gcc/testsuite/gcc.target/mips/maddu-3.c
index 27a7350f07b..e180fa74131 100644
--- a/gcc/testsuite/gcc.target/mips/maddu-3.c
+++ b/gcc/testsuite/gcc.target/mips/maddu-3.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* This test requires widening_mul */
-/* { dg-options "isa_rev>=1 -mgp32 -fexpensive-optimizations" } */
+/* { dg-options "(HAS_MADD) -mgp32 -fexpensive-optimizations" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-times "\tmaddu\t" 3 } } */
diff --git a/gcc/testsuite/gcc.target/mips/memcpy-2.c b/gcc/testsuite/gcc.target/mips/memcpy-2.c
new file mode 100644
index 00000000000..ba5dad73904
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/memcpy-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "isa_rev<=5 -fdump-rtl-expand" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-Os" } { "" } } */
+
+__attribute__((nomips16))
+void
+f1 (char *p)
+{
+ __builtin_memcpy (p, "12345", 5);
+}
+
+/* { dg-final { scan-rtl-dump "mem/u.*mem/u" "expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-1.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-1.c
index 2f428717a32..faabca2d658 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-1.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-1.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_nothing ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-10.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-10.c
index 6c3601861ab..73e9705e78c 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-10.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-10.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-11.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-11.c
index 1d8a6d20cdf..5cdf0026e6b 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-11.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-11.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-12.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-12.c
index f57b5ce172a..986a6e5a732 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-12.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-12.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-13.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-13.c
index d2b88e1f384..1981ed1970b 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-13.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-13.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_nothing ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-14.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-14.c
index 6318a2240af..daed73409c9 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-14.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-14.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-15.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-15.c
index 1c165043a55..0c22454d30d 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-15.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-15.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-16.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-16.c
index 3119979d354..669f57d9f58 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-16.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-16.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-2.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-2.c
index 8a66e7a7af6..960c9a5d59b 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-2.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-2.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-3.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-3.c
index c9c8dac7081..c18495c2fff 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-3.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-3.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-4.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-4.c
index c10c213ac78..da95d160d14 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-4.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-4.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_addr_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-5.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-5.c
index 9b6dd8aff2e..ed65140bb3c 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-5.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-5.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
nonpic_nothing ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-6.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-6.c
index 90b220f0f0f..991c3643efd 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-6.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-6.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
pic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-7.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-7.c
index 8cef63f3c44..4905d33d72e 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-7.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-7.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
pic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-8.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-8.c
index 0200bf2dde3..627d01634af 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-8.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-8.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
pic_call ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-9.c b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-9.c
index 4144172ccd4..da22e530882 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/main-9.c
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/main-9.c
@@ -2,6 +2,7 @@
#include "mips-nonpic.h"
+int
main ()
{
pic_addr ();
diff --git a/gcc/testsuite/gcc.target/mips/mips-nonpic/mips-nonpic.h b/gcc/testsuite/gcc.target/mips/mips-nonpic/mips-nonpic.h
index e9fe9921885..f0bf3f9ea03 100644
--- a/gcc/testsuite/gcc.target/mips/mips-nonpic/mips-nonpic.h
+++ b/gcc/testsuite/gcc.target/mips/mips-nonpic/mips-nonpic.h
@@ -9,10 +9,14 @@ extern int hit_nonpic_call;
extern int hit_nonpic_nothing;
extern void nonpic_addr (void);
+extern void nonpic_call (void);
+extern void nonpic_addr_call (void);
extern void nonpic_nothing (void);
extern void nonpic_receive_fn_addr (void *);
extern void pic_addr (void);
+extern void pic_call (void);
+extern void pic_addr_call (void);
extern void pic_receive_fn_addr (void *);
extern void pic_nothing (void);
diff --git a/gcc/testsuite/gcc.target/mips/mips-ps-type-2.c b/gcc/testsuite/gcc.target/mips/mips-ps-type-2.c
index f52cf91e81b..a4dfbaea645 100644
--- a/gcc/testsuite/gcc.target/mips/mips-ps-type-2.c
+++ b/gcc/testsuite/gcc.target/mips/mips-ps-type-2.c
@@ -1,7 +1,7 @@
/* Test v2sf calculations. The nmadd and nmsub patterns need
-ffinite-math-only. */
/* { dg-do compile } */
-/* { dg-options "isa_rev>=2 -mgp32 -mpaired-single -ffinite-math-only" } */
+/* { dg-options "(HAS_MADDPS) -mgp32 -mpaired-single -ffinite-math-only" } */
/* { dg-skip-if "nmadd and nmsub need combine" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tcvt.ps.s\t" } } */
/* { dg-final { scan-assembler "\tmov.ps\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/mips.exp b/gcc/testsuite/gcc.target/mips/mips.exp
index 8c72cff7223..abf66d33cb3 100644
--- a/gcc/testsuite/gcc.target/mips/mips.exp
+++ b/gcc/testsuite/gcc.target/mips/mips.exp
@@ -234,18 +234,31 @@ set mips_option_groups {
dump_pattern "-dp"
endianness "-E(L|B)|-me(l|b)"
float "-m(hard|soft)-float"
+ fpu "-m(double|single)-float"
forbid_cpu "forbid_cpu=.*"
- fp "-mfp(32|64)"
+ fp "-mfp(32|xx|64)"
gp "-mgp(32|64)"
long "-mlong(32|64)"
micromips "-mmicromips|-mno-micromips"
mips16 "-mips16|-mno-mips16|-mflip-mips16"
mips3d "-mips3d|-mno-mips3d"
pic "-f(no-|)(pic|PIC)"
+ cb "-mcompact-branches=.*"
profiling "-pg"
small-data "-G[0-9]+"
warnings "-w"
dump "-fdump-.*"
+ ins "HAS_INS"
+ dmul "NOT_HAS_DMUL"
+ ldc "HAS_LDC"
+ movn "HAS_MOVN"
+ madd "HAS_MADD"
+ maddps "HAS_MADDPS"
+ lsa "(|!)HAS_LSA"
+}
+
+for { set option 0 } { $option < 32 } { incr option } {
+ lappend mips_option_groups "fixed-f$option" "-ffixed-f$option"
}
# Add -mfoo/-mno-foo options to mips_option_groups.
@@ -270,6 +283,8 @@ foreach option {
synci
relax-pic-calls
mcount-ra-address
+ odd-spreg
+ msa
} {
lappend mips_option_groups $option "-m(no-|)$option"
}
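The new option groups above include "ghost" entries such as HAS_INS, HAS_MOVN, HAS_MADD, HAS_LDC and (|!)HAS_LSA: they never reach the compiler command line, but mips-dg-options recognises them and selects an architecture that provides (or, for the negated forms, lacks) the named capability. That is why tests elsewhere in this patch replace constraints like isa>=4 with (HAS_MOVN). A hypothetical test sketch using one of these capability markers:

/* Hypothetical sketch: stating a capability requirement instead of an
   ISA level; (HAS_MOVN) is resolved by mips-dg-options to an
   architecture that still has the MOVN/MOVZ conditional moves.  */
/* { dg-do compile } */
/* { dg-options "(HAS_MOVN)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */

NOMIPS16 int
pick (int x, int a, int b)
{
  return x ? a : b;   /* expected to become movn/movz rather than a branch */
}
/* { dg-final { scan-assembler "\tmov(n|z)\t" } } */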
@@ -722,8 +737,12 @@ proc mips-dg-init {} {
#if __mips_fpr == 64
"-mfp64",
#else
+ #if __mips_fpr == 0
+ "-mfpxx",
+ #else
"-mfp32",
#endif
+ #endif
#ifdef __mips64
"-mgp64",
@@ -755,6 +774,12 @@ proc mips-dg-init {} {
"-mno-paired-single",
#endif
+ #if _MIPS_SPFPSET == 32
+ "-modd-spreg",
+ #else
+ "-mno-odd-spreg",
+ #endif
+
#if __mips_abicalls
"-mabicalls",
#else
@@ -789,6 +814,12 @@ proc mips-dg-init {} {
"-mno-synci",
#endif
+ #ifdef __mips_msa
+ "-mmsa"
+ #else
+ "-mno-msa"
+ #endif
+
0
};
}]
@@ -840,6 +871,10 @@ proc mips-dg-finish {} {
# | |
# -mfp64 -mfp32
# | |
+# -modd-spreg -mno-odd-spreg
+# | |
+# -mdouble-float -msingle-float
+# | |
# -mabs=2008/-mabs=legacy <no option>
# | |
# -mhard-float -msoft-float
@@ -929,6 +964,12 @@ proc mips-dg-options { args } {
mips_option_dependency options "-mips3d" "-mpaired-single"
mips_option_dependency options "-mpaired-single" "-mfp64"
mips_option_dependency options "-mfp64" "-mhard-float"
+ mips_option_dependency options "-mfp32" "-mhard-float"
+ mips_option_dependency options "-mfpxx" "-mhard-float"
+ mips_option_dependency options "-mfp64" "-modd-spreg"
+ mips_option_dependency options "-mfp64" "-mdouble-float"
+ mips_option_dependency options "-mfp32" "-mdouble-float"
+ mips_option_dependency options "-mfpxx" "-mdouble-float"
mips_option_dependency options "-mabs=2008" "-mhard-float"
mips_option_dependency options "-mabs=legacy" "-mhard-float"
mips_option_dependency options "-mrelax-pic-calls" "-mno-plt"
@@ -1028,14 +1069,29 @@ proc mips-dg-options { args } {
# Handle dependencies between the pre-arch options and the arch option.
# This should mirror the arch and post-arch code below.
if { !$arch_test_option_p } {
+ # We need a revision 6 or better ISA for:
+ #
+	# - tests that require the LSA instruction (HAS_LSA)
+	# - tests that use only compact branches (-mcompact-branches=always)
+ if { $isa_rev < 6
+ && ([mips_have_test_option_p options "HAS_LSA"]
+ || [mips_have_test_option_p options "-mcompact-branches=always"]) } {
+ if { $gp_size == 32 } {
+ mips_make_test_option options "-mips32r6"
+ } else {
+ mips_make_test_option options "-mips64r6"
+ }
# We need a revision 2 or better ISA for:
#
# - the combination of -mgp32 -mfp64
# - the DSP ASE
- if { $isa_rev < 2
+ } elseif { $isa_rev < 2
&& (($gp_size == 32 && [mips_have_test_option_p options "-mfp64"])
|| [mips_have_test_option_p options "-msynci"]
|| [mips_have_test_option_p options "-mdsp"]
+ || [mips_have_test_option_p options "HAS_INS"]
+ || [mips_have_test_option_p options "HAS_MADD"]
+ || [mips_have_test_option_p options "HAS_MADDPS"]
|| [mips_have_test_option_p options "-mdspr2"]) } {
if { $gp_size == 32 } {
mips_make_test_option options "-mips32r2"
@@ -1045,15 +1101,24 @@ proc mips-dg-options { args } {
# We need a MIPS32 or MIPS64 ISA for:
#
# - paired-single instructions(*)
+ # - odd numbered single precision registers
#
# (*) Note that we don't support MIPS V at the moment.
} elseif { $isa_rev < 1
- && [mips_have_test_option_p options "-mpaired-single"] } {
+ && ([mips_have_test_option_p options "-mpaired-single"]
+ || ([mips_have_test_option_p options "-modd-spreg"]
+ && ![mips_have_test_option_p options "-mfp64"]))} {
if { $gp_size == 32 } {
mips_make_test_option options "-mips32"
} else {
mips_make_test_option options "-mips64"
}
+ # We need MIPS IV or higher for:
+ #
+	# - the MOVN/MOVZ conditional moves (HAS_MOVN)
+ } elseif { $isa < 3
+ && [mips_have_test_option_p options "HAS_MOVN"] } {
+ mips_make_test_option options "-mips4"
# We need MIPS III or higher for:
#
# - the "cache" instruction
@@ -1070,8 +1135,39 @@ proc mips-dg-options { args } {
# (*) needed by both -mbranch-likely and -mfix-r10000
} elseif { $isa < 2
&& ([mips_have_test_option_p options "-mbranch-likely"]
- || [mips_have_test_option_p options "-mfix-r10000"]) } {
+ || [mips_have_test_option_p options "-mfix-r10000"]
+ || ($gp_size == 32
+ && ([mips_have_test_option_p options "-mfpxx"]
+ || [mips_have_test_option_p options "HAS_LDC"]))) } {
mips_make_test_option options "-mips2"
+ # We need to use octeon's base ISA if a test must not run with an
+ # architecture that supports dmul.
+ } elseif { [regexp -- "^-march=octeon.*\$" $arch]
+ && [mips_have_test_option_p options "NOT_HAS_DMUL"] } {
+ mips_make_test_option options "-mips${isa}r${isa_rev}"
+ # Check whether we need to switch from mips*r6 down to mips*r5 due
+ # to options that are incompatible with mips*r6. If we do, use
+ # -mnan=2008 because r6 is nan2008 by default and without this flag
+ # tests that include stdlib.h will fail due to not finding
+ # stubs-o32_hard.h (r6 compilers only have stubs-o32_hard_2008.h)
+ } elseif { $isa_rev > 5
+ && ([mips_have_test_option_p options "-mips16"]
+ || [mips_have_test_option_p options "-mmicromips"]
+ || [mips_have_test_option_p options "-mfp32"]
+ || [mips_have_test_option_p options "-mfix-r10000"]
+ || [mips_have_test_option_p options "NOT_HAS_DMUL"]
+ || [mips_have_test_option_p options "HAS_MOVN"]
+ || [mips_have_test_option_p options "HAS_MADD"]
+ || [mips_have_test_option_p options "-mpaired-single"]
+ || [mips_have_test_option_p options "-mnan=legacy"]
+ || [mips_have_test_option_p options "-mabs=legacy"]
+ || [mips_have_test_option_p options "!HAS_LSA"]) } {
+ if { $gp_size == 32 } {
+ mips_make_test_option options "-mips32r5"
+ } else {
+ mips_make_test_option options "-mips64r5"
+ }
+ mips_make_test_option options "-mnan=2008"
# Check whether we need to switch from a 32-bit processor to the
# "nearest" 64-bit processor.
} elseif { $gp_size == 64 && [mips_32bit_arch_p $arch] } {
@@ -1096,6 +1192,10 @@ proc mips-dg-options { args } {
unset isa_rev
}
+ # Re-calculate the isa_rev for use in the abi handling code below
+ set arch [mips_option options arch]
+ set isa_rev [mips_arch_info $arch isa_rev]
+
# Set an appropriate ABI, handling dependencies between the pre-abi
# options and the abi options. This should mirror the abi and post-abi
# code below.
@@ -1121,6 +1221,9 @@ proc mips-dg-options { args } {
} elseif { [mips_have_test_option_p options "-mlong64"]
&& [mips_long32_abi_p $abi] } {
set force_abi 1
+ } elseif { [mips_have_test_option_p options "-mfpxx"]
+ && ![mips_same_option_p $abi "-mabi=32"] } {
+ set force_abi 1
} else {
set force_abi 0
}
@@ -1157,8 +1260,8 @@ proc mips-dg-options { args } {
if { $abi_test_option_p } {
if { $eabi_p } {
mips_make_test_option options "-mno-abicalls"
- if { $gp_size == 32 } {
- mips_make_test_option options "-mfp32"
+ if { $isa_rev < 6 && $gp_size == 32 } {
+ mips_make_test_option options "-mfp32"
}
}
if { [mips_using_mips16_p options]
@@ -1192,6 +1295,9 @@ proc mips-dg-options { args } {
}
if { $isa_rev < 1 } {
mips_make_test_option options "-mno-paired-single"
+ if { ![mips_have_test_option_p options "-mgp64"] } {
+ mips_make_test_option options "-mno-odd-spreg"
+ }
}
if { $isa_rev < 2 } {
if { $gp_size == 32 } {
@@ -1199,6 +1305,17 @@ proc mips-dg-options { args } {
}
mips_make_test_option options "-mno-dsp"
mips_make_test_option options "-mno-synci"
+ mips_make_test_option options "-mnan=legacy"
+ }
+ if { $isa_rev > 5 } {
+ mips_make_test_option options "-mno-mips16"
+ if { [mips_have_test_option_p options "-mdsp"] } {
+ mips_make_test_option options "-mfp64"
+ }
+ mips_make_test_option options "-mno-fix-r10000"
+ mips_make_test_option options "-mno-paired-single"
+ mips_make_test_option options "-mnan=2008"
+ mips_make_test_option options "-mabs=2008"
}
unset arch
unset isa
@@ -1222,6 +1339,7 @@ proc mips-dg-options { args } {
mips_option_dependency options "-mplt" "-mno-shared"
mips_option_dependency options "-mno-shared" "-fno-pic"
mips_option_dependency options "-mfp32" "-mno-paired-single"
+ mips_option_dependency options "-mfpxx" "-mno-paired-single"
mips_option_dependency options "-msoft-float" "-mno-paired-single"
mips_option_dependency options "-mno-paired-single" "-mno-mips3d"
@@ -1243,7 +1361,9 @@ proc mips-dg-options { args } {
foreach group $mips_abi_groups {
set old_option [mips_original_option $group]
set new_option [mips_option options $group]
- if { ![mips_same_option_p $old_option $new_option] } {
+ if { ![mips_same_option_p $old_option $new_option]
+ && ![mips_same_option_p $old_option "-mfpxx"]
+ && ![mips_same_option_p $new_option "-mfpxx"] } {
switch -- [lindex $do_what 0] {
link -
run {
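The mips.exp hunks above also handle the opposite direction: when the baseline architecture is an r6 one but a test needs something r6 removed (-mips16, -mmicromips, -mfp32, MOVN, MADD, paired-single, legacy NaN/abs encodings, or !HAS_LSA), mips-dg-options drops down to -mips32r5/-mips64r5 and adds -mnan=2008 so that the r6-only library multilibs still resolve. A hypothetical test that would trigger that downgrade:

/* Hypothetical sketch: a MIPS16 test.  On an r6 baseline, mips-dg-options
   rewrites the architecture to -mips32r5 (or -mips64r5) and adds
   -mnan=2008, because r6 no longer supports the MIPS16 ASE.  */
/* { dg-do compile } */
/* { dg-options "(-mips16)" } */

MIPS16 int
add1 (int x)
{
  return x + 1;
}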
diff --git a/gcc/testsuite/gcc.target/mips/mips16e-extends.c b/gcc/testsuite/gcc.target/mips/mips16e-extends.c
index d8946c979ae..ad5ba344cb6 100644
--- a/gcc/testsuite/gcc.target/mips/mips16e-extends.c
+++ b/gcc/testsuite/gcc.target/mips/mips16e-extends.c
@@ -2,6 +2,8 @@
/* { dg-options "(-mips16) isa_rev>=1 -mlong32" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+extern int validate ();
+
MIPS16 short cksum16 (unsigned long n)
{
unsigned long l;
diff --git a/gcc/testsuite/gcc.target/mips/mips64-dsp-ldx.c b/gcc/testsuite/gcc.target/mips/mips64-dsp-ldx.c
index 02e6166577d..7370ad93c3e 100644
--- a/gcc/testsuite/gcc.target/mips/mips64-dsp-ldx.c
+++ b/gcc/testsuite/gcc.target/mips/mips64-dsp-ldx.c
@@ -1,6 +1,6 @@
/* Test MIPS64 DSP instructions */
/* { dg-do compile } */
-/* { dg-options "-mgp64 -mdsp" } */
+/* { dg-options "-mgp64 -mdsp (!HAS_LSA)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tldx\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/mips64-lsa.c b/gcc/testsuite/gcc.target/mips/mips64-lsa.c
new file mode 100644
index 00000000000..7d77bca47e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/mips64-lsa.c
@@ -0,0 +1,11 @@
+/* Test MIPS64 indexed addressing - should use DLSA instead of LDX */
+/* { dg-do compile } */
+/* { dg-options "-mabi=64 (HAS_LSA)" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+
+/* { dg-final { scan-assembler "\tdlsa\t" } } */
+
+NOMIPS16 signed long long test (signed long long *a, int index)
+{
+ return a[index];
+}
diff --git a/gcc/testsuite/gcc.target/mips/movcc-1.c b/gcc/testsuite/gcc.target/mips/movcc-1.c
index b3fe188d2c0..7943fecbcef 100644
--- a/gcc/testsuite/gcc.target/mips/movcc-1.c
+++ b/gcc/testsuite/gcc.target/mips/movcc-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "isa>=4" } */
+/* { dg-options "(HAS_MOVN)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tmovz\t" } } */
/* { dg-final { scan-assembler "\tmovn\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/movcc-2.c b/gcc/testsuite/gcc.target/mips/movcc-2.c
index 2638d51fd6c..1926e6460d1 100644
--- a/gcc/testsuite/gcc.target/mips/movcc-2.c
+++ b/gcc/testsuite/gcc.target/mips/movcc-2.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "isa>=4" } */
+/* { dg-options "(HAS_MOVN)" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tmovz\t" } } */
/* { dg-final { scan-assembler "\tmovn\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/movcc-3.c b/gcc/testsuite/gcc.target/mips/movcc-3.c
index f356465c887..55434b72c72 100644
--- a/gcc/testsuite/gcc.target/mips/movcc-3.c
+++ b/gcc/testsuite/gcc.target/mips/movcc-3.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "isa>=4 -mhard-float" } */
+/* { dg-options "(HAS_MOVN) -mhard-float" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tmovt\t" } } */
/* { dg-final { scan-assembler "\tmovf\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/movdf-1.c b/gcc/testsuite/gcc.target/mips/movdf-1.c
new file mode 100644
index 00000000000..f0267d00e97
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/movdf-1.c
@@ -0,0 +1,14 @@
+/* Check that we move DFmode values via memory between FP and GP. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -mfpxx isa=2" } */
+
+void bar (void);
+
+double
+foo (int x, double a)
+{
+ return a;
+}
+/* { dg-final { scan-assembler-not "mthc1" } } */
+/* { dg-final { scan-assembler-not "mtc1" } } */
+/* { dg-final { scan-assembler-times "ldc1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/mips/movdf-2.c b/gcc/testsuite/gcc.target/mips/movdf-2.c
new file mode 100644
index 00000000000..175b61c7e77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/movdf-2.c
@@ -0,0 +1,14 @@
+/* Check that we move DFmode values using mthc between FP and GP. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -mfpxx isa_rev=2" } */
+
+void bar (void);
+
+double
+foo (int x, double a)
+{
+ return a;
+}
+/* { dg-final { scan-assembler "mthc1" } } */
+/* { dg-final { scan-assembler "mtc1" } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
diff --git a/gcc/testsuite/gcc.target/mips/movdf-3.c b/gcc/testsuite/gcc.target/mips/movdf-3.c
new file mode 100644
index 00000000000..5db52c9487b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/movdf-3.c
@@ -0,0 +1,13 @@
+/* Check that we move DFmode values using mtc1 between FP and GP. */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-options "-mabi=32 -mfp32 isa=2" } */
+
+void bar (void);
+
+double
+foo (int x, double a)
+{
+ return a;
+}
+/* { dg-final { scan-assembler-times "mtc1" 2 } } */
+/* { dg-final { scan-assembler-not "ldc1" } } */
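The three movdf-* tests compile one function under different FP/ISA combinations to check how a DFmode argument is transferred from GPRs into an FPR. A hypothetical annotated sketch summarising what each variant's scans demand:

/* Hypothetical summary of the movdf-* expectations.  */
double
pass_through (int x, double a)
{
  /* movdf-1.c: -mfpxx isa=2     -> no mtc1/mthc1; 'a' goes through memory
                                    and is loaded with ldc1.
     movdf-2.c: -mfpxx isa_rev=2 -> mtc1 + mthc1 move the two halves
                                    directly; no ldc1.
     movdf-3.c: -mfp32 isa=2     -> two mtc1 instructions, one per 32-bit
                                    half; no ldc1.                         */
  return a;
}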
diff --git a/gcc/testsuite/gcc.target/mips/msa-builtins.c b/gcc/testsuite/gcc.target/mips/msa-builtins.c
new file mode 100644
index 00000000000..397c814c26d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/msa-builtins.c
@@ -0,0 +1,1083 @@
+/* Test builtins for MIPS MSA ASE instructions */
+/* { dg-do compile } */
+/* { dg-options "-mfp64 -mhard-float -mmsa" } */
+
+/* { dg-final { scan-assembler "msa_addv_b.*:.*addv\\.b.*msa_addv_b" } } */
+/* { dg-final { scan-assembler "msa_addv_h.*:.*addv\\.h.*msa_addv_h" } } */
+/* { dg-final { scan-assembler "msa_addv_w.*:.*addv\\.w.*msa_addv_w" } } */
+/* { dg-final { scan-assembler "msa_addv_d.*:.*addv\\.d.*msa_addv_d" } } */
+/* { dg-final { scan-assembler "msa_addvi_b.*:.*addvi\\.b.*msa_addvi_b" } } */
+/* { dg-final { scan-assembler "msa_addvi_h.*:.*addvi\\.h.*msa_addvi_h" } } */
+/* { dg-final { scan-assembler "msa_addvi_w.*:.*addvi\\.w.*msa_addvi_w" } } */
+/* { dg-final { scan-assembler "msa_addvi_d.*:.*addvi\\.d.*msa_addvi_d" } } */
+/* { dg-final { scan-assembler "msa_add_a_b.*:.*add_a\\.b.*msa_add_a_b" } } */
+/* { dg-final { scan-assembler "msa_add_a_h.*:.*add_a\\.h.*msa_add_a_h" } } */
+/* { dg-final { scan-assembler "msa_add_a_w.*:.*add_a\\.w.*msa_add_a_w" } } */
+/* { dg-final { scan-assembler "msa_add_a_d.*:.*add_a\\.d.*msa_add_a_d" } } */
+/* { dg-final { scan-assembler "msa_adds_a_b.*:.*adds_a\\.b.*msa_adds_a_b" } } */
+/* { dg-final { scan-assembler "msa_adds_a_h.*:.*adds_a\\.h.*msa_adds_a_h" } } */
+/* { dg-final { scan-assembler "msa_adds_a_w.*:.*adds_a\\.w.*msa_adds_a_w" } } */
+/* { dg-final { scan-assembler "msa_adds_a_d.*:.*adds_a\\.d.*msa_adds_a_d" } } */
+/* { dg-final { scan-assembler "msa_adds_s_b.*:.*adds_s\\.b.*msa_adds_s_b" } } */
+/* { dg-final { scan-assembler "msa_adds_s_h.*:.*adds_s\\.h.*msa_adds_s_h" } } */
+/* { dg-final { scan-assembler "msa_adds_s_w.*:.*adds_s\\.w.*msa_adds_s_w" } } */
+/* { dg-final { scan-assembler "msa_adds_s_d.*:.*adds_s\\.d.*msa_adds_s_d" } } */
+/* { dg-final { scan-assembler "msa_adds_u_b.*:.*adds_u\\.b.*msa_adds_u_b" } } */
+/* { dg-final { scan-assembler "msa_adds_u_h.*:.*adds_u\\.h.*msa_adds_u_h" } } */
+/* { dg-final { scan-assembler "msa_adds_u_w.*:.*adds_u\\.w.*msa_adds_u_w" } } */
+/* { dg-final { scan-assembler "msa_adds_u_d.*:.*adds_u\\.d.*msa_adds_u_d" } } */
+/* { dg-final { scan-assembler "msa_hadd_s_h.*:.*hadd_s\\.h.*msa_hadd_s_h" } } */
+/* { dg-final { scan-assembler "msa_hadd_s_w.*:.*hadd_s\\.w.*msa_hadd_s_w" } } */
+/* { dg-final { scan-assembler "msa_hadd_s_d.*:.*hadd_s\\.d.*msa_hadd_s_d" } } */
+/* { dg-final { scan-assembler "msa_hadd_u_h.*:.*hadd_u\\.h.*msa_hadd_u_h" } } */
+/* { dg-final { scan-assembler "msa_hadd_u_w.*:.*hadd_u\\.w.*msa_hadd_u_w" } } */
+/* { dg-final { scan-assembler "msa_hadd_u_d.*:.*hadd_u\\.d.*msa_hadd_u_d" } } */
+/* { dg-final { scan-assembler "msa_asub_s_b.*:.*asub_s\\.b.*msa_asub_s_b" } } */
+/* { dg-final { scan-assembler "msa_asub_s_h.*:.*asub_s\\.h.*msa_asub_s_h" } } */
+/* { dg-final { scan-assembler "msa_asub_s_w.*:.*asub_s\\.w.*msa_asub_s_w" } } */
+/* { dg-final { scan-assembler "msa_asub_s_d.*:.*asub_s\\.d.*msa_asub_s_d" } } */
+/* { dg-final { scan-assembler "msa_asub_u_b.*:.*asub_u\\.b.*msa_asub_u_b" } } */
+/* { dg-final { scan-assembler "msa_asub_u_h.*:.*asub_u\\.h.*msa_asub_u_h" } } */
+/* { dg-final { scan-assembler "msa_asub_u_w.*:.*asub_u\\.w.*msa_asub_u_w" } } */
+/* { dg-final { scan-assembler "msa_asub_u_d.*:.*asub_u\\.d.*msa_asub_u_d" } } */
+/* { dg-final { scan-assembler "msa_ave_s_b.*:.*ave_s\\.b.*msa_ave_s_b" } } */
+/* { dg-final { scan-assembler "msa_ave_s_h.*:.*ave_s\\.h.*msa_ave_s_h" } } */
+/* { dg-final { scan-assembler "msa_ave_s_w.*:.*ave_s\\.w.*msa_ave_s_w" } } */
+/* { dg-final { scan-assembler "msa_ave_s_d.*:.*ave_s\\.d.*msa_ave_s_d" } } */
+/* { dg-final { scan-assembler "msa_ave_u_b.*:.*ave_u\\.b.*msa_ave_u_b" } } */
+/* { dg-final { scan-assembler "msa_ave_u_h.*:.*ave_u\\.h.*msa_ave_u_h" } } */
+/* { dg-final { scan-assembler "msa_ave_u_w.*:.*ave_u\\.w.*msa_ave_u_w" } } */
+/* { dg-final { scan-assembler "msa_ave_u_d.*:.*ave_u\\.d.*msa_ave_u_d" } } */
+/* { dg-final { scan-assembler "msa_aver_s_b.*:.*aver_s\\.b.*msa_aver_s_b" } } */
+/* { dg-final { scan-assembler "msa_aver_s_h.*:.*aver_s\\.h.*msa_aver_s_h" } } */
+/* { dg-final { scan-assembler "msa_aver_s_w.*:.*aver_s\\.w.*msa_aver_s_w" } } */
+/* { dg-final { scan-assembler "msa_aver_s_d.*:.*aver_s\\.d.*msa_aver_s_d" } } */
+/* { dg-final { scan-assembler "msa_aver_u_b.*:.*aver_u\\.b.*msa_aver_u_b" } } */
+/* { dg-final { scan-assembler "msa_aver_u_h.*:.*aver_u\\.h.*msa_aver_u_h" } } */
+/* { dg-final { scan-assembler "msa_aver_u_w.*:.*aver_u\\.w.*msa_aver_u_w" } } */
+/* { dg-final { scan-assembler "msa_aver_u_d.*:.*aver_u\\.d.*msa_aver_u_d" } } */
+/* { dg-final { scan-assembler "msa_dotp_s_h.*:.*dotp_s\\.h.*msa_dotp_s_h" } } */
+/* { dg-final { scan-assembler "msa_dotp_s_w.*:.*dotp_s\\.w.*msa_dotp_s_w" } } */
+/* { dg-final { scan-assembler "msa_dotp_s_d.*:.*dotp_s\\.d.*msa_dotp_s_d" } } */
+/* { dg-final { scan-assembler "msa_dotp_u_h.*:.*dotp_u\\.h.*msa_dotp_u_h" } } */
+/* { dg-final { scan-assembler "msa_dotp_u_w.*:.*dotp_u\\.w.*msa_dotp_u_w" } } */
+/* { dg-final { scan-assembler "msa_dotp_u_d.*:.*dotp_u\\.d.*msa_dotp_u_d" } } */
+/* { dg-final { scan-assembler "msa_dpadd_s_h.*:.*dpadd_s\\.h.*msa_dpadd_s_h" } } */
+/* { dg-final { scan-assembler "msa_dpadd_s_w.*:.*dpadd_s\\.w.*msa_dpadd_s_w" } } */
+/* { dg-final { scan-assembler "msa_dpadd_s_d.*:.*dpadd_s\\.d.*msa_dpadd_s_d" } } */
+/* { dg-final { scan-assembler "msa_dpadd_u_h.*:.*dpadd_u\\.h.*msa_dpadd_u_h" } } */
+/* { dg-final { scan-assembler "msa_dpadd_u_w.*:.*dpadd_u\\.w.*msa_dpadd_u_w" } } */
+/* { dg-final { scan-assembler "msa_dpadd_u_d.*:.*dpadd_u\\.d.*msa_dpadd_u_d" } } */
+/* { dg-final { scan-assembler "msa_dpsub_s_h.*:.*dpsub_s\\.h.*msa_dpsub_s_h" } } */
+/* { dg-final { scan-assembler "msa_dpsub_s_w.*:.*dpsub_s\\.w.*msa_dpsub_s_w" } } */
+/* { dg-final { scan-assembler "msa_dpsub_s_d.*:.*dpsub_s\\.d.*msa_dpsub_s_d" } } */
+/* { dg-final { scan-assembler "msa_dpsub_u_h.*:.*dpsub_u\\.h.*msa_dpsub_u_h" } } */
+/* { dg-final { scan-assembler "msa_dpsub_u_w.*:.*dpsub_u\\.w.*msa_dpsub_u_w" } } */
+/* { dg-final { scan-assembler "msa_dpsub_u_d.*:.*dpsub_u\\.d.*msa_dpsub_u_d" } } */
+/* { dg-final { scan-assembler "msa_div_s_b.*:.*div_s\\.b.*msa_div_s_b" } } */
+/* { dg-final { scan-assembler "msa_div_s_h.*:.*div_s\\.h.*msa_div_s_h" } } */
+/* { dg-final { scan-assembler "msa_div_s_w.*:.*div_s\\.w.*msa_div_s_w" } } */
+/* { dg-final { scan-assembler "msa_div_s_d.*:.*div_s\\.d.*msa_div_s_d" } } */
+/* { dg-final { scan-assembler "msa_div_u_b.*:.*div_u\\.b.*msa_div_u_b" } } */
+/* { dg-final { scan-assembler "msa_div_u_h.*:.*div_u\\.h.*msa_div_u_h" } } */
+/* { dg-final { scan-assembler "msa_div_u_w.*:.*div_u\\.w.*msa_div_u_w" } } */
+/* { dg-final { scan-assembler "msa_div_u_d.*:.*div_u\\.d.*msa_div_u_d" } } */
+/* { dg-final { scan-assembler "msa_maddv_b.*:.*maddv\\.b.*msa_maddv_b" } } */
+/* { dg-final { scan-assembler "msa_maddv_h.*:.*maddv\\.h.*msa_maddv_h" } } */
+/* { dg-final { scan-assembler "msa_maddv_w.*:.*maddv\\.w.*msa_maddv_w" } } */
+/* { dg-final { scan-assembler "msa_maddv_d.*:.*maddv\\.d.*msa_maddv_d" } } */
+/* { dg-final { scan-assembler "msa_max_a_b.*:.*max_a\\.b.*msa_max_a_b" } } */
+/* { dg-final { scan-assembler "msa_max_a_h.*:.*max_a\\.h.*msa_max_a_h" } } */
+/* { dg-final { scan-assembler "msa_max_a_w.*:.*max_a\\.w.*msa_max_a_w" } } */
+/* { dg-final { scan-assembler "msa_max_a_d.*:.*max_a\\.d.*msa_max_a_d" } } */
+/* { dg-final { scan-assembler "msa_min_a_b.*:.*min_a\\.b.*msa_min_a_b" } } */
+/* { dg-final { scan-assembler "msa_min_a_h.*:.*min_a\\.h.*msa_min_a_h" } } */
+/* { dg-final { scan-assembler "msa_min_a_w.*:.*min_a\\.w.*msa_min_a_w" } } */
+/* { dg-final { scan-assembler "msa_min_a_d.*:.*min_a\\.d.*msa_min_a_d" } } */
+/* { dg-final { scan-assembler "msa_max_s_b.*:.*max_s\\.b.*msa_max_s_b" } } */
+/* { dg-final { scan-assembler "msa_max_s_h.*:.*max_s\\.h.*msa_max_s_h" } } */
+/* { dg-final { scan-assembler "msa_max_s_w.*:.*max_s\\.w.*msa_max_s_w" } } */
+/* { dg-final { scan-assembler "msa_max_s_d.*:.*max_s\\.d.*msa_max_s_d" } } */
+/* { dg-final { scan-assembler "msa_maxi_s_b.*:.*maxi_s\\.b.*msa_maxi_s_b" } } */
+/* { dg-final { scan-assembler "msa_maxi_s_h.*:.*maxi_s\\.h.*msa_maxi_s_h" } } */
+/* { dg-final { scan-assembler "msa_maxi_s_w.*:.*maxi_s\\.w.*msa_maxi_s_w" } } */
+/* { dg-final { scan-assembler "msa_maxi_s_d.*:.*maxi_s\\.d.*msa_maxi_s_d" } } */
+/* { dg-final { scan-assembler "msa_max_u_b.*:.*max_u\\.b.*msa_max_u_b" } } */
+/* { dg-final { scan-assembler "msa_max_u_h.*:.*max_u\\.h.*msa_max_u_h" } } */
+/* { dg-final { scan-assembler "msa_max_u_w.*:.*max_u\\.w.*msa_max_u_w" } } */
+/* { dg-final { scan-assembler "msa_max_u_d.*:.*max_u\\.d.*msa_max_u_d" } } */
+/* { dg-final { scan-assembler "msa_maxi_u_b.*:.*maxi_u\\.b.*msa_maxi_u_b" } } */
+/* { dg-final { scan-assembler "msa_maxi_u_h.*:.*maxi_u\\.h.*msa_maxi_u_h" } } */
+/* { dg-final { scan-assembler "msa_maxi_u_w.*:.*maxi_u\\.w.*msa_maxi_u_w" } } */
+/* { dg-final { scan-assembler "msa_maxi_u_d.*:.*maxi_u\\.d.*msa_maxi_u_d" } } */
+/* { dg-final { scan-assembler "msa_min_s_b.*:.*min_s\\.b.*msa_min_s_b" } } */
+/* { dg-final { scan-assembler "msa_min_s_h.*:.*min_s\\.h.*msa_min_s_h" } } */
+/* { dg-final { scan-assembler "msa_min_s_w.*:.*min_s\\.w.*msa_min_s_w" } } */
+/* { dg-final { scan-assembler "msa_min_s_d.*:.*min_s\\.d.*msa_min_s_d" } } */
+/* { dg-final { scan-assembler "msa_mini_s_b.*:.*mini_s\\.b.*msa_mini_s_b" } } */
+/* { dg-final { scan-assembler "msa_mini_s_h.*:.*mini_s\\.h.*msa_mini_s_h" } } */
+/* { dg-final { scan-assembler "msa_mini_s_w.*:.*mini_s\\.w.*msa_mini_s_w" } } */
+/* { dg-final { scan-assembler "msa_mini_s_d.*:.*mini_s\\.d.*msa_mini_s_d" } } */
+/* { dg-final { scan-assembler "msa_min_u_b.*:.*min_u\\.b.*msa_min_u_b" } } */
+/* { dg-final { scan-assembler "msa_min_u_h.*:.*min_u\\.h.*msa_min_u_h" } } */
+/* { dg-final { scan-assembler "msa_min_u_w.*:.*min_u\\.w.*msa_min_u_w" } } */
+/* { dg-final { scan-assembler "msa_min_u_d.*:.*min_u\\.d.*msa_min_u_d" } } */
+/* { dg-final { scan-assembler "msa_mini_u_b.*:.*mini_u\\.b.*msa_mini_u_b" } } */
+/* { dg-final { scan-assembler "msa_mini_u_h.*:.*mini_u\\.h.*msa_mini_u_h" } } */
+/* { dg-final { scan-assembler "msa_mini_u_w.*:.*mini_u\\.w.*msa_mini_u_w" } } */
+/* { dg-final { scan-assembler "msa_mini_u_d.*:.*mini_u\\.d.*msa_mini_u_d" } } */
+/* { dg-final { scan-assembler "msa_msubv_b.*:.*msubv\\.b.*msa_msubv_b" } } */
+/* { dg-final { scan-assembler "msa_msubv_h.*:.*msubv\\.h.*msa_msubv_h" } } */
+/* { dg-final { scan-assembler "msa_msubv_w.*:.*msubv\\.w.*msa_msubv_w" } } */
+/* { dg-final { scan-assembler "msa_msubv_d.*:.*msubv\\.d.*msa_msubv_d" } } */
+/* { dg-final { scan-assembler "msa_mulv_b.*:.*mulv\\.b.*msa_mulv_b" } } */
+/* { dg-final { scan-assembler "msa_mulv_h.*:.*mulv\\.h.*msa_mulv_h" } } */
+/* { dg-final { scan-assembler "msa_mulv_w.*:.*mulv\\.w.*msa_mulv_w" } } */
+/* { dg-final { scan-assembler "msa_mulv_d.*:.*mulv\\.d.*msa_mulv_d" } } */
+/* { dg-final { scan-assembler "msa_mod_s_b.*:.*mod_s\\.b.*msa_mod_s_b" } } */
+/* { dg-final { scan-assembler "msa_mod_s_h.*:.*mod_s\\.h.*msa_mod_s_h" } } */
+/* { dg-final { scan-assembler "msa_mod_s_w.*:.*mod_s\\.w.*msa_mod_s_w" } } */
+/* { dg-final { scan-assembler "msa_mod_s_d.*:.*mod_s\\.d.*msa_mod_s_d" } } */
+/* { dg-final { scan-assembler "msa_mod_u_b.*:.*mod_u\\.b.*msa_mod_u_b" } } */
+/* { dg-final { scan-assembler "msa_mod_u_h.*:.*mod_u\\.h.*msa_mod_u_h" } } */
+/* { dg-final { scan-assembler "msa_mod_u_w.*:.*mod_u\\.w.*msa_mod_u_w" } } */
+/* { dg-final { scan-assembler "msa_mod_u_d.*:.*mod_u\\.d.*msa_mod_u_d" } } */
+/* { dg-final { scan-assembler "msa_sat_s_b.*:.*sat_s\\.b.*msa_sat_s_b" } } */
+/* { dg-final { scan-assembler "msa_sat_s_h.*:.*sat_s\\.h.*msa_sat_s_h" } } */
+/* { dg-final { scan-assembler "msa_sat_s_w.*:.*sat_s\\.w.*msa_sat_s_w" } } */
+/* { dg-final { scan-assembler "msa_sat_s_d.*:.*sat_s\\.d.*msa_sat_s_d" } } */
+/* { dg-final { scan-assembler "msa_sat_u_b.*:.*sat_u\\.b.*msa_sat_u_b" } } */
+/* { dg-final { scan-assembler "msa_sat_u_h.*:.*sat_u\\.h.*msa_sat_u_h" } } */
+/* { dg-final { scan-assembler "msa_sat_u_w.*:.*sat_u\\.w.*msa_sat_u_w" } } */
+/* { dg-final { scan-assembler "msa_sat_u_d.*:.*sat_u\\.d.*msa_sat_u_d" } } */
+/* { dg-final { scan-assembler "msa_subs_s_b.*:.*subs_s\\.b.*msa_subs_s_b" } } */
+/* { dg-final { scan-assembler "msa_subs_s_h.*:.*subs_s\\.h.*msa_subs_s_h" } } */
+/* { dg-final { scan-assembler "msa_subs_s_w.*:.*subs_s\\.w.*msa_subs_s_w" } } */
+/* { dg-final { scan-assembler "msa_subs_s_d.*:.*subs_s\\.d.*msa_subs_s_d" } } */
+/* { dg-final { scan-assembler "msa_subs_u_b.*:.*subs_u\\.b.*msa_subs_u_b" } } */
+/* { dg-final { scan-assembler "msa_subs_u_h.*:.*subs_u\\.h.*msa_subs_u_h" } } */
+/* { dg-final { scan-assembler "msa_subs_u_w.*:.*subs_u\\.w.*msa_subs_u_w" } } */
+/* { dg-final { scan-assembler "msa_subs_u_d.*:.*subs_u\\.d.*msa_subs_u_d" } } */
+/* { dg-final { scan-assembler "msa_hsub_s_h.*:.*hsub_s\\.h.*msa_hsub_s_h" } } */
+/* { dg-final { scan-assembler "msa_hsub_s_w.*:.*hsub_s\\.w.*msa_hsub_s_w" } } */
+/* { dg-final { scan-assembler "msa_hsub_s_d.*:.*hsub_s\\.d.*msa_hsub_s_d" } } */
+/* { dg-final { scan-assembler "msa_hsub_u_h.*:.*hsub_u\\.h.*msa_hsub_u_h" } } */
+/* { dg-final { scan-assembler "msa_hsub_u_w.*:.*hsub_u\\.w.*msa_hsub_u_w" } } */
+/* { dg-final { scan-assembler "msa_hsub_u_d.*:.*hsub_u\\.d.*msa_hsub_u_d" } } */
+/* { dg-final { scan-assembler "msa_subsuu_s_b.*:.*subsuu_s\\.b.*msa_subsuu_s_b" } } */
+/* { dg-final { scan-assembler "msa_subsuu_s_h.*:.*subsuu_s\\.h.*msa_subsuu_s_h" } } */
+/* { dg-final { scan-assembler "msa_subsuu_s_w.*:.*subsuu_s\\.w.*msa_subsuu_s_w" } } */
+/* { dg-final { scan-assembler "msa_subsuu_s_d.*:.*subsuu_s\\.d.*msa_subsuu_s_d" } } */
+/* { dg-final { scan-assembler "msa_subsus_u_b.*:.*subsus_u\\.b.*msa_subsus_u_b" } } */
+/* { dg-final { scan-assembler "msa_subsus_u_h.*:.*subsus_u\\.h.*msa_subsus_u_h" } } */
+/* { dg-final { scan-assembler "msa_subsus_u_w.*:.*subsus_u\\.w.*msa_subsus_u_w" } } */
+/* { dg-final { scan-assembler "msa_subsus_u_d.*:.*subsus_u\\.d.*msa_subsus_u_d" } } */
+/* { dg-final { scan-assembler "msa_subv_b.*:.*subv\\.b.*msa_subv_b" } } */
+/* { dg-final { scan-assembler "msa_subv_h.*:.*subv\\.h.*msa_subv_h" } } */
+/* { dg-final { scan-assembler "msa_subv_w.*:.*subv\\.w.*msa_subv_w" } } */
+/* { dg-final { scan-assembler "msa_subv_d.*:.*subv\\.d.*msa_subv_d" } } */
+/* { dg-final { scan-assembler "msa_subvi_b.*:.*subvi\\.b.*msa_subvi_b" } } */
+/* { dg-final { scan-assembler "msa_subvi_h.*:.*subvi\\.h.*msa_subvi_h" } } */
+/* { dg-final { scan-assembler "msa_subvi_w.*:.*subvi\\.w.*msa_subvi_w" } } */
+/* { dg-final { scan-assembler "msa_subvi_d.*:.*subvi\\.d.*msa_subvi_d" } } */
+/* { dg-final { scan-assembler "msa_and_v.*:.*and\\.v.*msa_and_v" } } */
+/* { dg-final { scan-assembler "msa_andi_b.*:.*andi\\.b.*msa_andi_b" } } */
+/* { dg-final { scan-assembler "msa_bclr_b.*:.*bclr\\.b.*msa_bclr_b" } } */
+/* { dg-final { scan-assembler "msa_bclr_h.*:.*bclr\\.h.*msa_bclr_h" } } */
+/* { dg-final { scan-assembler "msa_bclr_w.*:.*bclr\\.w.*msa_bclr_w" } } */
+/* { dg-final { scan-assembler "msa_bclr_d.*:.*bclr\\.d.*msa_bclr_d" } } */
+/* { dg-final { scan-assembler "msa_bclri_b.*:.*bclri\\.b.*msa_bclri_b" } } */
+/* { dg-final { scan-assembler "msa_bclri_h.*:.*bclri\\.h.*msa_bclri_h" } } */
+/* { dg-final { scan-assembler "msa_bclri_w.*:.*bclri\\.w.*msa_bclri_w" } } */
+/* { dg-final { scan-assembler "msa_bclri_d.*:.*bclri\\.d.*msa_bclri_d" } } */
+/* { dg-final { scan-assembler "msa_binsl_b.*:.*binsl\\.b.*msa_binsl_b" } } */
+/* { dg-final { scan-assembler "msa_binsl_h.*:.*binsl\\.h.*msa_binsl_h" } } */
+/* { dg-final { scan-assembler "msa_binsl_w.*:.*binsl\\.w.*msa_binsl_w" } } */
+/* { dg-final { scan-assembler "msa_binsl_d.*:.*binsl\\.d.*msa_binsl_d" } } */
+/* { dg-final { scan-assembler "msa_binsli_b.*:.*binsli\\.b.*msa_binsli_b" } } */
+/* { dg-final { scan-assembler "msa_binsli_h.*:.*binsli\\.h.*msa_binsli_h" } } */
+/* { dg-final { scan-assembler "msa_binsli_w.*:.*binsli\\.w.*msa_binsli_w" } } */
+/* { dg-final { scan-assembler "msa_binsli_d.*:.*binsli\\.d.*msa_binsli_d" } } */
+/* { dg-final { scan-assembler "msa_binsr_b.*:.*binsr\\.b.*msa_binsr_b" } } */
+/* { dg-final { scan-assembler "msa_binsr_h.*:.*binsr\\.h.*msa_binsr_h" } } */
+/* { dg-final { scan-assembler "msa_binsr_w.*:.*binsr\\.w.*msa_binsr_w" } } */
+/* { dg-final { scan-assembler "msa_binsr_d.*:.*binsr\\.d.*msa_binsr_d" } } */
+/* { dg-final { scan-assembler "msa_binsri_b.*:.*binsri\\.b.*msa_binsri_b" } } */
+/* { dg-final { scan-assembler "msa_binsri_h.*:.*binsri\\.h.*msa_binsri_h" } } */
+/* { dg-final { scan-assembler "msa_binsri_w.*:.*binsri\\.w.*msa_binsri_w" } } */
+/* { dg-final { scan-assembler "msa_binsri_d.*:.*binsri\\.d.*msa_binsri_d" } } */
+/* { dg-final { scan-assembler "msa_bmnz_v.*:.*bmnz\\.v.*msa_bmnz_v" } } */
+/* { dg-final { scan-assembler "msa_bmnzi_b.*:.*bmnzi\\.b.*msa_bmnzi_b" } } */
+/* { dg-final { scan-assembler "msa_bmz_v.*:.*bmz\\.v.*msa_bmz_v" } } */
+/* { dg-final { scan-assembler "msa_bmzi_b.*:.*bmzi\\.b.*msa_bmzi_b" } } */
+/* { dg-final { scan-assembler "msa_bneg_b.*:.*bneg\\.b.*msa_bneg_b" } } */
+/* { dg-final { scan-assembler "msa_bneg_h.*:.*bneg\\.h.*msa_bneg_h" } } */
+/* { dg-final { scan-assembler "msa_bneg_w.*:.*bneg\\.w.*msa_bneg_w" } } */
+/* { dg-final { scan-assembler "msa_bneg_d.*:.*bneg\\.d.*msa_bneg_d" } } */
+/* { dg-final { scan-assembler "msa_bnegi_b.*:.*bnegi\\.b.*msa_bnegi_b" } } */
+/* { dg-final { scan-assembler "msa_bnegi_h.*:.*bnegi\\.h.*msa_bnegi_h" } } */
+/* { dg-final { scan-assembler "msa_bnegi_w.*:.*bnegi\\.w.*msa_bnegi_w" } } */
+/* { dg-final { scan-assembler "msa_bnegi_d.*:.*bnegi\\.d.*msa_bnegi_d" } } */
+/* { dg-final { scan-assembler "msa_bsel_v.*:.*bsel\\.v.*msa_bsel_v" } } */
+/* { dg-final { scan-assembler "msa_bseli_b.*:.*bseli\\.b.*msa_bseli_b" } } */
+/* { dg-final { scan-assembler "msa_bset_b.*:.*bset\\.b.*msa_bset_b" } } */
+/* { dg-final { scan-assembler "msa_bset_h.*:.*bset\\.h.*msa_bset_h" } } */
+/* { dg-final { scan-assembler "msa_bset_w.*:.*bset\\.w.*msa_bset_w" } } */
+/* { dg-final { scan-assembler "msa_bset_d.*:.*bset\\.d.*msa_bset_d" } } */
+/* { dg-final { scan-assembler "msa_bseti_b.*:.*bseti\\.b.*msa_bseti_b" } } */
+/* { dg-final { scan-assembler "msa_bseti_h.*:.*bseti\\.h.*msa_bseti_h" } } */
+/* { dg-final { scan-assembler "msa_bseti_w.*:.*bseti\\.w.*msa_bseti_w" } } */
+/* { dg-final { scan-assembler "msa_bseti_d.*:.*bseti\\.d.*msa_bseti_d" } } */
+/* { dg-final { scan-assembler "msa_nloc_b.*:.*nloc\\.b.*msa_nloc_b" } } */
+/* { dg-final { scan-assembler "msa_nloc_h.*:.*nloc\\.h.*msa_nloc_h" } } */
+/* { dg-final { scan-assembler "msa_nloc_w.*:.*nloc\\.w.*msa_nloc_w" } } */
+/* { dg-final { scan-assembler "msa_nloc_d.*:.*nloc\\.d.*msa_nloc_d" } } */
+/* { dg-final { scan-assembler "msa_nlzc_b.*:.*nlzc\\.b.*msa_nlzc_b" } } */
+/* { dg-final { scan-assembler "msa_nlzc_h.*:.*nlzc\\.h.*msa_nlzc_h" } } */
+/* { dg-final { scan-assembler "msa_nlzc_w.*:.*nlzc\\.w.*msa_nlzc_w" } } */
+/* { dg-final { scan-assembler "msa_nlzc_d.*:.*nlzc\\.d.*msa_nlzc_d" } } */
+/* { dg-final { scan-assembler "msa_nor_v.*:.*nor\\.v.*msa_nor_v" } } */
+/* { dg-final { scan-assembler "msa_nori_b.*:.*nori\\.b.*msa_nori_b" } } */
+/* { dg-final { scan-assembler "msa_pcnt_b.*:.*pcnt\\.b.*msa_pcnt_b" } } */
+/* { dg-final { scan-assembler "msa_pcnt_h.*:.*pcnt\\.h.*msa_pcnt_h" } } */
+/* { dg-final { scan-assembler "msa_pcnt_w.*:.*pcnt\\.w.*msa_pcnt_w" } } */
+/* { dg-final { scan-assembler "msa_pcnt_d.*:.*pcnt\\.d.*msa_pcnt_d" } } */
+/* { dg-final { scan-assembler "msa_or_v.*:.*or\\.v.*msa_or_v" } } */
+/* { dg-final { scan-assembler "msa_ori_b.*:.*ori\\.b.*msa_ori_b" } } */
+/* { dg-final { scan-assembler "msa_xor_v.*:.*xor\\.v.*msa_xor_v" } } */
+/* { dg-final { scan-assembler "msa_xori_b.*:.*xori\\.b.*msa_xori_b" } } */
+/* { dg-final { scan-assembler "msa_sll_b.*:.*sll\\.b.*msa_sll_b" } } */
+/* { dg-final { scan-assembler "msa_sll_h.*:.*sll\\.h.*msa_sll_h" } } */
+/* { dg-final { scan-assembler "msa_sll_w.*:.*sll\\.w.*msa_sll_w" } } */
+/* { dg-final { scan-assembler "msa_sll_d.*:.*sll\\.d.*msa_sll_d" } } */
+/* { dg-final { scan-assembler "msa_slli_b.*:.*slli\\.b.*msa_slli_b" } } */
+/* { dg-final { scan-assembler "msa_slli_h.*:.*slli\\.h.*msa_slli_h" } } */
+/* { dg-final { scan-assembler "msa_slli_w.*:.*slli\\.w.*msa_slli_w" } } */
+/* { dg-final { scan-assembler "msa_slli_d.*:.*slli\\.d.*msa_slli_d" } } */
+/* { dg-final { scan-assembler "msa_sra_b.*:.*sra\\.b.*msa_sra_b" } } */
+/* { dg-final { scan-assembler "msa_sra_h.*:.*sra\\.h.*msa_sra_h" } } */
+/* { dg-final { scan-assembler "msa_sra_w.*:.*sra\\.w.*msa_sra_w" } } */
+/* { dg-final { scan-assembler "msa_sra_d.*:.*sra\\.d.*msa_sra_d" } } */
+/* { dg-final { scan-assembler "msa_srai_b.*:.*srai\\.b.*msa_srai_b" } } */
+/* { dg-final { scan-assembler "msa_srai_h.*:.*srai\\.h.*msa_srai_h" } } */
+/* { dg-final { scan-assembler "msa_srai_w.*:.*srai\\.w.*msa_srai_w" } } */
+/* { dg-final { scan-assembler "msa_srai_d.*:.*srai\\.d.*msa_srai_d" } } */
+/* { dg-final { scan-assembler "msa_srar_b.*:.*srar\\.b.*msa_srar_b" } } */
+/* { dg-final { scan-assembler "msa_srar_h.*:.*srar\\.h.*msa_srar_h" } } */
+/* { dg-final { scan-assembler "msa_srar_w.*:.*srar\\.w.*msa_srar_w" } } */
+/* { dg-final { scan-assembler "msa_srar_d.*:.*srar\\.d.*msa_srar_d" } } */
+/* { dg-final { scan-assembler "msa_srari_b.*:.*srari\\.b.*msa_srari_b" } } */
+/* { dg-final { scan-assembler "msa_srari_h.*:.*srari\\.h.*msa_srari_h" } } */
+/* { dg-final { scan-assembler "msa_srari_w.*:.*srari\\.w.*msa_srari_w" } } */
+/* { dg-final { scan-assembler "msa_srari_d.*:.*srari\\.d.*msa_srari_d" } } */
+/* { dg-final { scan-assembler "msa_srl_b.*:.*srl\\.b.*msa_srl_b" } } */
+/* { dg-final { scan-assembler "msa_srl_h.*:.*srl\\.h.*msa_srl_h" } } */
+/* { dg-final { scan-assembler "msa_srl_w.*:.*srl\\.w.*msa_srl_w" } } */
+/* { dg-final { scan-assembler "msa_srl_d.*:.*srl\\.d.*msa_srl_d" } } */
+/* { dg-final { scan-assembler "msa_srli_b.*:.*srli\\.b.*msa_srli_b" } } */
+/* { dg-final { scan-assembler "msa_srli_h.*:.*srli\\.h.*msa_srli_h" } } */
+/* { dg-final { scan-assembler "msa_srli_w.*:.*srli\\.w.*msa_srli_w" } } */
+/* { dg-final { scan-assembler "msa_srli_d.*:.*srli\\.d.*msa_srli_d" } } */
+/* { dg-final { scan-assembler "msa_srlr_b.*:.*srlr\\.b.*msa_srlr_b" } } */
+/* { dg-final { scan-assembler "msa_srlr_h.*:.*srlr\\.h.*msa_srlr_h" } } */
+/* { dg-final { scan-assembler "msa_srlr_w.*:.*srlr\\.w.*msa_srlr_w" } } */
+/* { dg-final { scan-assembler "msa_srlr_d.*:.*srlr\\.d.*msa_srlr_d" } } */
+/* { dg-final { scan-assembler "msa_srlri_b.*:.*srlri\\.b.*msa_srlri_b" } } */
+/* { dg-final { scan-assembler "msa_srlri_h.*:.*srlri\\.h.*msa_srlri_h" } } */
+/* { dg-final { scan-assembler "msa_srlri_w.*:.*srlri\\.w.*msa_srlri_w" } } */
+/* { dg-final { scan-assembler "msa_srlri_d.*:.*srlri\\.d.*msa_srlri_d" } } */
+/* { dg-final { scan-assembler "msa_fadd_w.*:.*fadd\\.w.*msa_fadd_w" } } */
+/* { dg-final { scan-assembler "msa_fadd_d.*:.*fadd\\.d.*msa_fadd_d" } } */
+/* { dg-final { scan-assembler "msa_fdiv_w.*:.*fdiv\\.w.*msa_fdiv_w" } } */
+/* { dg-final { scan-assembler "msa_fdiv_d.*:.*fdiv\\.d.*msa_fdiv_d" } } */
+/* { dg-final { scan-assembler "msa_fexp2_w.*:.*fexp2\\.w.*msa_fexp2_w" } } */
+/* { dg-final { scan-assembler "msa_fexp2_d.*:.*fexp2\\.d.*msa_fexp2_d" } } */
+/* { dg-final { scan-assembler "msa_flog2_w.*:.*flog2\\.w.*msa_flog2_w" } } */
+/* { dg-final { scan-assembler "msa_flog2_d.*:.*flog2\\.d.*msa_flog2_d" } } */
+/* { dg-final { scan-assembler "msa_fmadd_w.*:.*fmadd\\.w.*msa_fmadd_w" } } */
+/* { dg-final { scan-assembler "msa_fmadd_d.*:.*fmadd\\.d.*msa_fmadd_d" } } */
+/* { dg-final { scan-assembler "msa_fmsub_w.*:.*fmsub\\.w.*msa_fmsub_w" } } */
+/* { dg-final { scan-assembler "msa_fmsub_d.*:.*fmsub\\.d.*msa_fmsub_d" } } */
+/* { dg-final { scan-assembler "msa_fmax_w.*:.*fmax\\.w.*msa_fmax_w" } } */
+/* { dg-final { scan-assembler "msa_fmax_d.*:.*fmax\\.d.*msa_fmax_d" } } */
+/* { dg-final { scan-assembler "msa_fmin_w.*:.*fmin\\.w.*msa_fmin_w" } } */
+/* { dg-final { scan-assembler "msa_fmin_d.*:.*fmin\\.d.*msa_fmin_d" } } */
+/* { dg-final { scan-assembler "msa_fmax_a_w.*:.*fmax_a\\.w.*msa_fmax_a_w" } } */
+/* { dg-final { scan-assembler "msa_fmax_a_d.*:.*fmax_a\\.d.*msa_fmax_a_d" } } */
+/* { dg-final { scan-assembler "msa_fmin_a_w.*:.*fmin_a\\.w.*msa_fmin_a_w" } } */
+/* { dg-final { scan-assembler "msa_fmin_a_d.*:.*fmin_a\\.d.*msa_fmin_a_d" } } */
+/* { dg-final { scan-assembler "msa_fmul_w.*:.*fmul\\.w.*msa_fmul_w" } } */
+/* { dg-final { scan-assembler "msa_fmul_d.*:.*fmul\\.d.*msa_fmul_d" } } */
+/* { dg-final { scan-assembler "msa_frcp_w.*:.*frcp\\.w.*msa_frcp_w" } } */
+/* { dg-final { scan-assembler "msa_frcp_d.*:.*frcp\\.d.*msa_frcp_d" } } */
+/* { dg-final { scan-assembler "msa_frint_w.*:.*frint\\.w.*msa_frint_w" } } */
+/* { dg-final { scan-assembler "msa_frint_d.*:.*frint\\.d.*msa_frint_d" } } */
+/* { dg-final { scan-assembler "msa_frsqrt_w.*:.*frsqrt\\.w.*msa_frsqrt_w" } } */
+/* { dg-final { scan-assembler "msa_frsqrt_d.*:.*frsqrt\\.d.*msa_frsqrt_d" } } */
+/* { dg-final { scan-assembler "msa_fsqrt_w.*:.*fsqrt\\.w.*msa_fsqrt_w" } } */
+/* { dg-final { scan-assembler "msa_fsqrt_d.*:.*fsqrt\\.d.*msa_fsqrt_d" } } */
+/* { dg-final { scan-assembler "msa_fsub_w.*:.*fsub\\.w.*msa_fsub_w" } } */
+/* { dg-final { scan-assembler "msa_fsub_d.*:.*fsub\\.d.*msa_fsub_d" } } */
+/* { dg-final { scan-assembler "msa_fclass_w.*:.*fclass\\.w.*msa_fclass_w" } } */
+/* { dg-final { scan-assembler "msa_fclass_d.*:.*fclass\\.d.*msa_fclass_d" } } */
+/* { dg-final { scan-assembler "msa_fcaf_w.*:.*fcaf\\.w.*msa_fcaf_w" } } */
+/* { dg-final { scan-assembler "msa_fcaf_d.*:.*fcaf\\.d.*msa_fcaf_d" } } */
+/* { dg-final { scan-assembler "msa_fcun_w.*:.*fcun\\.w.*msa_fcun_w" } } */
+/* { dg-final { scan-assembler "msa_fcun_d.*:.*fcun\\.d.*msa_fcun_d" } } */
+/* { dg-final { scan-assembler "msa_fcor_w.*:.*fcor\\.w.*msa_fcor_w" } } */
+/* { dg-final { scan-assembler "msa_fcor_d.*:.*fcor\\.d.*msa_fcor_d" } } */
+/* { dg-final { scan-assembler "msa_fceq_w.*:.*fceq\\.w.*msa_fceq_w" } } */
+/* { dg-final { scan-assembler "msa_fceq_d.*:.*fceq\\.d.*msa_fceq_d" } } */
+/* { dg-final { scan-assembler "msa_fcune_w.*:.*fcune\\.w.*msa_fcune_w" } } */
+/* { dg-final { scan-assembler "msa_fcune_d.*:.*fcune\\.d.*msa_fcune_d" } } */
+/* { dg-final { scan-assembler "msa_fcueq_w.*:.*fcueq\\.w.*msa_fcueq_w" } } */
+/* { dg-final { scan-assembler "msa_fcueq_d.*:.*fcueq\\.d.*msa_fcueq_d" } } */
+/* { dg-final { scan-assembler "msa_fcne_w.*:.*fcne\\.w.*msa_fcne_w" } } */
+/* { dg-final { scan-assembler "msa_fcne_d.*:.*fcne\\.d.*msa_fcne_d" } } */
+/* { dg-final { scan-assembler "msa_fclt_w.*:.*fclt\\.w.*msa_fclt_w" } } */
+/* { dg-final { scan-assembler "msa_fclt_d.*:.*fclt\\.d.*msa_fclt_d" } } */
+/* { dg-final { scan-assembler "msa_fcult_w.*:.*fcult\\.w.*msa_fcult_w" } } */
+/* { dg-final { scan-assembler "msa_fcult_d.*:.*fcult\\.d.*msa_fcult_d" } } */
+/* { dg-final { scan-assembler "msa_fcle_w.*:.*fcle\\.w.*msa_fcle_w" } } */
+/* { dg-final { scan-assembler "msa_fcle_d.*:.*fcle\\.d.*msa_fcle_d" } } */
+/* { dg-final { scan-assembler "msa_fcule_w.*:.*fcule\\.w.*msa_fcule_w" } } */
+/* { dg-final { scan-assembler "msa_fcule_d.*:.*fcule\\.d.*msa_fcule_d" } } */
+/* { dg-final { scan-assembler "msa_fsaf_w.*:.*fsaf\\.w.*msa_fsaf_w" } } */
+/* { dg-final { scan-assembler "msa_fsaf_d.*:.*fsaf\\.d.*msa_fsaf_d" } } */
+/* { dg-final { scan-assembler "msa_fsun_w.*:.*fsun\\.w.*msa_fsun_w" } } */
+/* { dg-final { scan-assembler "msa_fsun_d.*:.*fsun\\.d.*msa_fsun_d" } } */
+/* { dg-final { scan-assembler "msa_fsor_w.*:.*fsor\\.w.*msa_fsor_w" } } */
+/* { dg-final { scan-assembler "msa_fsor_d.*:.*fsor\\.d.*msa_fsor_d" } } */
+/* { dg-final { scan-assembler "msa_fseq_w.*:.*fseq\\.w.*msa_fseq_w" } } */
+/* { dg-final { scan-assembler "msa_fseq_d.*:.*fseq\\.d.*msa_fseq_d" } } */
+/* { dg-final { scan-assembler "msa_fsune_w.*:.*fsune\\.w.*msa_fsune_w" } } */
+/* { dg-final { scan-assembler "msa_fsune_d.*:.*fsune\\.d.*msa_fsune_d" } } */
+/* { dg-final { scan-assembler "msa_fsueq_w.*:.*fsueq\\.w.*msa_fsueq_w" } } */
+/* { dg-final { scan-assembler "msa_fsueq_d.*:.*fsueq\\.d.*msa_fsueq_d" } } */
+/* { dg-final { scan-assembler "msa_fsne_w.*:.*fsne\\.w.*msa_fsne_w" } } */
+/* { dg-final { scan-assembler "msa_fsne_d.*:.*fsne\\.d.*msa_fsne_d" } } */
+/* { dg-final { scan-assembler "msa_fslt_w.*:.*fslt\\.w.*msa_fslt_w" } } */
+/* { dg-final { scan-assembler "msa_fslt_d.*:.*fslt\\.d.*msa_fslt_d" } } */
+/* { dg-final { scan-assembler "msa_fsult_w.*:.*fsult\\.w.*msa_fsult_w" } } */
+/* { dg-final { scan-assembler "msa_fsult_d.*:.*fsult\\.d.*msa_fsult_d" } } */
+/* { dg-final { scan-assembler "msa_fsle_w.*:.*fsle\\.w.*msa_fsle_w" } } */
+/* { dg-final { scan-assembler "msa_fsle_d.*:.*fsle\\.d.*msa_fsle_d" } } */
+/* { dg-final { scan-assembler "msa_fsule_w.*:.*fsule\\.w.*msa_fsule_w" } } */
+/* { dg-final { scan-assembler "msa_fsule_d.*:.*fsule\\.d.*msa_fsule_d" } } */
+/* { dg-final { scan-assembler "msa_fexupl_w.*:.*fexupl\\.w.*msa_fexupl_w" } } */
+/* { dg-final { scan-assembler "msa_fexupl_d.*:.*fexupl\\.d.*msa_fexupl_d" } } */
+/* { dg-final { scan-assembler "msa_fexupr_w.*:.*fexupr\\.w.*msa_fexupr_w" } } */
+/* { dg-final { scan-assembler "msa_fexupr_d.*:.*fexupr\\.d.*msa_fexupr_d" } } */
+/* { dg-final { scan-assembler "msa_fexdo_h.*:.*fexdo\\.h.*msa_fexdo_h" } } */
+/* { dg-final { scan-assembler "msa_fexdo_w.*:.*fexdo\\.w.*msa_fexdo_w" } } */
+/* { dg-final { scan-assembler "msa_ffint_s_w.*:.*ffint_s\\.w.*msa_ffint_s_w" } } */
+/* { dg-final { scan-assembler "msa_ffint_s_d.*:.*ffint_s\\.d.*msa_ffint_s_d" } } */
+/* { dg-final { scan-assembler "msa_ffint_u_w.*:.*ffint_u\\.w.*msa_ffint_u_w" } } */
+/* { dg-final { scan-assembler "msa_ffint_u_d.*:.*ffint_u\\.d.*msa_ffint_u_d" } } */
+/* { dg-final { scan-assembler "msa_ffql_w.*:.*ffql\\.w.*msa_ffql_w" } } */
+/* { dg-final { scan-assembler "msa_ffql_d.*:.*ffql\\.d.*msa_ffql_d" } } */
+/* { dg-final { scan-assembler "msa_ffqr_w.*:.*ffqr\\.w.*msa_ffqr_w" } } */
+/* { dg-final { scan-assembler "msa_ffqr_d.*:.*ffqr\\.d.*msa_ffqr_d" } } */
+/* { dg-final { scan-assembler "msa_ftint_s_w.*:.*ftint_s\\.w.*msa_ftint_s_w" } } */
+/* { dg-final { scan-assembler "msa_ftint_s_d.*:.*ftint_s\\.d.*msa_ftint_s_d" } } */
+/* { dg-final { scan-assembler "msa_ftint_u_w.*:.*ftint_u\\.w.*msa_ftint_u_w" } } */
+/* { dg-final { scan-assembler "msa_ftint_u_d.*:.*ftint_u\\.d.*msa_ftint_u_d" } } */
+/* { dg-final { scan-assembler "msa_ftrunc_s_w.*:.*ftrunc_s\\.w.*msa_ftrunc_s_w" } } */
+/* { dg-final { scan-assembler "msa_ftrunc_s_d.*:.*ftrunc_s\\.d.*msa_ftrunc_s_d" } } */
+/* { dg-final { scan-assembler "msa_ftrunc_u_w.*:.*ftrunc_u\\.w.*msa_ftrunc_u_w" } } */
+/* { dg-final { scan-assembler "msa_ftrunc_u_d.*:.*ftrunc_u\\.d.*msa_ftrunc_u_d" } } */
+/* { dg-final { scan-assembler "msa_ftq_h.*:.*ftq\\.h.*msa_ftq_h" } } */
+/* { dg-final { scan-assembler "msa_ftq_w.*:.*ftq\\.w.*msa_ftq_w" } } */
+/* { dg-final { scan-assembler "msa_madd_q_h.*:.*madd_q\\.h.*msa_madd_q_h" } } */
+/* { dg-final { scan-assembler "msa_madd_q_w.*:.*madd_q\\.w.*msa_madd_q_w" } } */
+/* { dg-final { scan-assembler "msa_maddr_q_h.*:.*maddr_q\\.h.*msa_maddr_q_h" } } */
+/* { dg-final { scan-assembler "msa_maddr_q_w.*:.*maddr_q\\.w.*msa_maddr_q_w" } } */
+/* { dg-final { scan-assembler "msa_msub_q_h.*:.*msub_q\\.h.*msa_msub_q_h" } } */
+/* { dg-final { scan-assembler "msa_msub_q_w.*:.*msub_q\\.w.*msa_msub_q_w" } } */
+/* { dg-final { scan-assembler "msa_msubr_q_h.*:.*msubr_q\\.h.*msa_msubr_q_h" } } */
+/* { dg-final { scan-assembler "msa_msubr_q_w.*:.*msubr_q\\.w.*msa_msubr_q_w" } } */
+/* { dg-final { scan-assembler "msa_mul_q_h.*:.*mul_q\\.h.*msa_mul_q_h" } } */
+/* { dg-final { scan-assembler "msa_mul_q_w.*:.*mul_q\\.w.*msa_mul_q_w" } } */
+/* { dg-final { scan-assembler "msa_mulr_q_h.*:.*mulr_q\\.h.*msa_mulr_q_h" } } */
+/* { dg-final { scan-assembler "msa_mulr_q_w.*:.*mulr_q\\.w.*msa_mulr_q_w" } } */
+/* { dg-final { scan-assembler "msa_ceq_b.*:.*ceq\\.b.*msa_ceq_b" } } */
+/* { dg-final { scan-assembler "msa_ceq_h.*:.*ceq\\.h.*msa_ceq_h" } } */
+/* { dg-final { scan-assembler "msa_ceq_w.*:.*ceq\\.w.*msa_ceq_w" } } */
+/* { dg-final { scan-assembler "msa_ceq_d.*:.*ceq\\.d.*msa_ceq_d" } } */
+/* { dg-final { scan-assembler "msa_ceqi_b.*:.*ceqi\\.b.*msa_ceqi_b" } } */
+/* { dg-final { scan-assembler "msa_ceqi_h.*:.*ceqi\\.h.*msa_ceqi_h" } } */
+/* { dg-final { scan-assembler "msa_ceqi_w.*:.*ceqi\\.w.*msa_ceqi_w" } } */
+/* { dg-final { scan-assembler "msa_ceqi_d.*:.*ceqi\\.d.*msa_ceqi_d" } } */
+/* { dg-final { scan-assembler "msa_cle_s_b.*:.*cle_s\\.b.*msa_cle_s_b" } } */
+/* { dg-final { scan-assembler "msa_cle_s_h.*:.*cle_s\\.h.*msa_cle_s_h" } } */
+/* { dg-final { scan-assembler "msa_cle_s_w.*:.*cle_s\\.w.*msa_cle_s_w" } } */
+/* { dg-final { scan-assembler "msa_cle_s_d.*:.*cle_s\\.d.*msa_cle_s_d" } } */
+/* { dg-final { scan-assembler "msa_clei_s_b.*:.*clei_s\\.b.*msa_clei_s_b" } } */
+/* { dg-final { scan-assembler "msa_clei_s_h.*:.*clei_s\\.h.*msa_clei_s_h" } } */
+/* { dg-final { scan-assembler "msa_clei_s_w.*:.*clei_s\\.w.*msa_clei_s_w" } } */
+/* { dg-final { scan-assembler "msa_clei_s_d.*:.*clei_s\\.d.*msa_clei_s_d" } } */
+/* { dg-final { scan-assembler "msa_cle_u_b.*:.*cle_u\\.b.*msa_cle_u_b" } } */
+/* { dg-final { scan-assembler "msa_cle_u_h.*:.*cle_u\\.h.*msa_cle_u_h" } } */
+/* { dg-final { scan-assembler "msa_cle_u_w.*:.*cle_u\\.w.*msa_cle_u_w" } } */
+/* { dg-final { scan-assembler "msa_cle_u_d.*:.*cle_u\\.d.*msa_cle_u_d" } } */
+/* { dg-final { scan-assembler "msa_clei_u_b.*:.*clei_u\\.b.*msa_clei_u_b" } } */
+/* { dg-final { scan-assembler "msa_clei_u_h.*:.*clei_u\\.h.*msa_clei_u_h" } } */
+/* { dg-final { scan-assembler "msa_clei_u_w.*:.*clei_u\\.w.*msa_clei_u_w" } } */
+/* { dg-final { scan-assembler "msa_clei_u_d.*:.*clei_u\\.d.*msa_clei_u_d" } } */
+/* { dg-final { scan-assembler "msa_clt_s_b.*:.*clt_s\\.b.*msa_clt_s_b" } } */
+/* { dg-final { scan-assembler "msa_clt_s_h.*:.*clt_s\\.h.*msa_clt_s_h" } } */
+/* { dg-final { scan-assembler "msa_clt_s_w.*:.*clt_s\\.w.*msa_clt_s_w" } } */
+/* { dg-final { scan-assembler "msa_clt_s_d.*:.*clt_s\\.d.*msa_clt_s_d" } } */
+/* { dg-final { scan-assembler "msa_clti_s_b.*:.*clti_s\\.b.*msa_clti_s_b" } } */
+/* { dg-final { scan-assembler "msa_clti_s_h.*:.*clti_s\\.h.*msa_clti_s_h" } } */
+/* { dg-final { scan-assembler "msa_clti_s_w.*:.*clti_s\\.w.*msa_clti_s_w" } } */
+/* { dg-final { scan-assembler "msa_clti_s_d.*:.*clti_s\\.d.*msa_clti_s_d" } } */
+/* { dg-final { scan-assembler "msa_clt_u_b.*:.*clt_u\\.b.*msa_clt_u_b" } } */
+/* { dg-final { scan-assembler "msa_clt_u_h.*:.*clt_u\\.h.*msa_clt_u_h" } } */
+/* { dg-final { scan-assembler "msa_clt_u_w.*:.*clt_u\\.w.*msa_clt_u_w" } } */
+/* { dg-final { scan-assembler "msa_clt_u_d.*:.*clt_u\\.d.*msa_clt_u_d" } } */
+/* { dg-final { scan-assembler "msa_clti_u_b.*:.*clti_u\\.b.*msa_clti_u_b" } } */
+/* { dg-final { scan-assembler "msa_clti_u_h.*:.*clti_u\\.h.*msa_clti_u_h" } } */
+/* { dg-final { scan-assembler "msa_clti_u_w.*:.*clti_u\\.w.*msa_clti_u_w" } } */
+/* { dg-final { scan-assembler "msa_clti_u_d.*:.*clti_u\\.d.*msa_clti_u_d" } } */
+/* { dg-final { scan-assembler "msa_bnz_v.*:.*bnz\\.v.*msa_bnz_v" } } */
+/* { dg-final { scan-assembler "msa_bz_v.*:.*bz\\.v.*msa_bz_v" } } */
+/* { dg-final { scan-assembler "msa_bnz_b.*:.*bnz\\.b.*msa_bnz_b" } } */
+/* { dg-final { scan-assembler "msa_bnz_h.*:.*bnz\\.h.*msa_bnz_h" } } */
+/* { dg-final { scan-assembler "msa_bnz_w.*:.*bnz\\.w.*msa_bnz_w" } } */
+/* { dg-final { scan-assembler "msa_bnz_d.*:.*bnz\\.d.*msa_bnz_d" } } */
+/* { dg-final { scan-assembler "msa_bz_b.*:.*bz\\.b.*msa_bz_b" } } */
+/* { dg-final { scan-assembler "msa_bz_h.*:.*bz\\.h.*msa_bz_h" } } */
+/* { dg-final { scan-assembler "msa_bz_w.*:.*bz\\.w.*msa_bz_w" } } */
+/* { dg-final { scan-assembler "msa_bz_d.*:.*bz\\.d.*msa_bz_d" } } */
+/* { dg-final { scan-assembler "msa_cfcmsa.*:.*cfcmsa.*msa_cfcmsa" } } */
+/* { dg-final { scan-assembler "msa_ctcmsa.*:.*ctcmsa.*msa_ctcmsa" } } */
+/* { dg-final { scan-assembler "msa_ld_b.*:.*ld\\.b.*msa_ld_b" } } */
+/* { dg-final { scan-assembler "msa_ld_h.*:.*ld\\.h.*msa_ld_h" } } */
+/* { dg-final { scan-assembler "msa_ld_w.*:.*ld\\.w.*msa_ld_w" } } */
+/* { dg-final { scan-assembler "msa_ld_d.*:.*ld\\.d.*msa_ld_d" } } */
+/* { dg-final { scan-assembler "msa_ldi_b.*:.*ldi\\.b.*msa_ldi_b" } } */
+/* { dg-final { scan-assembler "msa_ldi_h.*:.*ldi\\.h.*msa_ldi_h" } } */
+/* { dg-final { scan-assembler "msa_ldi_w.*:.*ldi\\.w.*msa_ldi_w" } } */
+/* { dg-final { scan-assembler "msa_ldi_d.*:.*ldi\\.d.*msa_ldi_d" } } */
+/* Note: move.v is likely to be optimised out. */
+/* { dg-final { scan-assembler "msa_move_v.*:.*\(move\\.v\)?.*msa_move_v" } } */
+/* { dg-final { scan-assembler "msa_splat_b.*:.*splat\\.b.*msa_splat_b" } } */
+/* { dg-final { scan-assembler "msa_splat_h.*:.*splat\\.h.*msa_splat_h" } } */
+/* { dg-final { scan-assembler "msa_splat_w.*:.*splat\\.w.*msa_splat_w" } } */
+/* { dg-final { scan-assembler "msa_splat_d.*:.*splat\\.d.*msa_splat_d" } } */
+/* { dg-final { scan-assembler "msa_splati_b.*:.*splati\\.b.*msa_splati_b" } } */
+/* { dg-final { scan-assembler "msa_splati_h.*:.*splati\\.h.*msa_splati_h" } } */
+/* { dg-final { scan-assembler "msa_splati_w.*:.*splati\\.w.*msa_splati_w" } } */
+/* { dg-final { scan-assembler "msa_splati_d.*:.*splati\\.d.*msa_splati_d" } } */
+/* { dg-final { scan-assembler "msa_fill_b.*:.*fill\\.b.*msa_fill_b" } } */
+/* { dg-final { scan-assembler "msa_fill_h.*:.*fill\\.h.*msa_fill_h" } } */
+/* { dg-final { scan-assembler "msa_fill_w.*:.*fill\\.w.*msa_fill_w" } } */
+/* Note: some instructions are only available on MIPS64; on MIPS32 they are
+   replaced with equivalent instruction sequences. */
+/* { dg-final { scan-assembler "msa_fill_d.*:.*fill\\.d.*msa_fill_d" { target mips64 } } } */
+/* { dg-final { scan-assembler "msa_fill_d.*:.*fill\\.w.*\(insert.w.*\)\{2\}.*msa_fill_d" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "msa_insert_b.*:.*insert\\.b.*msa_insert_b" } } */
+/* { dg-final { scan-assembler "msa_insert_h.*:.*insert\\.h.*msa_insert_h" } } */
+/* { dg-final { scan-assembler "msa_insert_w.*:.*insert\\.w.*msa_insert_w" } } */
+/* { dg-final { scan-assembler "msa_insert_d.*:.*insert\\.d.*msa_insert_d" { target mips64 } } } */
+/* { dg-final { scan-assembler "msa_insert_d.*:.*sra.*\(insert.w.*\)\{2\}.*msa_insert_d" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "msa_insve_b.*:.*insve\\.b.*msa_insve_b" } } */
+/* { dg-final { scan-assembler "msa_insve_h.*:.*insve\\.h.*msa_insve_h" } } */
+/* { dg-final { scan-assembler "msa_insve_w.*:.*insve\\.w.*msa_insve_w" } } */
+/* { dg-final { scan-assembler "msa_insve_d.*:.*insve\\.d.*msa_insve_d" } } */
+/* { dg-final { scan-assembler "msa_copy_s_b.*:.*copy_s\\.b.*msa_copy_s_b" } } */
+/* { dg-final { scan-assembler "msa_copy_s_h.*:.*copy_s\\.h.*msa_copy_s_h" } } */
+/* { dg-final { scan-assembler "msa_copy_s_w.*:.*copy_s\\.w.*msa_copy_s_w" } } */
+/* { dg-final { scan-assembler "msa_copy_s_d.*:.*copy_s\\.d.*msa_copy_s_d" { target mips64 } } } */
+/* { dg-final { scan-assembler "msa_copy_s_d.*:.*\(copy_s\\.w.*\)\{2\}.*msa_copy_s_d" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "msa_copy_u_b.*:.*copy_u\\.b.*msa_copy_u_b" } } */
+/* { dg-final { scan-assembler "msa_copy_u_h.*:.*copy_u\\.h.*msa_copy_u_h" } } */
+/* { dg-final { scan-assembler "msa_copy_u_w.*:.*copy_u\\.w.*msa_copy_u_w" } } */
+/* { dg-final { scan-assembler "msa_copy_u_d.*:.*copy_u\\.d.*msa_copy_u_d" { target mips64 } } } */
+/* { dg-final { scan-assembler "msa_copy_u_d.*:.*\(copy_u\\.w.*\)\{2\}.*msa_copy_u_d" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "msa_st_b.*:.*st\\.b.*msa_st_b" } } */
+/* { dg-final { scan-assembler "msa_st_h.*:.*st\\.h.*msa_st_h" } } */
+/* { dg-final { scan-assembler "msa_st_w.*:.*st\\.w.*msa_st_w" } } */
+/* { dg-final { scan-assembler "msa_st_d.*:.*st\\.d.*msa_st_d" } } */
+/* { dg-final { scan-assembler "msa_ilvev_b.*:.*ilvev\\.b.*msa_ilvev_b" } } */
+/* { dg-final { scan-assembler "msa_ilvev_h.*:.*ilvev\\.h.*msa_ilvev_h" } } */
+/* { dg-final { scan-assembler "msa_ilvev_w.*:.*ilvev\\.w.*msa_ilvev_w" } } */
+/* Note: ilvev.d is equivalent to ilvr.d. */
+/* { dg-final { scan-assembler "msa_ilvev_d.*:.*\(ilvev|ilvr\)\\.d.*msa_ilvev_d" } } */
+/* { dg-final { scan-assembler "msa_ilvod_b.*:.*ilvod\\.b.*msa_ilvod_b" } } */
+/* { dg-final { scan-assembler "msa_ilvod_h.*:.*ilvod\\.h.*msa_ilvod_h" } } */
+/* { dg-final { scan-assembler "msa_ilvod_w.*:.*ilvod\\.w.*msa_ilvod_w" } } */
+/* Note: ilvod.d is equivalent to ilvl.d. */
+/* { dg-final { scan-assembler "msa_ilvod_d.*:.*\(ilvod|ilvl\)\\.d.*msa_ilvod_d" } } */
+/* { dg-final { scan-assembler "msa_ilvl_b.*:.*ilvl\\.b.*msa_ilvl_b" } } */
+/* { dg-final { scan-assembler "msa_ilvl_h.*:.*ilvl\\.h.*msa_ilvl_h" } } */
+/* { dg-final { scan-assembler "msa_ilvl_w.*:.*ilvl\\.w.*msa_ilvl_w" } } */
+/* { dg-final { scan-assembler "msa_ilvl_d.*:.*ilvl\\.d.*msa_ilvl_d" } } */
+/* { dg-final { scan-assembler "msa_ilvr_b.*:.*ilvr\\.b.*msa_ilvr_b" } } */
+/* { dg-final { scan-assembler "msa_ilvr_h.*:.*ilvr\\.h.*msa_ilvr_h" } } */
+/* { dg-final { scan-assembler "msa_ilvr_w.*:.*ilvr\\.w.*msa_ilvr_w" } } */
+/* { dg-final { scan-assembler "msa_ilvr_d.*:.*ilvr\\.d.*msa_ilvr_d" } } */
+/* { dg-final { scan-assembler "msa_pckev_b.*:.*pckev\\.b.*msa_pckev_b" } } */
+/* { dg-final { scan-assembler "msa_pckev_h.*:.*pckev\\.h.*msa_pckev_h" } } */
+/* { dg-final { scan-assembler "msa_pckev_w.*:.*pckev\\.w.*msa_pckev_w" } } */
+/* { dg-final { scan-assembler "msa_pckev_d.*:.*pckev\\.d.*msa_pckev_d" } } */
+/* { dg-final { scan-assembler "msa_pckod_b.*:.*pckod\\.b.*msa_pckod_b" } } */
+/* { dg-final { scan-assembler "msa_pckod_h.*:.*pckod\\.h.*msa_pckod_h" } } */
+/* { dg-final { scan-assembler "msa_pckod_w.*:.*pckod\\.w.*msa_pckod_w" } } */
+/* { dg-final { scan-assembler "msa_pckod_d.*:.*pckod\\.d.*msa_pckod_d" } } */
+/* { dg-final { scan-assembler "msa_shf_b.*:.*shf\\.b.*msa_shf_b" } } */
+/* { dg-final { scan-assembler "msa_shf_h.*:.*shf\\.h.*msa_shf_h" } } */
+/* { dg-final { scan-assembler "msa_shf_w.*:.*shf\\.w.*msa_shf_w" } } */
+/* { dg-final { scan-assembler "msa_sld_b.*:.*sld\\.b.*msa_sld_b" } } */
+/* { dg-final { scan-assembler "msa_sld_h.*:.*sld\\.h.*msa_sld_h" } } */
+/* { dg-final { scan-assembler "msa_sld_w.*:.*sld\\.w.*msa_sld_w" } } */
+/* { dg-final { scan-assembler "msa_sld_d.*:.*sld\\.d.*msa_sld_d" } } */
+/* { dg-final { scan-assembler "msa_sldi_b.*:.*sldi\\.b.*msa_sldi_b" } } */
+/* { dg-final { scan-assembler "msa_sldi_h.*:.*sldi\\.h.*msa_sldi_h" } } */
+/* { dg-final { scan-assembler "msa_sldi_w.*:.*sldi\\.w.*msa_sldi_w" } } */
+/* { dg-final { scan-assembler "msa_sldi_d.*:.*sldi\\.d.*msa_sldi_d" } } */
+/* { dg-final { scan-assembler "msa_vshf_b.*:.*vshf\\.b.*msa_vshf_b" } } */
+/* { dg-final { scan-assembler "msa_vshf_h.*:.*vshf\\.h.*msa_vshf_h" } } */
+/* { dg-final { scan-assembler "msa_vshf_w.*:.*vshf\\.w.*msa_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_vshf_d.*:.*vshf\\.d.*msa_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_s_vshf_b.*:.*vshf.b.*msa_gcc_1_s_vshf_b" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_s_vshf_h.*:.*vshf.h.*msa_gcc_1_s_vshf_h" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_s_vshf_w.*:.*vshf.w.*msa_gcc_1_s_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_s_vshf_d.*:.*vshf.d.*msa_gcc_1_s_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_u_vshf_b.*:.*vshf.b.*msa_gcc_1_u_vshf_b" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_u_vshf_h.*:.*vshf.h.*msa_gcc_1_u_vshf_h" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_u_vshf_w.*:.*vshf.w.*msa_gcc_1_u_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_1_u_vshf_d.*:.*vshf.d.*msa_gcc_1_u_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_s_vshf_b.*:.*vshf.b.*msa_gcc_2_s_vshf_b" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_s_vshf_h.*:.*vshf.h.*msa_gcc_2_s_vshf_h" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_s_vshf_w.*:.*vshf.w.*msa_gcc_2_s_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_s_vshf_d.*:.*vshf.d.*msa_gcc_2_s_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_u_vshf_b.*:.*vshf.b.*msa_gcc_2_u_vshf_b" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_u_vshf_h.*:.*vshf.h.*msa_gcc_2_u_vshf_h" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_u_vshf_w.*:.*vshf.w.*msa_gcc_2_u_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_2_u_vshf_d.*:.*vshf.d.*msa_gcc_2_u_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_3_vshf_w.*:.*vshf.w.*msa_gcc_3_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_3_vshf_d.*:.*vshf.d.*msa_gcc_3_vshf_d" } } */
+/* { dg-final { scan-assembler "msa_gcc_4_vshf_w.*:.*vshf.w.*msa_gcc_4_vshf_w" } } */
+/* { dg-final { scan-assembler "msa_gcc_4_vshf_d.*:.*vshf.d.*msa_gcc_4_vshf_d" } } */
+
+#include <msa.h>
+
+#define U5MAX 31
+#define U8MAX 255
+#define S5MAX 15
+
+#define v16i8_DF b
+#define v8i16_DF h
+#define v4i32_DF w
+#define v2i64_DF d
+#define v16u8_DF b
+#define v8u16_DF h
+#define v4u32_DF w
+#define v2u64_DF d
+#define v4f32_DF w
+#define v2f64_DF d
+
+#define v16i8_DBL v8i16
+#define v8i16_DBL v4i32
+#define v4i32_DBL v2i64
+#define v16u8_DBL v8u16
+#define v8u16_DBL v4u32
+#define v4u32_DBL v2u64
+
+#define v16i8_DDF h
+#define v8i16_DDF w
+#define v4i32_DDF d
+#define v16u8_DDF h
+#define v8u16_DDF w
+#define v4u32_DDF d
+
+#define v4f32_HDF h
+#define v2f64_HDF w
+
+/* Signed result type of twice the element width (e.g. hsub_u of v16u8
+   operands yields v8i16). */
+#define v16u8_SDBL v8i16
+#define v8u16_SDBL v4i32
+#define v4u32_SDBL v2i64
+
+/* Signed counterparts of the unsigned types, used by the subsus_u_* and
+   subsuu_s_* builtins. */
+#define v16u8_S v16i8
+#define v8u16_S v8i16
+#define v4u32_S v4i32
+#define v2u64_S v2i64
+
+/* Integer elements for fexp2. */
+#define v4f32_FEXP2 v4i32
+#define v2f64_FEXP2 v2i64
+
+/* Argument/result types for the floating-point conversion instructions. */
+#define v4f32_FCNV v8i16
+#define v2f64_FCNV v4f32
+#define v4f32_FSINT v4i32
+#define v2f64_FSINT v2i64
+#define v4f32_FUINT v4u32
+#define v2f64_FUINT v2u64
+#define v4f32_FFP v8i16
+#define v2f64_FFP v4i32
+
+/* Integer result type for the floating-point class and compare operations. */
+#define v4f32_FRES v4i32
+#define v2f64_FRES v2i64
+
+/* Result type for the unsigned compare instructions. */
+#define v16u8_CMP v16i8
+#define v8u16_CMP v8i16
+#define v4u32_CMP v4i32
+#define v2u64_CMP v2i64
+
+#define PASTE_BUILTIN(NAME, DF) __builtin_msa_ ## NAME ## _ ## DF
+#define EVAL_BUILTIN(NAME, DF) PASTE_BUILTIN(NAME, DF)
+#define BUILTIN(NAME, DF) EVAL_BUILTIN(NAME, DF)
+
+#define FN_EVAL(NAME, T) msa_ ## NAME ## _ ## T
+#define FN(NAME, T) FN_EVAL(NAME, T)
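+
+/* Illustrative expansion, using the *_DF mappings above: in ADDV below, with
+   T = v16i8, FN (addv, T ## _DF) names the wrapper msa_addv_b and
+   BUILTIN (addv, T ## _DF) names __builtin_msa_addv_b.  The two-step
+   definitions (FN -> FN_EVAL, BUILTIN -> EVAL_BUILTIN -> PASTE_BUILTIN)
+   ensure the _DF argument is macro-expanded before ## pastes the tokens. */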
+
+/* MSA Arithmetic builtins. */
+#define ADDV(T) NOMIPS16 T FN(addv, T ## _DF) (T i, T j) { return BUILTIN(addv, T ## _DF) (i, j); }
+#define ADDVI(T) NOMIPS16 T FN(addvi, T ## _DF) (T i) { return BUILTIN(addvi, T ## _DF) (i, U5MAX); }
+#define ADD_A(T) NOMIPS16 T FN(add_a, T ## _DF) (T i, T j) { return BUILTIN(add_a, T ## _DF) (i, j); }
+#define ADDS_A(T) NOMIPS16 T FN(adds_a, T ## _DF) (T i, T j) { return BUILTIN(adds_a, T ## _DF) (i, j); }
+#define ADDS_S(T) NOMIPS16 T FN(adds_s, T ## _DF) (T i, T j) { return BUILTIN(adds_s, T ## _DF) (i, j); }
+#define ADDS_U(T) NOMIPS16 T FN(adds_u, T ## _DF) (T i, T j) { return BUILTIN(adds_u, T ## _DF) (i, j); }
+#define HADD_S(T) NOMIPS16 T ## _DBL FN(hadd_s, T ## _DDF) (T i, T j) { return BUILTIN(hadd_s, T ## _DDF) (i, j); }
+#define HADD_U(T) NOMIPS16 T ## _DBL FN(hadd_u, T ## _DDF) (T i, T j) { return BUILTIN(hadd_u, T ## _DDF) (i, j); }
+#define ASUB_S(T) NOMIPS16 T FN(asub_s, T ## _DF) (T i, T j) { return BUILTIN(asub_s, T ## _DF) (i, j); }
+#define ASUB_U(T) NOMIPS16 T FN(asub_u, T ## _DF) (T i, T j) { return BUILTIN(asub_u, T ## _DF) (i, j); }
+#define AVE_S(T) NOMIPS16 T FN(ave_s, T ## _DF) (T i, T j) { return BUILTIN(ave_s, T ## _DF) (i, j); }
+#define AVE_U(T) NOMIPS16 T FN(ave_u, T ## _DF) (T i, T j) { return BUILTIN(ave_u, T ## _DF) (i, j); }
+#define AVER_S(T) NOMIPS16 T FN(aver_s, T ## _DF) (T i, T j) { return BUILTIN(aver_s, T ## _DF) (i, j); }
+#define AVER_U(T) NOMIPS16 T FN(aver_u, T ## _DF) (T i, T j) { return BUILTIN(aver_u, T ## _DF) (i, j); }
+#define DOTP_S(T) NOMIPS16 T ## _DBL FN(dotp_s, T ## _DDF) (T i, T j) { return BUILTIN(dotp_s, T ## _DDF) (i, j); }
+#define DOTP_U(T) NOMIPS16 T ## _DBL FN(dotp_u, T ## _DDF) (T i, T j) { return BUILTIN(dotp_u, T ## _DDF) (i, j); }
+#define DPADD_S(T) NOMIPS16 T ## _DBL FN(dpadd_s, T ## _DDF) (T ## _DBL i, T j, T k) { return BUILTIN(dpadd_s, T ## _DDF) (i, j, k); }
+#define DPADD_U(T) NOMIPS16 T ## _DBL FN(dpadd_u, T ## _DDF) (T ## _DBL i, T j, T k) { return BUILTIN(dpadd_u, T ## _DDF) (i, j, k); }
+#define DPSUB_S(T) NOMIPS16 T ## _DBL FN(dpsub_s, T ## _DDF) (T ## _DBL i, T j, T k) { return BUILTIN(dpsub_s, T ## _DDF) (i, j, k); }
+#define DPSUB_U(T) NOMIPS16 T ## _SDBL FN(dpsub_u, T ## _DDF) (T ## _SDBL i, T j, T k) { return BUILTIN(dpsub_u, T ## _DDF) (i, j, k); }
+#define DIV_S(T) NOMIPS16 T FN(div_s, T ## _DF) (T i, T j) { return BUILTIN(div_s, T ## _DF) (i, j); }
+#define DIV_U(T) NOMIPS16 T FN(div_u, T ## _DF) (T i, T j) { return BUILTIN(div_u, T ## _DF) (i, j); }
+#define MADDV(T) NOMIPS16 T FN(maddv, T ## _DF) (T i, T j, T k) { return BUILTIN(maddv, T ## _DF) (i, j, k); }
+#define MAX_A(T) NOMIPS16 T FN(max_a, T ## _DF) (T i, T j) { return BUILTIN(max_a, T ## _DF) (i, j); }
+#define MIN_A(T) NOMIPS16 T FN(min_a, T ## _DF) (T i, T j) { return BUILTIN(min_a, T ## _DF) (i, j); }
+#define MAX_S(T) NOMIPS16 T FN(max_s, T ## _DF) (T i, T j) { return BUILTIN(max_s, T ## _DF) (i, j); }
+#define MAXI_S(T) NOMIPS16 T FN(maxi_s, T ## _DF) (T i) { return BUILTIN(maxi_s, T ## _DF) (i, S5MAX); }
+#define MAX_U(T) NOMIPS16 T FN(max_u, T ## _DF) (T i, T j) { return BUILTIN(max_u, T ## _DF) (i, j); }
+#define MAXI_U(T) NOMIPS16 T FN(maxi_u, T ## _DF) (T i) { return BUILTIN(maxi_u, T ## _DF) (i, S5MAX); }
+#define MIN_S(T) NOMIPS16 T FN(min_s, T ## _DF) (T i, T j) { return BUILTIN(min_s, T ## _DF) (i, j); }
+#define MINI_S(T) NOMIPS16 T FN(mini_s, T ## _DF) (T i) { return BUILTIN(mini_s, T ## _DF) (i, S5MAX); }
+#define MIN_U(T) NOMIPS16 T FN(min_u, T ## _DF) (T i, T j) { return BUILTIN(min_u, T ## _DF) (i, j); }
+#define MINI_U(T) NOMIPS16 T FN(mini_u, T ## _DF) (T i) { return BUILTIN(mini_u, T ## _DF) (i, S5MAX); }
+#define MSUBV(T) NOMIPS16 T FN(msubv, T ## _DF) (T i, T j, T k) { return BUILTIN(msubv, T ## _DF) (i, j, k); }
+#define MULV(T) NOMIPS16 T FN(mulv, T ## _DF) (T i, T j) { return BUILTIN(mulv, T ## _DF) (i, j); }
+#define MOD_S(T) NOMIPS16 T FN(mod_s, T ## _DF) (T i, T j) { return BUILTIN(mod_s, T ## _DF) (i, j); }
+#define MOD_U(T) NOMIPS16 T FN(mod_u, T ## _DF) (T i, T j) { return BUILTIN(mod_u, T ## _DF) (i, j); }
+#define SAT_S(T) NOMIPS16 T FN(sat_s, T ## _DF) (T i) { return BUILTIN(sat_s, T ## _DF) (i, 7); }
+#define SAT_U(T) NOMIPS16 T FN(sat_u, T ## _DF) (T i) { return BUILTIN(sat_u, T ## _DF) (i, 7); }
+#define SUBS_S(T) NOMIPS16 T FN(subs_s, T ## _DF) (T i, T j) { return BUILTIN(subs_s, T ## _DF) (i, j); }
+#define SUBS_U(T) NOMIPS16 T FN(subs_u, T ## _DF) (T i, T j) { return BUILTIN(subs_u, T ## _DF) (i, j); }
+#define HSUB_S(T) NOMIPS16 T ## _DBL FN(hsub_s, T ## _DDF) (T i, T j) { return BUILTIN(hsub_s, T ## _DDF) (i, j); }
+#define HSUB_U(T) NOMIPS16 T ## _SDBL FN(hsub_u, T ## _DDF) (T i, T j) { return BUILTIN(hsub_u, T ## _DDF) (i, j); }
+#define SUBSUU_S(T) NOMIPS16 T ## _S FN(subsuu_s, T ## _DF) (T i, T j) { return BUILTIN(subsuu_s, T ## _DF) (i, j); }
+#define SUBSUS_U(T) NOMIPS16 T FN(subsus_u, T ## _DF) (T i, T ## _S j) { return BUILTIN(subsus_u, T ## _DF) (i, j); }
+#define SUBV(T) NOMIPS16 T FN(subv, T ## _DF) (T i, T j) { return BUILTIN(subv, T ## _DF) (i, j); }
+#define SUBVI(T) NOMIPS16 T FN(subvi, T ## _DF) (T i) { return BUILTIN(subvi, T ## _DF) (i, U5MAX); }
+
+/* MSA Bitwise builtins. */
+#define AND(T) NOMIPS16 T FN(and, v) (T i, T j) { return BUILTIN(and, v) (i, j); }
+#define ANDI(T) NOMIPS16 T FN(andi, T ## _DF) (T i) { return BUILTIN(andi, T ## _DF) (i, U8MAX); }
+#define BCLR(T) NOMIPS16 T FN(bclr, T ## _DF) (T i, T j) { return BUILTIN(bclr, T ## _DF) (i, j); }
+#define BCLRI(T) NOMIPS16 T FN(bclri, T ## _DF) (T i) { return BUILTIN(bclri, T ## _DF) (i, 0); }
+#define BINSL(T) NOMIPS16 T FN(binsl, T ## _DF) (T i, T j, T k) { return BUILTIN(binsl, T ## _DF) (i, j, k); }
+#define BINSLI(T) NOMIPS16 T FN(binsli, T ## _DF) (T i, T j) { return BUILTIN(binsli, T ## _DF) (i, j, 0); }
+#define BINSR(T) NOMIPS16 T FN(binsr, T ## _DF) (T i, T j, T k) { return BUILTIN(binsr, T ## _DF) (i, j, k); }
+#define BINSRI(T) NOMIPS16 T FN(binsri, T ## _DF) (T i, T j) { return BUILTIN(binsri, T ## _DF) (i, j, 0); }
+#define BMNZ(T) NOMIPS16 T FN(bmnz, v) (T i, T j, T k) { return BUILTIN(bmnz, v) (i, j, k); }
+#define BMNZI(T) NOMIPS16 T FN(bmnzi, T ## _DF) (T i, T j) { return BUILTIN(bmnzi, T ## _DF) (i, j, U8MAX); }
+#define BMZ(T) NOMIPS16 T FN(bmz, v) (T i, T j, T k) { return BUILTIN(bmz, v) (i, j, k); }
+#define BMZI(T) NOMIPS16 T FN(bmzi, T ## _DF) (T i, T j) { return BUILTIN(bmzi, T ## _DF) (i, j, U8MAX); }
+#define BNEG(T) NOMIPS16 T FN(bneg, T ## _DF) (T i, T j) { return BUILTIN(bneg, T ## _DF) (i, j); }
+#define BNEGI(T) NOMIPS16 T FN(bnegi, T ## _DF) (T i) { return BUILTIN(bnegi, T ## _DF) (i, 0); }
+#define BSEL(T) NOMIPS16 T FN(bsel, v) (T i, T j, T k) { return BUILTIN(bsel, v) (i, j, k); }
+#define BSELI(T) NOMIPS16 T FN(bseli, T ## _DF) (T i, T j) { return BUILTIN(bseli, T ## _DF) (i, j, U8MAX); }
+#define BSET(T) NOMIPS16 T FN(bset, T ## _DF) (T i, T j) { return BUILTIN(bset, T ## _DF) (i, j); }
+#define BSETI(T) NOMIPS16 T FN(bseti, T ## _DF) (T i) { return BUILTIN(bseti, T ## _DF) (i, 0); }
+#define NLOC(T) NOMIPS16 T FN(nloc, T ## _DF) (T i) { return BUILTIN(nloc, T ## _DF) (i); }
+#define NLZC(T) NOMIPS16 T FN(nlzc, T ## _DF) (T i) { return BUILTIN(nlzc, T ## _DF) (i); }
+#define NOR(T) NOMIPS16 T FN(nor, v) (T i, T j) { return BUILTIN(nor, v) (i, j); }
+#define NORI(T) NOMIPS16 T FN(nori, T ## _DF) (T i) { return BUILTIN(nori, T ## _DF) (i, U8MAX); }
+#define PCNT(T) NOMIPS16 T FN(pcnt, T ## _DF) (T i) { return BUILTIN(pcnt, T ## _DF) (i); }
+#define OR(T) NOMIPS16 T FN(or, v) (T i, T j) { return BUILTIN(or, v) (i, j); }
+#define ORI(T) NOMIPS16 T FN(ori, T ## _DF) (T i) { return BUILTIN(ori, T ## _DF) (i, U8MAX); }
+#define XOR(T) NOMIPS16 T FN(xor, v) (T i, T j) { return BUILTIN(xor, v) (i, j); }
+#define XORI(T) NOMIPS16 T FN(xori, T ## _DF) (T i) { return BUILTIN(xori, T ## _DF) (i, U8MAX); }
+#define SLL(T) NOMIPS16 T FN(sll, T ## _DF) (T i, T j) { return BUILTIN(sll, T ## _DF) (i, j); }
+#define SLLI(T) NOMIPS16 T FN(slli, T ## _DF) (T i) { return BUILTIN(slli, T ## _DF) (i, 0); }
+#define SRA(T) NOMIPS16 T FN(sra, T ## _DF) (T i, T j) { return BUILTIN(sra, T ## _DF) (i, j); }
+#define SRAI(T) NOMIPS16 T FN(srai, T ## _DF) (T i) { return BUILTIN(srai, T ## _DF) (i, 0); }
+#define SRAR(T) NOMIPS16 T FN(srar, T ## _DF) (T i, T j) { return BUILTIN(srar, T ## _DF) (i, j); }
+#define SRARI(T) NOMIPS16 T FN(srari, T ## _DF) (T i) { return BUILTIN(srari, T ## _DF) (i, 0); }
+#define SRL(T) NOMIPS16 T FN(srl, T ## _DF) (T i, T j) { return BUILTIN(srl, T ## _DF) (i, j); }
+#define SRLI(T) NOMIPS16 T FN(srli, T ## _DF) (T i) { return BUILTIN(srli, T ## _DF) (i, 0); }
+#define SRLR(T) NOMIPS16 T FN(srlr, T ## _DF) (T i, T j) { return BUILTIN(srlr, T ## _DF) (i, j); }
+#define SRLRI(T) NOMIPS16 T FN(srlri, T ## _DF) (T i) { return BUILTIN(srlri, T ## _DF) (i, 0); }
+
+/* MSA Floating-Point Arithmetic builtins. */
+#define FADD(T) NOMIPS16 T FN(fadd, T ## _DF) (T i, T j) { return BUILTIN(fadd, T ## _DF) (i, j); }
+#define FDIV(T) NOMIPS16 T FN(fdiv, T ## _DF) (T i, T j) { return BUILTIN(fdiv, T ## _DF) (i, j); }
+#define FEXP2(T) NOMIPS16 T FN(fexp2, T ## _DF) (T i, T ## _FEXP2 j) { return BUILTIN(fexp2, T ## _DF) (i, j); }
+#define FLOG2(T) NOMIPS16 T FN(flog2, T ## _DF) (T i) { return BUILTIN(flog2, T ## _DF) (i); }
+#define FMADD(T) NOMIPS16 T FN(fmadd, T ## _DF) (T i, T j, T k) { return BUILTIN(fmadd, T ## _DF) (i, j, k); }
+#define FMSUB(T) NOMIPS16 T FN(fmsub, T ## _DF) (T i, T j, T k) { return BUILTIN(fmsub, T ## _DF) (i, j, k); }
+#define FMAX(T) NOMIPS16 T FN(fmax, T ## _DF) (T i, T j) { return BUILTIN(fmax, T ## _DF) (i, j); }
+#define FMIN(T) NOMIPS16 T FN(fmin, T ## _DF) (T i, T j) { return BUILTIN(fmin, T ## _DF) (i, j); }
+#define FMAX_A(T) NOMIPS16 T FN(fmax_a, T ## _DF) (T i, T j) { return BUILTIN(fmax_a, T ## _DF) (i, j); }
+#define FMIN_A(T) NOMIPS16 T FN(fmin_a, T ## _DF) (T i, T j) { return BUILTIN(fmin_a, T ## _DF) (i, j); }
+#define FMUL(T) NOMIPS16 T FN(fmul, T ## _DF) (T i, T j) { return BUILTIN(fmul, T ## _DF) (i, j); }
+#define FRCP(T) NOMIPS16 T FN(frcp, T ## _DF) (T i) { return BUILTIN(frcp, T ## _DF) (i); }
+#define FRINT(T) NOMIPS16 T FN(frint, T ## _DF) (T i) { return BUILTIN(frint, T ## _DF) (i); }
+#define FRSQRT(T) NOMIPS16 T FN(frsqrt, T ## _DF) (T i) { return BUILTIN(frsqrt, T ## _DF) (i); }
+#define FSQRT(T) NOMIPS16 T FN(fsqrt, T ## _DF) (T i) { return BUILTIN(fsqrt, T ## _DF) (i); }
+#define FSUB(T) NOMIPS16 T FN(fsub, T ## _DF) (T i, T j) { return BUILTIN(fsub, T ## _DF) (i, j); }
+
+/* MSA Floating-Point Compare builtins. */
+#define FCLASS(T) NOMIPS16 T ## _FRES FN(fclass, T ## _DF) (T i) { return BUILTIN(fclass, T ## _DF) (i); }
+#define FCAF(T) NOMIPS16 T ## _FRES FN(fcaf, T ## _DF) (T i, T j) { return BUILTIN(fcaf, T ## _DF) (i, j); }
+#define FCUN(T) NOMIPS16 T ## _FRES FN(fcun, T ## _DF) (T i, T j) { return BUILTIN(fcun, T ## _DF) (i, j); }
+#define FCOR(T) NOMIPS16 T ## _FRES FN(fcor, T ## _DF) (T i, T j) { return BUILTIN(fcor, T ## _DF) (i, j); }
+#define FCEQ(T) NOMIPS16 T ## _FRES FN(fceq, T ## _DF) (T i, T j) { return BUILTIN(fceq, T ## _DF) (i, j); }
+#define FCUNE(T) NOMIPS16 T ## _FRES FN(fcune, T ## _DF) (T i, T j) { return BUILTIN(fcune, T ## _DF) (i, j); }
+#define FCUEQ(T) NOMIPS16 T ## _FRES FN(fcueq, T ## _DF) (T i, T j) { return BUILTIN(fcueq, T ## _DF) (i, j); }
+#define FCNE(T) NOMIPS16 T ## _FRES FN(fcne, T ## _DF) (T i, T j) { return BUILTIN(fcne, T ## _DF) (i, j); }
+#define FCLT(T) NOMIPS16 T ## _FRES FN(fclt, T ## _DF) (T i, T j) { return BUILTIN(fclt, T ## _DF) (i, j); }
+#define FCULT(T) NOMIPS16 T ## _FRES FN(fcult, T ## _DF) (T i, T j) { return BUILTIN(fcult, T ## _DF) (i, j); }
+#define FCLE(T) NOMIPS16 T ## _FRES FN(fcle, T ## _DF) (T i, T j) { return BUILTIN(fcle, T ## _DF) (i, j); }
+#define FCULE(T) NOMIPS16 T ## _FRES FN(fcule, T ## _DF) (T i, T j) { return BUILTIN(fcule, T ## _DF) (i, j); }
+#define FSAF(T) NOMIPS16 T ## _FRES FN(fsaf, T ## _DF) (T i, T j) { return BUILTIN(fsaf, T ## _DF) (i, j); }
+#define FSUN(T) NOMIPS16 T ## _FRES FN(fsun, T ## _DF) (T i, T j) { return BUILTIN(fsun, T ## _DF) (i, j); }
+#define FSOR(T) NOMIPS16 T ## _FRES FN(fsor, T ## _DF) (T i, T j) { return BUILTIN(fsor, T ## _DF) (i, j); }
+#define FSEQ(T) NOMIPS16 T ## _FRES FN(fseq, T ## _DF) (T i, T j) { return BUILTIN(fseq, T ## _DF) (i, j); }
+#define FSUNE(T) NOMIPS16 T ## _FRES FN(fsune, T ## _DF) (T i, T j) { return BUILTIN(fsune, T ## _DF) (i, j); }
+#define FSUEQ(T) NOMIPS16 T ## _FRES FN(fsueq, T ## _DF) (T i, T j) { return BUILTIN(fsueq, T ## _DF) (i, j); }
+#define FSNE(T) NOMIPS16 T ## _FRES FN(fsne, T ## _DF) (T i, T j) { return BUILTIN(fsne, T ## _DF) (i, j); }
+#define FSLT(T) NOMIPS16 T ## _FRES FN(fslt, T ## _DF) (T i, T j) { return BUILTIN(fslt, T ## _DF) (i, j); }
+#define FSULT(T) NOMIPS16 T ## _FRES FN(fsult, T ## _DF) (T i, T j) { return BUILTIN(fsult, T ## _DF) (i, j); }
+#define FSLE(T) NOMIPS16 T ## _FRES FN(fsle, T ## _DF) (T i, T j) { return BUILTIN(fsle, T ## _DF) (i, j); }
+#define FSULE(T) NOMIPS16 T ## _FRES FN(fsule, T ## _DF) (T i, T j) { return BUILTIN(fsule, T ## _DF) (i, j); }
+
+/* MSA Floating-Point Conversion builtins. */
+#define FEXUPL(T) NOMIPS16 T FN(fexupl, T ## _DF) (T ## _FCNV i) { return BUILTIN(fexupl, T ## _DF) (i); }
+#define FEXUPR(T) NOMIPS16 T FN(fexupr, T ## _DF) (T ## _FCNV i) { return BUILTIN(fexupr, T ## _DF) (i); }
+#define FEXDO(T) NOMIPS16 T ## _FCNV FN(fexdo, T ## _HDF) (T i, T j) { return BUILTIN(fexdo, T ## _HDF) (i, j); }
+#define FFINT_S(T) NOMIPS16 T FN(ffint_s, T ## _DF) (T ## _FSINT i) { return BUILTIN(ffint_s, T ## _DF) (i); }
+#define FFINT_U(T) NOMIPS16 T FN(ffint_u, T ## _DF) (T ## _FUINT i) { return BUILTIN(ffint_u, T ## _DF) (i); }
+#define FFQL(T) NOMIPS16 T FN(ffql, T ## _DF) (T ## _FFP i) { return BUILTIN(ffql, T ## _DF) (i); }
+#define FFQR(T) NOMIPS16 T FN(ffqr, T ## _DF) (T ## _FFP i) { return BUILTIN(ffqr, T ## _DF) (i); }
+#define FTINT_S(T) NOMIPS16 T ## _FSINT FN(ftint_s, T ## _DF) (T i) { return BUILTIN(ftint_s, T ## _DF) (i); }
+#define FTINT_U(T) NOMIPS16 T ## _FUINT FN(ftint_u, T ## _DF) (T i) { return BUILTIN(ftint_u, T ## _DF) (i); }
+#define FTRUNC_S(T) NOMIPS16 T ## _FSINT FN(ftrunc_s, T ## _DF) (T i) { return BUILTIN(ftrunc_s, T ## _DF) (i); }
+#define FTRUNC_U(T) NOMIPS16 T ## _FUINT FN(ftrunc_u, T ## _DF) (T i) { return BUILTIN(ftrunc_u, T ## _DF) (i); }
+#define FTQ(T) NOMIPS16 T ## _FFP FN(ftq, T ## _HDF) (T i, T j) { return BUILTIN(ftq, T ## _HDF) (i, j); }
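+
+/* For instance, FEXDO (v4f32) defines msa_fexdo_h, returning v4f32_FCNV
+   (i.e. v8i16), which is expected to match the fexdo.h pattern above. */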
+
+/* MSA Fixed-Point Multiplication builtins. */
+#define MADD_Q(T) NOMIPS16 T ## _FFP FN(madd_q, T ## _HDF) (T ## _FFP i, T ## _FFP j, T ## _FFP k) { return BUILTIN(madd_q, T ## _HDF) (i, j, k); }
+#define MADDR_Q(T) NOMIPS16 T ## _FFP FN(maddr_q, T ## _HDF) (T ## _FFP i, T ## _FFP j, T ## _FFP k) { return BUILTIN(maddr_q, T ## _HDF) (i, j, k); }
+#define MSUB_Q(T) NOMIPS16 T ## _FFP FN(msub_q, T ## _HDF) (T ## _FFP i, T ## _FFP j, T ## _FFP k) { return BUILTIN(msub_q, T ## _HDF) (i, j, k); }
+#define MSUBR_Q(T) NOMIPS16 T ## _FFP FN(msubr_q, T ## _HDF) (T ## _FFP i, T ## _FFP j, T ## _FFP k) { return BUILTIN(msubr_q, T ## _HDF) (i, j, k); }
+#define MUL_Q(T) NOMIPS16 T ## _FFP FN(mul_q, T ## _HDF) (T ## _FFP i, T ## _FFP j) { return BUILTIN(mul_q, T ## _HDF) (i, j); }
+#define MULR_Q(T) NOMIPS16 T ## _FFP FN(mulr_q, T ## _HDF) (T ## _FFP i, T ## _FFP j) { return BUILTIN(mulr_q, T ## _HDF) (i, j); }
+
+/* MSA Compare builtins. */
+#define CEQ(T) NOMIPS16 T FN(ceq, T ## _DF) (T i, T j) { return BUILTIN(ceq, T ## _DF) (i, j); }
+#define CEQI(T) NOMIPS16 T FN(ceqi, T ## _DF) (T i) { return BUILTIN(ceqi, T ## _DF) (i, 0); }
+#define CLE_S(T) NOMIPS16 T FN(cle_s, T ## _DF) (T i, T j) { return BUILTIN(cle_s, T ## _DF) (i, j); }
+#define CLEI_S(T) NOMIPS16 T FN(clei_s, T ## _DF) (T i) { return BUILTIN(clei_s, T ## _DF) (i, 0); }
+#define CLE_U(T) NOMIPS16 T ## _CMP FN(cle_u, T ## _DF) (T i, T j) { return BUILTIN(cle_u, T ## _DF) (i, j); }
+#define CLEI_U(T) NOMIPS16 T ## _CMP FN(clei_u, T ## _DF) (T i) { return BUILTIN(clei_u, T ## _DF) (i, 10); }
+#define CLT_S(T) NOMIPS16 T FN(clt_s, T ## _DF) (T i, T j) { return BUILTIN(clt_s, T ## _DF) (i, j); }
+#define CLTI_S(T) NOMIPS16 T FN(clti_s, T ## _DF) (T i) { return BUILTIN(clti_s, T ## _DF) (i, 0); }
+#define CLT_U(T) NOMIPS16 T ## _CMP FN(clt_u, T ## _DF) (T i, T j) { return BUILTIN(clt_u, T ## _DF) (i, j); }
+#define CLTI_U(T) NOMIPS16 T ## _CMP FN(clti_u, T ## _DF) (T i) { return BUILTIN(clti_u, T ## _DF) (i, 0); }
+
+/* MSA Branch builtins. */
+#define BNZV(T) NOMIPS16 int FN(bnz, v) (T i) { return BUILTIN(bnz, v) (i); }
+#define BZV(T) NOMIPS16 int FN(bz, v) (T i) { return BUILTIN(bz, v) (i); }
+#define BNZ(T) NOMIPS16 int FN(bnz, T ## _DF) (T i) { return BUILTIN(bnz, T ## _DF) (i); }
+#define BZ(T) NOMIPS16 int FN(bz, T ## _DF) (T i) { return BUILTIN(bz, T ## _DF) (i); }
+
+/* MSA Load/Store and Move builtins. */
+#define CFCMSA() int msa_cfcmsa () { return __builtin_msa_cfcmsa(0x1f); }
+#define CTCMSA() void msa_ctcmsa (int i) { __builtin_msa_ctcmsa(0x1f, i); }
+#define LD(T) T FN(ld, T ## _DF) (char *i) { return BUILTIN(ld, T ## _DF) (i, 0); }
+#define LDI(T) T FN(ldi, T ## _DF) () { return BUILTIN(ldi, T ## _DF) (123); }
+#define MOVE(T) NOMIPS16 T FN(move, v) (T i) { return BUILTIN(move, v) (i); }
+#define SPLAT(T) T FN(splat, T ## _DF) (T i, int j) { return BUILTIN(splat, T ## _DF) (i, j); }
+#define SPLATI(T) T FN(splati, T ## _DF) (T i) { return BUILTIN(splati, T ## _DF) (i, 1); }
+#define FILL(T) T FN(fill, T ## _DF) (int i) { return BUILTIN(fill, T ## _DF) (i); }
+#define INSERT(T) T FN(insert, T ## _DF) (T i, int j) { return BUILTIN(insert, T ## _DF) (i, 1, j); }
+#define INSVE(T) T FN(insve, T ## _DF) (T i, T j) { return BUILTIN(insve, T ## _DF) (i, 1, j); }
+#define COPY_S(T) int FN(copy_s, T ## _DF) (T i) { return BUILTIN(copy_s, T ## _DF) (i, 1); }
+#define COPY_S_D(T) long long FN(copy_s, T ## _DF) (T i) { return BUILTIN(copy_s, T ## _DF) (i, 1); }
+#define COPY_U(T) unsigned int FN(copy_u, T ## _DF) (T i) { return BUILTIN(copy_u, T ## _DF) (i, 1); }
+#define COPY_U_D(T) unsigned long long FN(copy_u, T ## _DF) (T i) { return BUILTIN(copy_u, T ## _DF) (i, 1); }
+#define ST(T) void FN(st, T ## _DF) (T i, char *j) { BUILTIN(st, T ## _DF) (i, j, -64); }
+
+/* MSA Element Permute builtins. */
+#define ILVEV(T) NOMIPS16 T FN(ilvev, T ## _DF) (T i, T j) { return BUILTIN(ilvev, T ## _DF) (i, j); }
+#define ILVOD(T) NOMIPS16 T FN(ilvod, T ## _DF) (T i, T j) { return BUILTIN(ilvod, T ## _DF) (i, j); }
+#define ILVL(T) NOMIPS16 T FN(ilvl, T ## _DF) (T i, T j) { return BUILTIN(ilvl, T ## _DF) (i, j); }
+#define ILVR(T) NOMIPS16 T FN(ilvr, T ## _DF) (T i, T j) { return BUILTIN(ilvr, T ## _DF) (i, j); }
+#define PCKEV(T) NOMIPS16 T FN(pckev, T ## _DF) (T i, T j) { return BUILTIN(pckev, T ## _DF) (i, j); }
+#define PCKOD(T) NOMIPS16 T FN(pckod, T ## _DF) (T i, T j) { return BUILTIN(pckod, T ## _DF) (i, j); }
+#define SHF(T) NOMIPS16 T FN(shf, T ## _DF) (T i) { return BUILTIN(shf, T ## _DF) (i, 127); }
+#define SLD(T) NOMIPS16 T FN(sld, T ## _DF) (T i, T j, int k) { return BUILTIN(sld, T ## _DF) (i, j, k); }
+#define SLDI(T) NOMIPS16 T FN(sldi, T ## _DF) (T i, T j) { return BUILTIN(sldi, T ## _DF) (i, j, 1); }
+#define VSHF(T) NOMIPS16 T FN(vshf, T ## _DF) (T i, T j, T k) { return BUILTIN(vshf, T ## _DF) (i, j, k); }
+
+/* GCC builtins that generate MSA instructions. */
+#define SHUFFLE1_S(T) T FN(gcc_1_s_vshf, T ## _DF) (T i, T mask) { return __builtin_shuffle (i, mask); }
+#define SHUFFLE1_U(T) T FN(gcc_1_u_vshf, T ## _DF) (T i, T mask) { return __builtin_shuffle (i, mask); }
+#define SHUFFLE2_S(T) T FN(gcc_2_s_vshf, T ## _DF) (T i, T j, T mask) { return __builtin_shuffle (i, j, mask); }
+#define SHUFFLE2_U(T) T FN(gcc_2_u_vshf, T ## _DF) (T i, T j, T mask) { return __builtin_shuffle (i, j, mask); }
+#define REAL_SHUFFLE1(T, MASK_T) T FN(gcc_3_vshf, T ## _DF) (T i, MASK_T mask) { return __builtin_shuffle (i, mask); }
+#define REAL_SHUFFLE2(T, MASK_T) T FN(gcc_4_vshf, T ## _DF) (T i, T j, MASK_T mask) { return __builtin_shuffle (i, j, mask); }
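+
+/* The __builtin_shuffle wrappers above do not name an MSA builtin directly;
+   the msa_gcc_*_vshf_* scan-assembler patterns near the top of this file
+   expect the compiler to select a vshf.{b,h,w,d} instruction for them. */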
+
+#define ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(FUNC) \
+ FUNC (v16i8) \
+ FUNC (v8i16) \
+ FUNC (v4i32) \
+ FUNC (v2i64)
+
+#define ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2(FUNC) \
+ FUNC (v16i8) \
+ FUNC (v8i16) \
+ FUNC (v4i32)
+
+#define ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(FUNC) \
+ FUNC (v16u8) \
+ FUNC (v8u16) \
+ FUNC (v4u32) \
+ FUNC (v2u64)
+
+#define ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2(FUNC) \
+ FUNC (v16u8) \
+ FUNC (v8u16) \
+ FUNC (v4u32)
+
+#define ITERATE_FOR_ALL_REAL_VECTOR_TYPES(FUNC) \
+ FUNC (v4f32) \
+  FUNC (v2f64)
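+
+/* For example, ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADDV) below expands
+   into the four wrappers msa_addv_b, msa_addv_h, msa_addv_w and msa_addv_d,
+   each calling the corresponding __builtin_msa_addv_* builtin checked by the
+   scan-assembler directives above. */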
+
+/* MSA Arithmetic builtins. */
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADDV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADDVI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADD_A)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADDS_A)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ADDS_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (ADDS_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (HADD_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2 (HADD_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ASUB_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (ASUB_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (AVE_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (AVE_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (AVER_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (AVER_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (DOTP_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2 (DOTP_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (DPADD_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2 (DPADD_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (DPSUB_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2 (DPSUB_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (DIV_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (DIV_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MADDV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MAX_A)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MIN_A)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MAX_S)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MAXI_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (MAX_U)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (MAXI_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MIN_S)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MINI_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (MIN_U)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (MINI_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MSUBV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MULV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (MOD_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (MOD_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SAT_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SAT_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SUBS_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SUBS_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (HSUB_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES_2 (HSUB_U)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SUBSUU_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SUBSUS_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SUBV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SUBVI)
+
+/* MSA Bitwise builtins. */
+AND (v16u8)
+ANDI (v16u8)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BCLR)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BCLRI)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BINSL)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BINSLI)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BINSR)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BINSRI)
+BMNZ (v16u8)
+BMNZI (v16u8)
+BMZ (v16u8)
+BMZI (v16u8)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BNEG)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BNEGI)
+BSEL (v16u8)
+BSELI (v16u8)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BSET)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BSETI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (NLOC)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (NLZC)
+NOR (v16u8)
+NORI (v16u8)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (PCNT)
+OR (v16u8)
+ORI (v16u8)
+XOR (v16u8)
+XORI (v16u8)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SLL)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SLLI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRA)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRAI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRAR)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRARI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRL)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRLI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRLR)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SRLRI)
+
+/* MSA Floating-Point Arithmetic builtins. */
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FADD)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FDIV)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FEXP2)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FLOG2)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMADD)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMSUB)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMAX)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMIN)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMAX_A)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMIN_A)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FMUL)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FRCP)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FRINT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FRSQRT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSQRT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSUB)
+
+/* MSA Floating-Point Compare builtins. */
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCLASS)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCAF)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCUN)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCOR)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCEQ)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCUNE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCUEQ)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCNE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCLT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCULT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCLE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FCULE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSAF)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSUN)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSOR)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSEQ)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSUNE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSUEQ)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSNE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSLT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSULT)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSLE)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FSULE)
+
+/* MSA Floating-Point Conversion builtins. */
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FEXUPL)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FEXUPR)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FEXDO)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FFINT_S)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FFINT_U)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FFQL)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FFQR)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FTINT_S)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FTINT_U)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FTRUNC_S)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FTRUNC_U)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (FTQ)
+
+/* MSA Fixed-Point Multiplication builtins. */
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MADD_Q)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MADDR_Q)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MSUB_Q)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MSUBR_Q)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MUL_Q)
+ITERATE_FOR_ALL_REAL_VECTOR_TYPES (MULR_Q)
+
+/* MSA Compare builtins. */
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CEQ)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CEQI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CLE_S)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CLEI_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (CLE_U)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (CLEI_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CLT_S)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (CLTI_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (CLT_U)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (CLTI_U)
+
+/* MSA Branch builtins. */
+BNZV (v16u8)
+BZV (v16u8)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BNZ)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (BZ)
+
+/* MSA Load/Store and Move builtins. */
+CFCMSA ()
+CTCMSA ()
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (LD)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (LDI)
+MOVE (v16i8)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SPLAT)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SPLATI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (FILL)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (INSERT)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (INSVE)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (COPY_S)
+COPY_S_D (v2i64)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (COPY_U)
+COPY_U_D (v2i64)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ST)
+
+/* MSA Element Permute builtins. */
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ILVEV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ILVOD)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ILVL)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (ILVR)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (PCKEV)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (PCKOD)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES_2 (SHF)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SLD)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SLDI)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (VSHF)
+
+/* GCC builtins. */
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SHUFFLE1_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SHUFFLE1_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES (SHUFFLE2_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES (SHUFFLE2_U)
+REAL_SHUFFLE1 (v2f64, v2i64)
+REAL_SHUFFLE2 (v2f64, v2i64)
+REAL_SHUFFLE1 (v4f32, v4i32)
+REAL_SHUFFLE2 (v4f32, v4i32)
diff --git a/gcc/testsuite/gcc.target/mips/msa.c b/gcc/testsuite/gcc.target/mips/msa.c
new file mode 100644
index 00000000000..bd840c20bbe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/msa.c
@@ -0,0 +1,630 @@
+/* Test MIPS MSA ASE instructions.  */
+/* { dg-do compile } */
+/* { dg-options "-mfp64 -mhard-float -mmsa" } */
+/* { dg-skip-if "madd and msub need combine" { *-*-* } { "-O0" "-flto" } { "" } } */
+
+/* { dg-final { scan-assembler-times "\t.comm\tv16i8_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv8i16_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv4i32_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv2i64_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv16u8_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv8u16_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv4u32_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv2u64_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv4f32_\\d+,16,16" 3 } } */
+/* { dg-final { scan-assembler-times "\t.comm\tv2f64_\\d+,16,16" 3 } } */
+
+/* { dg-final { scan-assembler "test0_v16i8.*:.*v16i8_0.*test0_v16i8" } } */
+/* { dg-final { scan-assembler "test0_v8i16.*:.*v8i16_0.*test0_v8i16" } } */
+/* { dg-final { scan-assembler "test0_v4i32.*:.*v4i32_0.*test0_v4i32" } } */
+/* { dg-final { scan-assembler "test0_v2i64.*:.*v2i64_0.*test0_v2i64" } } */
+/* { dg-final { scan-assembler "test0_v16u8.*:.*v16u8_0.*test0_v16u8" } } */
+/* { dg-final { scan-assembler "test0_v8u16.*:.*v8u16_0.*test0_v8u16" } } */
+/* { dg-final { scan-assembler "test0_v4u32.*:.*v4u32_0.*test0_v4u32" } } */
+/* { dg-final { scan-assembler "test0_v2u64.*:.*v2u64_0.*test0_v2u64" } } */
+/* { dg-final { scan-assembler "test0_v4f32.*:.*v4f32_0.*test0_v4f32" } } */
+/* { dg-final { scan-assembler "test0_v2f64.*:.*v2f64_0.*test0_v2f64" } } */
+/* { dg-final { scan-assembler "test1_v16i8.*:.*st.b.*test1_v16i8" } } */
+/* { dg-final { scan-assembler "test1_v8i16.*:.*st.h.*test1_v8i16" } } */
+/* { dg-final { scan-assembler "test1_v4i32.*:.*st.w.*test1_v4i32" } } */
+/* { dg-final { scan-assembler "test1_v2i64.*:.*st.d.*test1_v2i64" } } */
+/* { dg-final { scan-assembler "test1_v16u8.*:.*st.b.*test1_v16u8" } } */
+/* { dg-final { scan-assembler "test1_v8u16.*:.*st.h.*test1_v8u16" } } */
+/* { dg-final { scan-assembler "test1_v4u32.*:.*st.w.*test1_v4u32" } } */
+/* { dg-final { scan-assembler "test1_v2u64.*:.*st.d.*test1_v2u64" } } */
+/* { dg-final { scan-assembler "test1_v4f32.*:.*st.w.*test1_v4f32" } } */
+/* { dg-final { scan-assembler "test1_v2f64.*:.*st.d.*test1_v2f64" } } */
+/* { dg-final { scan-assembler "test2_v16i8.*:.*addv.b.*test2_v16i8" } } */
+/* { dg-final { scan-assembler "test2_v8i16.*:.*addv.h.*test2_v8i16" } } */
+/* { dg-final { scan-assembler "test2_v4i32.*:.*addv.w.*test2_v4i32" } } */
+/* { dg-final { scan-assembler "test2_v2i64.*:.*addv.d.*test2_v2i64" } } */
+/* { dg-final { scan-assembler "test2_v16u8.*:.*addv.b.*test2_v16u8" } } */
+/* { dg-final { scan-assembler "test2_v8u16.*:.*addv.h.*test2_v8u16" } } */
+/* { dg-final { scan-assembler "test2_v4u32.*:.*addv.w.*test2_v4u32" } } */
+/* { dg-final { scan-assembler "test2_v2u64.*:.*addv.d.*test2_v2u64" } } */
+/* { dg-final { scan-assembler "test2_v4f32.*:.*fadd.w.*test2_v4f32" } } */
+/* { dg-final { scan-assembler "test2_v2f64.*:.*fadd.d.*test2_v2f64" } } */
+/* { dg-final { scan-assembler "test3_v16i8.*:.*subv.b.*test3_v16i8" } } */
+/* { dg-final { scan-assembler "test3_v8i16.*:.*subv.h.*test3_v8i16" } } */
+/* { dg-final { scan-assembler "test3_v4i32.*:.*subv.w.*test3_v4i32" } } */
+/* { dg-final { scan-assembler "test3_v2i64.*:.*subv.d.*test3_v2i64" } } */
+/* { dg-final { scan-assembler "test3_v16u8.*:.*subv.b.*test3_v16u8" } } */
+/* { dg-final { scan-assembler "test3_v8u16.*:.*subv.h.*test3_v8u16" } } */
+/* { dg-final { scan-assembler "test3_v4u32.*:.*subv.w.*test3_v4u32" } } */
+/* { dg-final { scan-assembler "test3_v2u64.*:.*subv.d.*test3_v2u64" } } */
+/* { dg-final { scan-assembler "test3_v4f32.*:.*fsub.w.*test3_v4f32" } } */
+/* { dg-final { scan-assembler "test3_v2f64.*:.*fsub.d.*test3_v2f64" } } */
+/* { dg-final { scan-assembler "test4_v16i8.*:.*mulv.b.*test4_v16i8" } } */
+/* { dg-final { scan-assembler "test4_v8i16.*:.*mulv.h.*test4_v8i16" } } */
+/* { dg-final { scan-assembler "test4_v4i32.*:.*mulv.w.*test4_v4i32" } } */
+/* { dg-final { scan-assembler "test4_v2i64.*:.*mulv.d.*test4_v2i64" } } */
+/* { dg-final { scan-assembler "test4_v16u8.*:.*mulv.b.*test4_v16u8" } } */
+/* { dg-final { scan-assembler "test4_v8u16.*:.*mulv.h.*test4_v8u16" } } */
+/* { dg-final { scan-assembler "test4_v4u32.*:.*mulv.w.*test4_v4u32" } } */
+/* { dg-final { scan-assembler "test4_v2u64.*:.*mulv.d.*test4_v2u64" } } */
+/* { dg-final { scan-assembler "test4_v4f32.*:.*fmul.w.*test4_v4f32" } } */
+/* { dg-final { scan-assembler "test4_v2f64.*:.*fmul.d.*test4_v2f64" } } */
+/* { dg-final { scan-assembler "test5_v16i8.*:.*div_s.b.*test5_v16i8" } } */
+/* { dg-final { scan-assembler "test5_v8i16.*:.*div_s.h.*test5_v8i16" } } */
+/* { dg-final { scan-assembler "test5_v4i32.*:.*div_s.w.*test5_v4i32" } } */
+/* { dg-final { scan-assembler "test5_v2i64.*:.*div_s.d.*test5_v2i64" } } */
+/* { dg-final { scan-assembler "test5_v16u8.*:.*div_u.b.*test5_v16u8" } } */
+/* { dg-final { scan-assembler "test5_v8u16.*:.*div_u.h.*test5_v8u16" } } */
+/* { dg-final { scan-assembler "test5_v4u32.*:.*div_u.w.*test5_v4u32" } } */
+/* { dg-final { scan-assembler "test5_v2u64.*:.*div_u.d.*test5_v2u64" } } */
+/* { dg-final { scan-assembler "test5_v4f32.*:.*fdiv.w.*test5_v4f32" } } */
+/* { dg-final { scan-assembler "test5_v2f64.*:.*fdiv.d.*test5_v2f64" } } */
+/* { dg-final { scan-assembler "test6_v16i8.*:.*mod_s.b.*test6_v16i8" } } */
+/* { dg-final { scan-assembler "test6_v8i16.*:.*mod_s.h.*test6_v8i16" } } */
+/* { dg-final { scan-assembler "test6_v4i32.*:.*mod_s.w.*test6_v4i32" } } */
+/* { dg-final { scan-assembler "test6_v2i64.*:.*mod_s.d.*test6_v2i64" } } */
+/* { dg-final { scan-assembler "test6_v16u8.*:.*mod_u.b.*test6_v16u8" } } */
+/* { dg-final { scan-assembler "test6_v8u16.*:.*mod_u.h.*test6_v8u16" } } */
+/* { dg-final { scan-assembler "test6_v4u32.*:.*mod_u.w.*test6_v4u32" } } */
+/* { dg-final { scan-assembler "test6_v2u64.*:.*mod_u.d.*test6_v2u64" } } */
+/* { dg-final { scan-assembler "test7_v16i8.*:.*subv.b.*test7_v16i8" } } */
+/* { dg-final { scan-assembler "test7_v8i16.*:.*subv.h.*test7_v8i16" } } */
+/* { dg-final { scan-assembler "test7_v4i32.*:.*subv.w.*test7_v4i32" } } */
+/* { dg-final { scan-assembler "test7_v2i64.*:.*subv.d.*test7_v2i64" } } */
+/* { dg-final { scan-assembler "test7_v16u8.*:.*subv.b.*test7_v16u8" } } */
+/* { dg-final { scan-assembler "test7_v8u16.*:.*subv.h.*test7_v8u16" } } */
+/* { dg-final { scan-assembler "test7_v4u32.*:.*subv.w.*test7_v4u32" } } */
+/* { dg-final { scan-assembler "test7_v2u64.*:.*subv.d.*test7_v2u64" } } */
+/* { dg-final { scan-assembler "test7_v4f32.*:.*fsub.w.*test7_v4f32" } } */
+/* { dg-final { scan-assembler "test7_v2f64.*:.*fsub.d.*test7_v2f64" } } */
+/* { dg-final { scan-assembler "test8_v16i8.*:.*xor.v.*test8_v16i8" } } */
+/* { dg-final { scan-assembler "test8_v8i16.*:.*xor.v.*test8_v8i16" } } */
+/* { dg-final { scan-assembler "test8_v4i32.*:.*xor.v.*test8_v4i32" } } */
+/* { dg-final { scan-assembler "test8_v2i64.*:.*xor.v.*test8_v2i64" } } */
+/* { dg-final { scan-assembler "test8_v16u8.*:.*xor.v.*test8_v16u8" } } */
+/* { dg-final { scan-assembler "test8_v8u16.*:.*xor.v.*test8_v8u16" } } */
+/* { dg-final { scan-assembler "test8_v4u32.*:.*xor.v.*test8_v4u32" } } */
+/* { dg-final { scan-assembler "test8_v2u64.*:.*xor.v.*test8_v2u64" } } */
+/* { dg-final { scan-assembler "test9_v16i8.*:.*or.v.*test9_v16i8" } } */
+/* { dg-final { scan-assembler "test9_v8i16.*:.*or.v.*test9_v8i16" } } */
+/* { dg-final { scan-assembler "test9_v4i32.*:.*or.v.*test9_v4i32" } } */
+/* { dg-final { scan-assembler "test9_v2i64.*:.*or.v.*test9_v2i64" } } */
+/* { dg-final { scan-assembler "test9_v16u8.*:.*or.v.*test9_v16u8" } } */
+/* { dg-final { scan-assembler "test9_v8u16.*:.*or.v.*test9_v8u16" } } */
+/* { dg-final { scan-assembler "test9_v4u32.*:.*or.v.*test9_v4u32" } } */
+/* { dg-final { scan-assembler "test9_v2u64.*:.*or.v.*test9_v2u64" } } */
+/* { dg-final { scan-assembler "test10_v16i8.*:.*and.v.*test10_v16i8" } } */
+/* { dg-final { scan-assembler "test10_v8i16.*:.*and.v.*test10_v8i16" } } */
+/* { dg-final { scan-assembler "test10_v4i32.*:.*and.v.*test10_v4i32" } } */
+/* { dg-final { scan-assembler "test10_v2i64.*:.*and.v.*test10_v2i64" } } */
+/* { dg-final { scan-assembler "test10_v16u8.*:.*and.v.*test10_v16u8" } } */
+/* { dg-final { scan-assembler "test10_v8u16.*:.*and.v.*test10_v8u16" } } */
+/* { dg-final { scan-assembler "test10_v4u32.*:.*and.v.*test10_v4u32" } } */
+/* { dg-final { scan-assembler "test10_v2u64.*:.*and.v.*test10_v2u64" } } */
+/* { dg-final { scan-assembler "test11_v16i8.*:.*nor.v.*test11_v16i8" } } */
+/* { dg-final { scan-assembler "test11_v8i16.*:.*nor.v.*test11_v8i16" } } */
+/* { dg-final { scan-assembler "test11_v4i32.*:.*nor.v.*test11_v4i32" } } */
+/* { dg-final { scan-assembler "test11_v2i64.*:.*nor.v.*test11_v2i64" } } */
+/* { dg-final { scan-assembler "test11_v16u8.*:.*nor.v.*test11_v16u8" } } */
+/* { dg-final { scan-assembler "test11_v8u16.*:.*nor.v.*test11_v8u16" } } */
+/* { dg-final { scan-assembler "test11_v4u32.*:.*nor.v.*test11_v4u32" } } */
+/* { dg-final { scan-assembler "test11_v2u64.*:.*nor.v.*test11_v2u64" } } */
+/* { dg-final { scan-assembler "test12_v16i8.*:.*sra.b.*test12_v16i8" } } */
+/* { dg-final { scan-assembler "test12_v8i16.*:.*sra.h.*test12_v8i16" } } */
+/* { dg-final { scan-assembler "test12_v4i32.*:.*sra.w.*test12_v4i32" } } */
+/* { dg-final { scan-assembler "test12_v2i64.*:.*sra.d.*test12_v2i64" } } */
+/* { dg-final { scan-assembler "test12_v16u8.*:.*srl.b.*test12_v16u8" } } */
+/* { dg-final { scan-assembler "test12_v8u16.*:.*srl.h.*test12_v8u16" } } */
+/* { dg-final { scan-assembler "test12_v4u32.*:.*srl.w.*test12_v4u32" } } */
+/* { dg-final { scan-assembler "test12_v2u64.*:.*srl.d.*test12_v2u64" } } */
+/* { dg-final { scan-assembler "test13_v16i8.*:.*sll.b.*test13_v16i8" } } */
+/* { dg-final { scan-assembler "test13_v8i16.*:.*sll.h.*test13_v8i16" } } */
+/* { dg-final { scan-assembler "test13_v4i32.*:.*sll.w.*test13_v4i32" } } */
+/* { dg-final { scan-assembler "test13_v2i64.*:.*sll.d.*test13_v2i64" } } */
+/* { dg-final { scan-assembler "test13_v16u8.*:.*sll.b.*test13_v16u8" } } */
+/* { dg-final { scan-assembler "test13_v8u16.*:.*sll.h.*test13_v8u16" } } */
+/* { dg-final { scan-assembler "test13_v4u32.*:.*sll.w.*test13_v4u32" } } */
+/* { dg-final { scan-assembler "test13_v2u64.*:.*sll.d.*test13_v2u64" } } */
+/* { dg-final { scan-assembler "test14_v16i8.*:.*ceq.b.*test14_v16i8" } } */
+/* { dg-final { scan-assembler "test14_v8i16.*:.*ceq.h.*test14_v8i16" } } */
+/* { dg-final { scan-assembler "test14_v4i32.*:.*ceq.w.*test14_v4i32" } } */
+/* { dg-final { scan-assembler "test14_v2i64.*:.*ceq.d.*test14_v2i64" } } */
+/* { dg-final { scan-assembler "test14_v16u8.*:.*ceq.b.*test14_v16u8" } } */
+/* { dg-final { scan-assembler "test14_v8u16.*:.*ceq.h.*test14_v8u16" } } */
+/* { dg-final { scan-assembler "test14_v4u32.*:.*ceq.w.*test14_v4u32" } } */
+/* { dg-final { scan-assembler "test14_v2u64.*:.*ceq.d.*test14_v2u64" } } */
+/* { dg-final { scan-assembler "test14_v4f32.*:.*fceq.w.*test14_v4f32" } } */
+/* { dg-final { scan-assembler "test14_v2f64.*:.*fceq.d.*test14_v2f64" } } */
+/* { dg-final { scan-assembler "test15_v16i8.*:.*ceq.b.*nor.v.*test15_v16i8" } } */
+/* { dg-final { scan-assembler "test15_v8i16.*:.*ceq.h.*nor.v.*test15_v8i16" } } */
+/* { dg-final { scan-assembler "test15_v4i32.*:.*ceq.w.*nor.v.*test15_v4i32" } } */
+/* { dg-final { scan-assembler "test15_v2i64.*:.*ceq.d.*nor.v.*test15_v2i64" } } */
+/* { dg-final { scan-assembler "test15_v16u8.*:.*ceq.b.*nor.v.*test15_v16u8" } } */
+/* { dg-final { scan-assembler "test15_v8u16.*:.*ceq.h.*nor.v.*test15_v8u16" } } */
+/* { dg-final { scan-assembler "test15_v4u32.*:.*ceq.w.*nor.v.*test15_v4u32" } } */
+/* { dg-final { scan-assembler "test15_v2u64.*:.*ceq.d.*nor.v.*test15_v2u64" } } */
+/* { dg-final { scan-assembler "test15_v4f32.*:.*fcne.w.*test15_v4f32" } } */
+/* { dg-final { scan-assembler "test15_v2f64.*:.*fcne.d.*test15_v2f64" } } */
+/* { dg-final { scan-assembler "test16_v16i8.*:.*clt_s.b.*test16_v16i8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v8i16.*:.*clt_s.h.*test16_v8i16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v4i32.*:.*clt_s.w.*test16_v4i32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v2i64.*:.*clt_s.d.*test16_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v16u8.*:.*clt_u.b.*test16_v16u8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v8u16.*:.*clt_u.h.*test16_v8u16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v4u32.*:.*clt_u.w.*test16_v4u32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v2u64.*:.*clt_u.d.*test16_v2u64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v4f32.*:.*fslt.w.*test16_v4f32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v2f64.*:.*fslt.d.*test16_v2f64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test16_v16i8.*:.*clt_s.b.*test16_v16i8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v8i16.*:.*clt_s.h.*test16_v8i16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v4i32.*:.*clt_s.w.*test16_v4i32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v2i64.*:.*clt_s.d.*test16_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v16u8.*:.*clt_u.b.*test16_v16u8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v8u16.*:.*clt_u.h.*test16_v8u16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v4u32.*:.*clt_u.w.*test16_v4u32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v2u64.*:.*clt_u.d.*test16_v2u64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v4f32.*:.*fslt.w.*test16_v4f32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test16_v2f64.*:.*fslt.d.*test16_v2f64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v16i8.*:.*cle_s.b.*test17_v16i8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v8i16.*:.*cle_s.h.*test17_v8i16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v4i32.*:.*cle_s.w.*test17_v4i32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v2i64.*:.*cle_s.d.*test17_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v16u8.*:.*cle_u.b.*test17_v16u8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v8u16.*:.*cle_u.h.*test17_v8u16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v4u32.*:.*cle_u.w.*test17_v4u32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v2u64.*:.*cle_u.d.*test17_v2u64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v4f32.*:.*fsle.w.*test17_v4f32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v2f64.*:.*fsle.d.*test17_v2f64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test17_v16i8.*:.*cle_s.b.*test17_v16i8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v8i16.*:.*cle_s.h.*test17_v8i16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v4i32.*:.*cle_s.w.*test17_v4i32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v2i64.*:.*cle_s.d.*test17_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v16u8.*:.*cle_u.b.*test17_v16u8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v8u16.*:.*cle_u.h.*test17_v8u16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v4u32.*:.*cle_u.w.*test17_v4u32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v2u64.*:.*cle_u.d.*test17_v2u64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v4f32.*:.*fsle.w.*test17_v4f32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test17_v2f64.*:.*fsle.d.*test17_v2f64" { target {! mips64 } } } } */
+/* Note: For reversed comparisons the same compare instruction is used, with the operands swapped. */
+/* { dg-final { scan-assembler "test18_v16i8.*:.*clt_s.b.*test18_v16i8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v8i16.*:.*clt_s.h.*test18_v8i16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v4i32.*:.*clt_s.w.*test18_v4i32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v2i64.*:.*clt_s.d.*test18_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v16u8.*:.*clt_u.b.*test18_v16u8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v8u16.*:.*clt_u.h.*test18_v8u16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v4u32.*:.*clt_u.w.*test18_v4u32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v2u64.*:.*clt_u.d.*test18_v2u64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v4f32.*:.*fslt.w.*test18_v4f32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v2f64.*:.*fslt.d.*test18_v2f64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test18_v16i8.*:.*clt_s.b.*test18_v16i8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v8i16.*:.*clt_s.h.*test18_v8i16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v4i32.*:.*clt_s.w.*test18_v4i32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v2i64.*:.*clt_s.d.*test18_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v16u8.*:.*clt_u.b.*test18_v16u8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v8u16.*:.*clt_u.h.*test18_v8u16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v4u32.*:.*clt_u.w.*test18_v4u32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v2u64.*:.*clt_u.d.*test18_v2u64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v4f32.*:.*fslt.w.*test18_v4f32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test18_v2f64.*:.*fslt.d.*test18_v2f64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v16i8.*:.*cle_s.b.*test19_v16i8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v8i16.*:.*cle_s.h.*test19_v8i16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v4i32.*:.*cle_s.w.*test19_v4i32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v2i64.*:.*cle_s.d.*test19_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v16u8.*:.*cle_u.b.*test19_v16u8" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v8u16.*:.*cle_u.h.*test19_v8u16" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v4u32.*:.*cle_u.w.*test19_v4u32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v2u64.*:.*cle_u.d.*test19_v2u64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v4f32.*:.*fsle.w.*test19_v4f32" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v2f64.*:.*fsle.d.*test19_v2f64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test19_v16i8.*:.*cle_s.b.*test19_v16i8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v8i16.*:.*cle_s.h.*test19_v8i16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v4i32.*:.*cle_s.w.*test19_v4i32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v2i64.*:.*cle_s.d.*test19_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v16u8.*:.*cle_u.b.*test19_v16u8" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v8u16.*:.*cle_u.h.*test19_v8u16" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v4u32.*:.*cle_u.w.*test19_v4u32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v2u64.*:.*cle_u.d.*test19_v2u64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v4f32.*:.*fsle.w.*test19_v4f32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test19_v2f64.*:.*fsle.d.*test19_v2f64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test20_v16i8.*:.*addvi.b.*test20_v16i8" } } */
+/* { dg-final { scan-assembler "test20_v8i16.*:.*addvi.h.*test20_v8i16" } } */
+/* { dg-final { scan-assembler "test20_v4i32.*:.*addvi.w.*test20_v4i32" } } */
+/* { dg-final { scan-assembler "test20_v2i64.*:.*addvi.d.*test20_v2i64" } } */
+/* { dg-final { scan-assembler "test20_v16u8.*:.*addvi.b.*test20_v16u8" } } */
+/* { dg-final { scan-assembler "test20_v8u16.*:.*addvi.h.*test20_v8u16" } } */
+/* { dg-final { scan-assembler "test20_v4u32.*:.*addvi.w.*test20_v4u32" } } */
+/* { dg-final { scan-assembler "test20_v2u64.*:.*addvi.d.*test20_v2u64" } } */
+/* { dg-final { scan-assembler "test21_v16i8.*:.*subvi.b.*test21_v16i8" } } */
+/* { dg-final { scan-assembler "test21_v8i16.*:.*subvi.h.*test21_v8i16" } } */
+/* { dg-final { scan-assembler "test21_v4i32.*:.*subvi.w.*test21_v4i32" } } */
+/* { dg-final { scan-assembler "test21_v2i64.*:.*subvi.d.*test21_v2i64" } } */
+/* { dg-final { scan-assembler "test21_v16u8.*:.*subvi.b.*test21_v16u8" } } */
+/* { dg-final { scan-assembler "test21_v8u16.*:.*subvi.h.*test21_v8u16" } } */
+/* { dg-final { scan-assembler "test21_v4u32.*:.*subvi.w.*test21_v4u32" } } */
+/* { dg-final { scan-assembler "test21_v2u64.*:.*subvi.d.*test21_v2u64" } } */
+/* Note: the output varies across optimization levels but is limited to two variants. */
+/* { dg-final { scan-assembler "test22_v16i8.*:.*(ldi.b.*37.*mulv.b|slli.b.*addv.b).*test22_v16i8" } } */
+/* { dg-final { scan-assembler "test22_v8i16.*:.*(ldi.h.*37.*mulv.h|slli.h.*addv.h).*test22_v8i16" } } */
+/* { dg-final { scan-assembler "test22_v4i32.*:.*(ldi.w.*37.*mulv.w|slli.w.*addv.w).*test22_v4i32" } } */
+/* { dg-final { scan-assembler "test22_v2i64.*:.*(ldi.d.*37.*mulv.d|slli.d.*addv.d).*test22_v2i64" } } */
+/* { dg-final { scan-assembler "test22_v16u8.*:.*(ldi.b.*37.*mulv.b|slli.b.*addv.b).*test22_v16u8" } } */
+/* { dg-final { scan-assembler "test22_v8u16.*:.*(ldi.h.*37.*mulv.h|slli.h.*addv.h).*test22_v8u16" } } */
+/* { dg-final { scan-assembler "test22_v4u32.*:.*(ldi.w.*37.*mulv.w|slli.w.*addv.w).*test22_v4u32" } } */
+/* { dg-final { scan-assembler "test22_v2u64.*:.*(ldi.d.*37.*mulv.d|slli.d.*addv.d).*test22_v2u64" } } */
+/* { dg-final { scan-assembler "test23_v16i8.*:.*ldi.b\t\\\$w\\d+,37.*div_s.b.*test23_v16i8" } } */
+/* { dg-final { scan-assembler "test23_v8i16.*:.*ldi.h\t\\\$w\\d+,37.*div_s.h.*test23_v8i16" } } */
+/* { dg-final { scan-assembler "test23_v4i32.*:.*ldi.w\t\\\$w\\d+,37.*div_s.w.*test23_v4i32" } } */
+/* { dg-final { scan-assembler "test23_v2i64.*:.*ldi.d\t\\\$w\\d+,37.*div_s.d.*test23_v2i64" } } */
+/* { dg-final { scan-assembler "test23_v16u8.*:.*ldi.b\t\\\$w\\d+,37.*div_u.b.*test23_v16u8" } } */
+/* { dg-final { scan-assembler "test23_v8u16.*:.*ldi.h\t\\\$w\\d+,37.*div_u.h.*test23_v8u16" } } */
+/* { dg-final { scan-assembler "test23_v4u32.*:.*ldi.w\t\\\$w\\d+,37.*div_u.w.*test23_v4u32" } } */
+/* { dg-final { scan-assembler "test23_v2u64.*:.*ldi.d\t\\\$w\\d+,37.*div_u.d.*test23_v2u64" } } */
+/* { dg-final { scan-assembler "test24_v16i8.*:.*ldi.b\t\\\$w\\d+,37.*mod_s.b.*test24_v16i8" } } */
+/* { dg-final { scan-assembler "test24_v8i16.*:.*ldi.h\t\\\$w\\d+,37.*mod_s.h.*test24_v8i16" } } */
+/* { dg-final { scan-assembler "test24_v4i32.*:.*ldi.w\t\\\$w\\d+,37.*mod_s.w.*test24_v4i32" } } */
+/* { dg-final { scan-assembler "test24_v2i64.*:.*ldi.d\t\\\$w\\d+,37.*mod_s.d.*test24_v2i64" } } */
+/* { dg-final { scan-assembler "test24_v16u8.*:.*ldi.b\t\\\$w\\d+,37.*mod_u.b.*test24_v16u8" } } */
+/* { dg-final { scan-assembler "test24_v8u16.*:.*ldi.h\t\\\$w\\d+,37.*mod_u.h.*test24_v8u16" } } */
+/* { dg-final { scan-assembler "test24_v4u32.*:.*ldi.w\t\\\$w\\d+,37.*mod_u.w.*test24_v4u32" } } */
+/* { dg-final { scan-assembler "test24_v2u64.*:.*ldi.d\t\\\$w\\d+,37.*mod_u.d.*test24_v2u64" } } */
+/* { dg-final { scan-assembler "test25_v16i8.*:.*xori.b.*test25_v16i8" } } */
+/* { dg-final { scan-assembler "test25_v8i16.*:.*ldi.h\t\\\$w\\d+,37.*xor.v.*test25_v8i16" } } */
+/* { dg-final { scan-assembler "test25_v4i32.*:.*ldi.w\t\\\$w\\d+,37.*xor.v.*test25_v4i32" } } */
+/* { dg-final { scan-assembler "test25_v2i64.*:.*ldi.d\t\\\$w\\d+,37.*xor.v.*test25_v2i64" } } */
+/* { dg-final { scan-assembler "test25_v16u8.*:.*xori.b.*test25_v16u8" } } */
+/* { dg-final { scan-assembler "test25_v8u16.*:.*ldi.h\t\\\$w\\d+,37.*xor.v.*test25_v8u16" } } */
+/* { dg-final { scan-assembler "test25_v4u32.*:.*ldi.w\t\\\$w\\d+,37.*xor.v.*test25_v4u32" } } */
+/* { dg-final { scan-assembler "test25_v2u64.*:.*ldi.d\t\\\$w\\d+,37.*xor.v.*test25_v2u64" } } */
+/* { dg-final { scan-assembler "test26_v16i8.*:.*ori.b.*test26_v16i8" } } */
+/* { dg-final { scan-assembler "test26_v8i16.*:.*ldi.h\t\\\$w\\d+,37.*or.v.*test26_v8i16" } } */
+/* { dg-final { scan-assembler "test26_v4i32.*:.*ldi.w\t\\\$w\\d+,37.*or.v.*test26_v4i32" } } */
+/* { dg-final { scan-assembler "test26_v2i64.*:.*ldi.d\t\\\$w\\d+,37.*or.v.*test26_v2i64" } } */
+/* { dg-final { scan-assembler "test26_v16u8.*:.*ori.b.*test26_v16u8" } } */
+/* { dg-final { scan-assembler "test26_v8u16.*:.*ldi.h\t\\\$w\\d+,37.*or.v.*test26_v8u16" } } */
+/* { dg-final { scan-assembler "test26_v4u32.*:.*ldi.w\t\\\$w\\d+,37.*or.v.*test26_v4u32" } } */
+/* { dg-final { scan-assembler "test26_v2u64.*:.*ldi.d\t\\\$w\\d+,37.*or.v.*test26_v2u64" } } */
+/* { dg-final { scan-assembler "test27_v16i8.*:.*andi.b.*test27_v16i8" } } */
+/* { dg-final { scan-assembler "test27_v8i16.*:.*ldi.h\t\\\$w\\d+,37.*and.v.*test27_v8i16" } } */
+/* { dg-final { scan-assembler "test27_v4i32.*:.*ldi.w\t\\\$w\\d+,37.*and.v.*test27_v4i32" } } */
+/* { dg-final { scan-assembler "test27_v2i64.*:.*ldi.d\t\\\$w\\d+,37.*and.v.*test27_v2i64" } } */
+/* { dg-final { scan-assembler "test27_v16u8.*:.*andi.b.*test27_v16u8" } } */
+/* { dg-final { scan-assembler "test27_v8u16.*:.*ldi.h\t\\\$w\\d+,37.*and.v.*test27_v8u16" } } */
+/* { dg-final { scan-assembler "test27_v4u32.*:.*ldi.w\t\\\$w\\d+,37.*and.v.*test27_v4u32" } } */
+/* { dg-final { scan-assembler "test27_v2u64.*:.*ldi.d\t\\\$w\\d+,37.*and.v.*test27_v2u64" } } */
+/* { dg-final { scan-assembler "test28_v16i8.*:.*srai.b.*test28_v16i8" } } */
+/* { dg-final { scan-assembler "test28_v8i16.*:.*srai.h.*test28_v8i16" } } */
+/* { dg-final { scan-assembler "test28_v4i32.*:.*srai.w.*test28_v4i32" } } */
+/* { dg-final { scan-assembler "test28_v2i64.*:.*srai.d.*test28_v2i64" } } */
+/* { dg-final { scan-assembler "test28_v16u8.*:.*srli.b.*test28_v16u8" } } */
+/* { dg-final { scan-assembler "test28_v8u16.*:.*srli.h.*test28_v8u16" } } */
+/* { dg-final { scan-assembler "test28_v4u32.*:.*srli.w.*test28_v4u32" } } */
+/* { dg-final { scan-assembler "test28_v2u64.*:.*srli.d.*test28_v2u64" } } */
+/* { dg-final { scan-assembler "test29_v16i8.*:.*slli.b.*test29_v16i8" } } */
+/* { dg-final { scan-assembler "test29_v8i16.*:.*slli.h.*test29_v8i16" } } */
+/* { dg-final { scan-assembler "test29_v4i32.*:.*slli.w.*test29_v4i32" } } */
+/* { dg-final { scan-assembler "test29_v2i64.*:.*slli.d.*test29_v2i64" } } */
+/* { dg-final { scan-assembler "test29_v16u8.*:.*slli.b.*test29_v16u8" } } */
+/* { dg-final { scan-assembler "test29_v8u16.*:.*slli.h.*test29_v8u16" } } */
+/* { dg-final { scan-assembler "test29_v4u32.*:.*slli.w.*test29_v4u32" } } */
+/* { dg-final { scan-assembler "test29_v2u64.*:.*slli.d.*test29_v2u64" } } */
+/* { dg-final { scan-assembler "test30_v16i8.*:.*ceqi.b.*test30_v16i8" } } */
+/* { dg-final { scan-assembler "test30_v8i16.*:.*ceqi.h.*test30_v8i16" } } */
+/* { dg-final { scan-assembler "test30_v4i32.*:.*ceqi.w.*test30_v4i32" } } */
+/* { dg-final { scan-assembler "test30_v2i64.*:.*ceqi.d.*test30_v2i64" } } */
+/* { dg-final { scan-assembler "test30_v16u8.*:.*ceqi.b.*test30_v16u8" } } */
+/* { dg-final { scan-assembler "test30_v8u16.*:.*ceqi.h.*test30_v8u16" } } */
+/* { dg-final { scan-assembler "test30_v4u32.*:.*ceqi.w.*test30_v4u32" } } */
+/* { dg-final { scan-assembler "test30_v2u64.*:.*ceqi.d.*test30_v2u64" } } */
+/* { dg-final { scan-assembler "test31_s_v16i8.*:.*clti_s.b.*test31_s_v16i8" } } */
+/* { dg-final { scan-assembler "test31_s_v8i16.*:.*clti_s.h.*test31_s_v8i16" } } */
+/* { dg-final { scan-assembler "test31_s_v4i32.*:.*clti_s.w.*test31_s_v4i32" } } */
+/* { dg-final { scan-assembler "test31_s_v2i64.*:.*clti_s.d.*test31_s_v2i64" } } */
+/* { dg-final { scan-assembler "test31_u_v16u8.*:.*clti_u.b.*test31_u_v16u8" } } */
+/* { dg-final { scan-assembler "test31_u_v8u16.*:.*clti_u.h.*test31_u_v8u16" } } */
+/* { dg-final { scan-assembler "test31_u_v4u32.*:.*clti_u.w.*test31_u_v4u32" } } */
+/* { dg-final { scan-assembler "test31_u_v2u64.*:.*clti_u.d.*test31_u_v2u64" } } */
+/* { dg-final { scan-assembler "test32_s_v16i8.*:.*clei_s.b.*test32_s_v16i8" } } */
+/* { dg-final { scan-assembler "test32_s_v8i16.*:.*clei_s.h.*test32_s_v8i16" } } */
+/* { dg-final { scan-assembler "test32_s_v4i32.*:.*clei_s.w.*test32_s_v4i32" } } */
+/* { dg-final { scan-assembler "test32_s_v2i64.*:.*clei_s.d.*test32_s_v2i64" } } */
+/* { dg-final { scan-assembler "test32_u_v16u8.*:.*clei_u.b.*test32_u_v16u8" } } */
+/* { dg-final { scan-assembler "test32_u_v8u16.*:.*clei_u.h.*test32_u_v8u16" } } */
+/* { dg-final { scan-assembler "test32_u_v4u32.*:.*clei_u.w.*test32_u_v4u32" } } */
+/* { dg-final { scan-assembler "test32_u_v2u64.*:.*clei_u.d.*test32_u_v2u64" } } */
+/* { dg-final { scan-assembler "test33_v4f32.*:.*fadd.w.*test33_v4f32" } } */
+/* { dg-final { scan-assembler "test33_v2f64.*:.*fadd.d.*test33_v2f64" } } */
+/* { dg-final { scan-assembler "test34_v4f32.*:.*fsub.w.*test34_v4f32" } } */
+/* { dg-final { scan-assembler "test34_v2f64.*:.*fsub.d.*test34_v2f64" } } */
+/* { dg-final { scan-assembler "test35_v4f32.*:.*fmul.w.*test35_v4f32" } } */
+/* { dg-final { scan-assembler "test35_v2f64.*:.*fmul.d.*test35_v2f64" } } */
+/* { dg-final { scan-assembler "test36_v4f32.*:.*fdiv.w.*test36_v4f32" } } */
+/* { dg-final { scan-assembler "test36_v2f64.*:.*fdiv.d.*test36_v2f64" } } */
+/* { dg-final { scan-assembler "test37_v16i8.*:.*maddv.b.*test37_v16i8" } } */
+/* { dg-final { scan-assembler "test37_v8i16.*:.*maddv.h.*test37_v8i16" } } */
+/* { dg-final { scan-assembler "test37_v4i32.*:.*maddv.w.*test37_v4i32" } } */
+/* { dg-final { scan-assembler "test37_v2i64.*:.*maddv.d.*test37_v2i64" } } */
+/* { dg-final { scan-assembler "test37_v16u8.*:.*maddv.b.*test37_v16u8" } } */
+/* { dg-final { scan-assembler "test37_v8u16.*:.*maddv.h.*test37_v8u16" } } */
+/* { dg-final { scan-assembler "test37_v4u32.*:.*maddv.w.*test37_v4u32" } } */
+/* { dg-final { scan-assembler "test37_v2u64.*:.*maddv.d.*test37_v2u64" } } */
+/* { dg-final { scan-assembler "test37_v4f32.*:.*fmadd.w.*test37_v4f32" } } */
+/* { dg-final { scan-assembler "test37_v2f64.*:.*fmadd.d.*test37_v2f64" } } */
+/* { dg-final { scan-assembler "test38_v16i8.*:.*msubv.b.*test38_v16i8" } } */
+/* { dg-final { scan-assembler "test38_v8i16.*:.*msubv.h.*test38_v8i16" } } */
+/* { dg-final { scan-assembler "test38_v4i32.*:.*msubv.w.*test38_v4i32" } } */
+/* { dg-final { scan-assembler "test38_v2i64.*:.*msubv.d.*test38_v2i64" } } */
+/* { dg-final { scan-assembler "test38_v16u8.*:.*msubv.b.*test38_v16u8" } } */
+/* { dg-final { scan-assembler "test38_v8u16.*:.*msubv.h.*test38_v8u16" } } */
+/* { dg-final { scan-assembler "test38_v4u32.*:.*msubv.w.*test38_v4u32" } } */
+/* { dg-final { scan-assembler "test38_v2u64.*:.*msubv.d.*test38_v2u64" } } */
+/* { dg-final { scan-assembler "test38_v4f32.*:.*fmsub.w.*test38_v4f32" } } */
+/* { dg-final { scan-assembler "test38_v2f64.*:.*fmsub.d.*test38_v2f64" } } */
+/* { dg-final { scan-assembler "test39_v16i8.*:.*ld.b.*test39_v16i8" } } */
+/* { dg-final { scan-assembler "test39_v8i16.*:.*ld.h.*test39_v8i16" } } */
+/* { dg-final { scan-assembler "test39_v4i32.*:.*ld.w.*test39_v4i32" } } */
+/* { dg-final { scan-assembler "test39_v2i64.*:.*ld.d.*test39_v2i64" } } */
+/* { dg-final { scan-assembler "test40_min_v16i8.*:.*ldi.b\t\\\$w\\d+,-128.*test40_min_v16i8" } } */
+/* { dg-final { scan-assembler "test40_min_v8i16.*:.*ldi.h\t\\\$w\\d+,-512.*test40_min_v8i16" } } */
+/* { dg-final { scan-assembler "test40_min_v4i32.*:.*ldi.w\t\\\$w\\d+,-512.*test40_min_v4i32" } } */
+/* { dg-final { scan-assembler "test40_min_v2i64.*:.*ldi.d\t\\\$w\\d+,-512.*test40_min_v2i64" } } */
+/* { dg-final { scan-assembler "test40_max_v16i8.*:.*ldi.b\t\\\$w\\d+,127.*test40_max_v16i8" } } */
+/* { dg-final { scan-assembler "test40_max_v8i16.*:.*ldi.h\t\\\$w\\d+,511.*test40_max_v8i16" } } */
+/* { dg-final { scan-assembler "test40_max_v4i32.*:.*ldi.w\t\\\$w\\d+,511.*test40_max_v4i32" } } */
+/* { dg-final { scan-assembler "test40_max_v2i64.*:.*ldi.d\t\\\$w\\d+,511.*test40_max_v2i64" } } */
+/* { dg-final { scan-assembler "test41_v16i8.*:.*fill.b.*test41_v16i8" } } */
+/* { dg-final { scan-assembler "test41_v8i16.*:.*fill.h.*test41_v8i16" } } */
+/* { dg-final { scan-assembler "test41_v4i32.*:.*fill.w.*test41_v4i32" } } */
+/* Note: fill.d is only available on MIPS64; an equivalent sequence is used on MIPS32. */
+/* { dg-final { scan-assembler "test41_v2i64.*:.*fill.d.*test41_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test41_v2i64.*:.*fill.w.*insert.w.*test41_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test42_v16i8.*:.*insert.b.*test42_v16i8" } } */
+/* { dg-final { scan-assembler "test42_v8i16.*:.*insert.h.*test42_v8i16" } } */
+/* { dg-final { scan-assembler "test42_v4i32.*:.*insert.w.*test42_v4i32" } } */
+/* Note: insert.d is only available on MIPS64; an equivalent sequence is used on MIPS32. */
+/* { dg-final { scan-assembler "test42_v2i64.*:.*insert.d.*test42_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test42_v2i64.*:.*\(.*insert.w\)\{2\}.*test42_v2i64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test43_v16i8.*:.*insve.b.*test43_v16i8" } } */
+/* { dg-final { scan-assembler "test43_v8i16.*:.*insve.h.*test43_v8i16" } } */
+/* { dg-final { scan-assembler "test43_v4i32.*:.*insve.w.*test43_v4i32" } } */
+/* { dg-final { scan-assembler "test43_v2i64.*:.*insve.d.*test43_v2i64" } } */
+/* { dg-final { scan-assembler "test44_v16i8.*:.*copy_s.b.*test44_v16i8" } } */
+/* { dg-final { scan-assembler "test44_v8i16.*:.*copy_s.h.*test44_v8i16" } } */
+/* { dg-final { scan-assembler "test44_v4i32.*:.*copy_\(s|u\).w.*test44_v4i32" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test44_v4i32.*:.*copy_s.w.*test44_v4i32" { target mips64 } } } */
+/* Note: copy_s.d is only available on MIPS64; an equivalent sequence is used on MIPS32. */
+/* { dg-final { scan-assembler "test44_v2i64.*:.*copy_s.d.*test44_v2i64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test44_v2i64.*:.*\(.*copy_s.w\)\{2\}.*test44_v2i64" { target {! mips64 } } } } */
+/* Note: two outputs are possible for unsigned return types: copy unsigned, or
+   copy signed followed by a logical AND.  When the element width equals the
+   target's register size, the logical AND is not needed and is not emitted. */
+/* { dg-final { scan-assembler "test45_v16u8.*:.*\(copy_u.b|copy_s.b.*andi.*0x\(00\)?ff\).*test45_v16u8" } } */
+/* { dg-final { scan-assembler "test45_v8u16.*:.*\(copy_u.h|copy_s.h.*andi.*0xffff\).*test45_v8u16" } } */
+/* { dg-final { scan-assembler "test45_v4u32.*:.*\(copy_u.w|copy_s.w\).*test45_v4u32" } } */
+/* { dg-final { scan-assembler "test45_v2u64.*:.*\(copy_u.d|copy_s.d\).*test45_v2u64" { target mips64 } } } */
+/* { dg-final { scan-assembler "test45_v2u64.*:.*\(\(copy_u|copy_s\).w.*\)\{2\}.*test45_v2u64" { target {! mips64 } } } } */
+/* { dg-final { scan-assembler "test46_v16i8.*:.*st.b.*test46_v16i8" } } */
+/* { dg-final { scan-assembler "test46_v8i16.*:.*st.h.*test46_v8i16" } } */
+/* { dg-final { scan-assembler "test46_v4i32.*:.*st.w.*test46_v4i32" } } */
+/* { dg-final { scan-assembler "test46_v2i64.*:.*st.d.*test46_v2i64" } } */
+
+typedef signed char v16i8 __attribute__ ((vector_size(16)));
+typedef short v8i16 __attribute__ ((vector_size(16)));
+typedef int v4i32 __attribute__ ((vector_size(16)));
+typedef long long v2i64 __attribute__ ((vector_size(16)));
+typedef unsigned char v16u8 __attribute__ ((vector_size(16)));
+typedef unsigned short v8u16 __attribute__ ((vector_size(16)));
+typedef unsigned int v4u32 __attribute__ ((vector_size(16)));
+typedef unsigned long long v2u64 __attribute__ ((vector_size(16)));
+typedef float v4f32 __attribute__ ((vector_size(16)));
+typedef double v2f64 __attribute__ ((vector_size(16)));
+
+float imm_f = 37.0;
+
+#define v16i8_DF b
+#define v8i16_DF h
+#define v4i32_DF w
+#define v2i64_DF d
+#define v16u8_DF b
+#define v8u16_DF h
+#define v4u32_DF w
+#define v2u64_DF d
+
+#define v16i8_IN int
+#define v8i16_IN int
+#define v4i32_IN int
+#define v2i64_IN long long
+#define v16u8_IN int
+#define v8u16_IN int
+#define v4u32_IN int
+#define v2u64_IN long long
+
+#define v16i8_INITV V16
+#define v8i16_INITV V8
+#define v4i32_INITV V4
+#define v2i64_INITV V2
+#define v16u8_INITV V16
+#define v8u16_INITV V8
+#define v4u32_INITV V4
+#define v2u64_INITV V2
+
+#define v16i8_LDI_MIN -128
+#define v16i8_LDI_MAX 127
+#define v8i16_LDI_MIN -512
+#define v8i16_LDI_MAX 511
+#define v4i32_LDI_MIN -512
+#define v4i32_LDI_MAX 511
+#define v2i64_LDI_MIN -512
+#define v2i64_LDI_MAX 511
+
+#define VE2(VALUE) (VALUE), (VALUE)
+#define VE4(VALUE) VE2(VALUE), VE2(VALUE)
+#define VE8(VALUE) VE4(VALUE), VE4(VALUE)
+#define VE16(VALUE) VE8(VALUE), VE8(VALUE)
+
+#define V16(TYPE, VALUE) (TYPE) { VE16(VALUE) }
+#define V8(TYPE, VALUE) (TYPE) { VE8(VALUE) }
+#define V4(TYPE, VALUE) (TYPE) { VE4(VALUE) }
+#define V2(TYPE, VALUE) (TYPE) { VE2(VALUE) }
+
+#define INIT_VECTOR(TYPE, VALUE) TYPE ## _INITV (TYPE, VALUE)
+
+
+#define DECLARE(TYPE) TYPE TYPE ## _0, TYPE ## _1, TYPE ## _2;
+#define RETURN(TYPE) NOMIPS16 TYPE test0_ ## TYPE () { return TYPE ## _0; }
+#define ASSIGN(TYPE) NOMIPS16 void test1_ ## TYPE (TYPE i) { TYPE ## _1 = i; }
+#define ADD(TYPE) NOMIPS16 TYPE test2_ ## TYPE (TYPE i, TYPE j) { return i + j; }
+#define SUB(TYPE) NOMIPS16 TYPE test3_ ## TYPE (TYPE i, TYPE j) { return i - j; }
+#define MUL(TYPE) NOMIPS16 TYPE test4_ ## TYPE (TYPE i, TYPE j) { return i * j; }
+#define DIV(TYPE) TYPE test5_ ## TYPE (TYPE i, TYPE j) { return i / j; }
+#define MOD(TYPE) TYPE test6_ ## TYPE (TYPE i, TYPE j) { return i % j; }
+#define MINUS(TYPE) TYPE test7_ ## TYPE (TYPE i) { return -i; }
+#define XOR(TYPE) TYPE test8_ ## TYPE (TYPE i, TYPE j) { return i ^ j; }
+#define OR(TYPE) TYPE test9_ ## TYPE (TYPE i, TYPE j) { return i | j; }
+#define AND(TYPE) TYPE test10_ ## TYPE (TYPE i, TYPE j) { return i & j; }
+#define BIT_COMPLEMENT(TYPE) TYPE test11_ ## TYPE (TYPE i) { return ~i; }
+#define SHIFT_RIGHT(TYPE) TYPE test12_ ## TYPE (TYPE i, TYPE j) { return i >> j; }
+#define SHIFT_LEFT(TYPE) TYPE test13_ ## TYPE (TYPE i, TYPE j) { return i << j; }
+#define EQ(TYPE) TYPE test14_ ## TYPE (TYPE i, TYPE j) { return i == j; }
+#define NEQ(TYPE) TYPE test15_ ## TYPE (TYPE i, TYPE j) { return i != j; }
+#define LT(TYPE) TYPE test16_ ## TYPE (TYPE i, TYPE j) { return i < j; }
+#define LEQ(TYPE) TYPE test17_ ## TYPE (TYPE i, TYPE j) { return i <= j; }
+#define GT(TYPE) TYPE test18_ ## TYPE (TYPE i, TYPE j) { return i > j; }
+#define GEQ(TYPE) TYPE test19_ ## TYPE (TYPE i, TYPE j) { return i >= j; }
+
+#define ADD_I(TYPE) TYPE test20_ ## TYPE (TYPE i) { return i + 31; }
+#define SUB_I(TYPE) TYPE test21_ ## TYPE (TYPE i) { return i - 31; }
+#define MUL_I(TYPE) TYPE test22_ ## TYPE (TYPE i) { return i * 37; }
+#define DIV_I(TYPE) TYPE test23_ ## TYPE (TYPE i) { return i / 37; }
+#define MOD_I(TYPE) TYPE test24_ ## TYPE (TYPE i) { return i % 37; }
+#define XOR_I(TYPE) TYPE test25_ ## TYPE (TYPE i) { return i ^ 37; }
+#define OR_I(TYPE) TYPE test26_ ## TYPE (TYPE i) { return i | 37; }
+#define AND_I(TYPE) TYPE test27_ ## TYPE (TYPE i) { return i & 37; }
+#define SHIFT_RIGHT_I(TYPE) TYPE test28_ ## TYPE (TYPE i) { return i >> 3; }
+#define SHIFT_LEFT_I(TYPE) TYPE test29_ ## TYPE (TYPE i) { return i << 3; }
+#define EQ_I(TYPE) TYPE test30_ ## TYPE (TYPE i) { return i == 5; }
+#define LT_S_I(TYPE) TYPE test31_s_ ## TYPE (TYPE i) { return i < 5; }
+#define LT_U_I(TYPE) TYPE test31_u_ ## TYPE (TYPE i) { return i < (unsigned) 5; }
+#define LEQ_S_I(TYPE) TYPE test32_s_ ## TYPE (TYPE i) { return i <= 5; }
+#define LEQ_U_I(TYPE) TYPE test32_u_ ## TYPE (TYPE i) { return i <= (unsigned) 5; }
+
+#define ADD_F(TYPE) TYPE test33_ ## TYPE (TYPE i) { return i + imm_f; }
+#define SUB_F(TYPE) TYPE test34_ ## TYPE (TYPE i) { return i - imm_f; }
+#define MUL_F(TYPE) TYPE test35_ ## TYPE (TYPE i) { return i * imm_f; }
+#define DIV_F(TYPE) TYPE test36_ ## TYPE (TYPE i) { return i / imm_f; }
+
+#define MADD(TYPE) TYPE test37_ ## TYPE (TYPE i, TYPE j, TYPE k) { return i * j + k; }
+#define MSUB(TYPE) TYPE test38_ ## TYPE (TYPE i, TYPE j, TYPE k) { return k - i * j; }
+
+/* MSA Load/Store and Move instructions.  */
+#define LOAD_V(TYPE) TYPE test39_ ## TYPE (TYPE *i) { return *i; }
+#define LOAD_I_MIN(TYPE) TYPE test40_min_ ## TYPE (TYPE *i) { return INIT_VECTOR(TYPE, TYPE ## _LDI_MIN); }
+#define LOAD_I_MAX(TYPE) TYPE test40_max_ ## TYPE (TYPE *i) { return INIT_VECTOR(TYPE, TYPE ## _LDI_MAX); }
+#define FILL(TYPE) TYPE test41_ ## TYPE (TYPE ## _IN i) { return INIT_VECTOR(TYPE, i); }
+#define INSERT(TYPE) TYPE test42_ ## TYPE (TYPE ## _IN i) { TYPE a = INIT_VECTOR(TYPE, 0); a[1] = i; return a; }
+#define INSVE(TYPE) TYPE test43_ ## TYPE (TYPE i) { TYPE a = INIT_VECTOR(TYPE, 0); a[1] = i[0]; return a; }
+#define COPY_S(TYPE) TYPE ## _IN test44_ ## TYPE (TYPE i) { return i[1]; }
+#define COPY_U(TYPE) TYPE ## _IN test45_ ## TYPE (TYPE i) { return i[1]; }
+#define STORE_V(TYPE) void test46_ ## TYPE (TYPE i) { TYPE ## _0 = i; }
+
+#define ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(FUNC) \
+ FUNC (v16i8) \
+ FUNC (v8i16) \
+ FUNC (v4i32) \
+ FUNC (v2i64)
+
+#define ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(FUNC) \
+ FUNC (v16u8) \
+ FUNC (v8u16) \
+ FUNC (v4u32) \
+ FUNC (v2u64)
+
+#define ITERATE_FOR_ALL_INT_VECTOR_TYPES(FUNC) \
+ ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(FUNC) \
+ ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(FUNC)
+
+#define ITERATE_FOR_ALL_INT_TYPES(FUNC) \
+ ITERATE_FOR_ALL_INT_VECTOR_TYPES(FUNC) \
+
+#define ITERATE_FOR_ALL_REAL_VECTOR_TYPES(FUNC) \
+ FUNC (v4f32) \
+ FUNC (v2f64) \
+
+#define ITERATE_FOR_ALL_REAL_SCALAR_TYPES(FUNC) \
+ FUNC (f64) \
+ FUNC (f32)
+
+#define ITERATE_FOR_ALL_REAL_TYPES(FUNC) \
+ ITERATE_FOR_ALL_REAL_VECTOR_TYPES(FUNC) \
+
+#define ITERATE_FOR_ALL_TYPES(FUNC) \
+ ITERATE_FOR_ALL_INT_TYPES(FUNC) \
+ ITERATE_FOR_ALL_REAL_TYPES(FUNC)
+
+ITERATE_FOR_ALL_TYPES (ADD)
+ITERATE_FOR_ALL_TYPES (SUB)
+ITERATE_FOR_ALL_TYPES (MUL)
+ITERATE_FOR_ALL_TYPES (DIV)
+ITERATE_FOR_ALL_INT_TYPES (MOD)
+ITERATE_FOR_ALL_INT_TYPES (XOR)
+ITERATE_FOR_ALL_INT_TYPES (OR)
+ITERATE_FOR_ALL_INT_TYPES (AND)
+ITERATE_FOR_ALL_INT_TYPES (SHIFT_RIGHT)
+ITERATE_FOR_ALL_INT_TYPES (SHIFT_LEFT)
+ITERATE_FOR_ALL_TYPES (MINUS)
+ITERATE_FOR_ALL_INT_TYPES (BIT_COMPLEMENT)
+ITERATE_FOR_ALL_TYPES (MADD)
+ITERATE_FOR_ALL_TYPES (MSUB)
+
+ITERATE_FOR_ALL_TYPES (DECLARE)
+ITERATE_FOR_ALL_TYPES (RETURN)
+ITERATE_FOR_ALL_TYPES (ASSIGN)
+ITERATE_FOR_ALL_INT_TYPES (ADD_I)
+ITERATE_FOR_ALL_INT_TYPES (SUB_I)
+ITERATE_FOR_ALL_INT_TYPES (MUL_I)
+ITERATE_FOR_ALL_INT_TYPES (DIV_I)
+ITERATE_FOR_ALL_INT_TYPES (MOD_I)
+ITERATE_FOR_ALL_INT_TYPES (XOR_I)
+ITERATE_FOR_ALL_INT_TYPES (OR_I)
+ITERATE_FOR_ALL_INT_TYPES (AND_I)
+ITERATE_FOR_ALL_INT_TYPES (SHIFT_RIGHT_I)
+ITERATE_FOR_ALL_INT_TYPES (SHIFT_LEFT_I)
+ITERATE_FOR_ALL_REAL_TYPES (ADD_F)
+ITERATE_FOR_ALL_REAL_TYPES (SUB_F)
+ITERATE_FOR_ALL_REAL_TYPES (MUL_F)
+ITERATE_FOR_ALL_REAL_TYPES (DIV_F)
+ITERATE_FOR_ALL_TYPES (EQ)
+ITERATE_FOR_ALL_TYPES (EQ_I)
+ITERATE_FOR_ALL_TYPES (NEQ)
+ITERATE_FOR_ALL_TYPES (LT)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(LT_S_I)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(LT_U_I)
+ITERATE_FOR_ALL_TYPES (LEQ)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(LEQ_S_I)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(LEQ_U_I)
+ITERATE_FOR_ALL_TYPES (GT)
+ITERATE_FOR_ALL_TYPES (GEQ)
+
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(LOAD_V)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(LOAD_I_MIN)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(LOAD_I_MAX)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(FILL)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(INSERT)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(INSVE)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(COPY_S)
+ITERATE_FOR_ALL_UNSIGNED_INT_VECTOR_TYPES(COPY_U)
+ITERATE_FOR_ALL_SIGNED_INT_VECTOR_TYPES(STORE_V)
diff --git a/gcc/testsuite/gcc.target/mips/msub-3.c b/gcc/testsuite/gcc.target/mips/msub-3.c
index aedd04302ac..132db857de8 100644
--- a/gcc/testsuite/gcc.target/mips/msub-3.c
+++ b/gcc/testsuite/gcc.target/mips/msub-3.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* This test requires widening_mul */
-/* { dg-options "isa_rev>=1 -mgp32 -fexpensive-optimizations" } */
+/* { dg-options "(HAS_MADD) -mgp32 -fexpensive-optimizations" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-times "\tmsub\t" 2 } } */
diff --git a/gcc/testsuite/gcc.target/mips/msubu-3.c b/gcc/testsuite/gcc.target/mips/msubu-3.c
index 2e936ebe03f..07cb7c714f6 100644
--- a/gcc/testsuite/gcc.target/mips/msubu-3.c
+++ b/gcc/testsuite/gcc.target/mips/msubu-3.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* This test requires widening_mul */
-/* { dg-options "isa_rev>=1 -mgp32 -fexpensive-optimizations" } */
+/* { dg-options "(HAS_MADD) -mgp32 -fexpensive-optimizations" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-times "\tmsubu\t" 2 } } */
diff --git a/gcc/testsuite/gcc.target/mips/mulsize-2.c b/gcc/testsuite/gcc.target/mips/mulsize-2.c
index 4cc2224dff9..7c84bfd9cdd 100644
--- a/gcc/testsuite/gcc.target/mips/mulsize-2.c
+++ b/gcc/testsuite/gcc.target/mips/mulsize-2.c
@@ -1,3 +1,4 @@
+/* { dg-options "(!HAS_LSA)" } */
/* { dg-final { scan-assembler "\t.globl\tf9" } } */
/* { dg-final { scan-assembler "\tsll\t" } } */
/* { dg-final { scan-assembler "\taddu\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/mulsize-4.c b/gcc/testsuite/gcc.target/mips/mulsize-4.c
index 7694d2c03dc..f8a94a9588e 100644
--- a/gcc/testsuite/gcc.target/mips/mulsize-4.c
+++ b/gcc/testsuite/gcc.target/mips/mulsize-4.c
@@ -1,3 +1,4 @@
+/* { dg-options "(!HAS_LSA)" } */
/* { dg-final { scan-assembler "\t.globl\tf17" } } */
/* { dg-final { scan-assembler "\tsll\t" } } */
/* { dg-final { scan-assembler "\taddu\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/mulsize-5.c b/gcc/testsuite/gcc.target/mips/mulsize-5.c
new file mode 100644
index 00000000000..1c39a7e3f91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/mulsize-5.c
@@ -0,0 +1,13 @@
+/* { dg-options "(HAS_LSA)" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-final { scan-assembler "\t.globl\tf9" } } */
+/* { dg-final { scan-assembler "\tlsa\t" } } */
+/* { dg-final { scan-assembler-not "\tsll\t" } } */
+/* { dg-final { scan-assembler-not "\taddu\t" } } */
+/* { dg-final { scan-assembler-not "\tli\t" } } */
+/* { dg-final { scan-assembler-not "\tmul\t" } } */
+int
+f9(int x)
+{
+ return x * 9;
+}
diff --git a/gcc/testsuite/gcc.target/mips/mulsize-6.c b/gcc/testsuite/gcc.target/mips/mulsize-6.c
new file mode 100644
index 00000000000..6e9ca003fc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/mulsize-6.c
@@ -0,0 +1,13 @@
+/* { dg-options "(HAS_LSA)" } */
+/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
+/* { dg-final { scan-assembler "\t.globl\tf17" } } */
+/* { dg-final { scan-assembler "\tlsa\t" } } */
+/* { dg-final { scan-assembler-not "\tsll\t" } } */
+/* { dg-final { scan-assembler-not "\taddu\t" } } */
+/* { dg-final { scan-assembler-not "\tli\t" } } */
+/* { dg-final { scan-assembler-not "\tmul\t" } } */
+int
+f17(int x)
+{
+ return x * 17;
+}
diff --git a/gcc/testsuite/gcc.target/mips/mult-1.c b/gcc/testsuite/gcc.target/mips/mult-1.c
index 1038797f228..bd9757cc0ed 100644
--- a/gcc/testsuite/gcc.target/mips/mult-1.c
+++ b/gcc/testsuite/gcc.target/mips/mult-1.c
@@ -1,7 +1,7 @@
/* For SI->DI widening multiplication we should use DINS to combine the two
halves. For Octeon use DMUL with explicit widening. */
/* This test requires widening_mul */
-/* { dg-options "-mgp64 isa_rev>=2 forbid_cpu=octeon.* -fexpensive-optimizations" } */
+/* { dg-options "-mgp64 (HAS_INS) (NOT_HAS_DMUL) -fexpensive-optimizations" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler "\tdins\t" } } */
/* { dg-final { scan-assembler-not "\tdsll\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/near-far-1.c b/gcc/testsuite/gcc.target/mips/near-far-1.c
index ac0cc1ef79b..3562b860358 100644
--- a/gcc/testsuite/gcc.target/mips/near-far-1.c
+++ b/gcc/testsuite/gcc.target/mips/near-far-1.c
@@ -16,5 +16,5 @@ int test ()
/* { dg-final { scan-assembler-not "\tjal\tlong_call_func\n" } } */
/* { dg-final { scan-assembler-not "\tjal\tfar_func\n" } } */
-/* { dg-final { scan-assembler "\tjal\tnear_func\n" } } */
+/* { dg-final { scan-assembler "\t(jal|balc)\tnear_func\n" } } */
/* { dg-final { scan-assembler-not "\tjal\tnormal_func\n" } } */
diff --git a/gcc/testsuite/gcc.target/mips/near-far-2.c b/gcc/testsuite/gcc.target/mips/near-far-2.c
index c954b444cb0..ca129f121b3 100644
--- a/gcc/testsuite/gcc.target/mips/near-far-2.c
+++ b/gcc/testsuite/gcc.target/mips/near-far-2.c
@@ -16,5 +16,5 @@ int test ()
/* { dg-final { scan-assembler-not "\tjal\tlong_call_func\n" } } */
/* { dg-final { scan-assembler-not "\tjal\tfar_func\n" } } */
-/* { dg-final { scan-assembler "\tjal\tnear_func\n" } } */
-/* { dg-final { scan-assembler "\tjal\tnormal_func\n" } } */
+/* { dg-final { scan-assembler "\t(jal|balc)\tnear_func\n" } } */
+/* { dg-final { scan-assembler "\t(jal|balc)\tnormal_func\n" } } */
diff --git a/gcc/testsuite/gcc.target/mips/near-far-3.c b/gcc/testsuite/gcc.target/mips/near-far-3.c
index d4d48b1ed53..9edbff276a1 100644
--- a/gcc/testsuite/gcc.target/mips/near-far-3.c
+++ b/gcc/testsuite/gcc.target/mips/near-far-3.c
@@ -13,5 +13,5 @@ NOMIPS16 int test4 () { return normal_func (); }
/* { dg-final { scan-assembler-not "\tj\tlong_call_func\n" } } */
/* { dg-final { scan-assembler-not "\tj\tfar_func\n" } } */
-/* { dg-final { scan-assembler "\tj(|al)\tnear_func\n" } } */
+/* { dg-final { scan-assembler "\t(j|b)(|al)c?\tnear_func\n" } } */
/* { dg-final { scan-assembler-not "\tj\tnormal_func\n" } } */
diff --git a/gcc/testsuite/gcc.target/mips/near-far-4.c b/gcc/testsuite/gcc.target/mips/near-far-4.c
index 0ea07b06205..69f5d9466c2 100644
--- a/gcc/testsuite/gcc.target/mips/near-far-4.c
+++ b/gcc/testsuite/gcc.target/mips/near-far-4.c
@@ -13,5 +13,5 @@ NOMIPS16 int test4 () { return normal_func (); }
/* { dg-final { scan-assembler-not "\tj\tlong_call_func\n" } } */
/* { dg-final { scan-assembler-not "\tj\tfar_func\n" } } */
-/* { dg-final { scan-assembler "\tj(|al)\tnear_func\n" } } */
-/* { dg-final { scan-assembler "\tj(|al)\tnormal_func\n" } } */
+/* { dg-final { scan-assembler "\t(j|b)(|al)c?\tnear_func\n" } } */
+/* { dg-final { scan-assembler "\t(j|b)(|al)c?\tnormal_func\n" } } */
diff --git a/gcc/testsuite/gcc.target/mips/neg-abs-2.c b/gcc/testsuite/gcc.target/mips/neg-abs-2.c
index 435751e0cc4..59e797def1a 100644
--- a/gcc/testsuite/gcc.target/mips/neg-abs-2.c
+++ b/gcc/testsuite/gcc.target/mips/neg-abs-2.c
@@ -1,7 +1,7 @@
/* Make sure that we avoid abs.fmt and neg.fmt when the signs of NaNs
matter. */
/* { dg-do compile } */
-/* { dg-options "-mhard-float -fno-finite-math-only" } */
+/* { dg-options "isa_rev<=5 -mhard-float -fno-finite-math-only -mabs=legacy" } */
/* { dg-final { scan-assembler-not "\tneg.s\t" } } */
/* { dg-final { scan-assembler-not "\tneg.d\t" } } */
/* { dg-final { scan-assembler-not "\tabs.s\t" } } */
diff --git a/gcc/testsuite/gcc.target/mips/octeon-bbit-3.c b/gcc/testsuite/gcc.target/mips/octeon-bbit-3.c
index 7b73f43a1f4..edb0c385030 100644
--- a/gcc/testsuite/gcc.target/mips/octeon-bbit-3.c
+++ b/gcc/testsuite/gcc.target/mips/octeon-bbit-3.c
@@ -31,6 +31,8 @@ typedef struct bitfield_s {
ulong64 f:18;
} bitfield_t;
+void foo (bitfield_t*);
+
bitfield_t bar;
NOMIPS16 void
diff --git a/gcc/testsuite/gcc.target/mips/octeon-seq-4.c b/gcc/testsuite/gcc.target/mips/octeon-seq-4.c
index 0fd83f0b309..3f2082221d7 100644
--- a/gcc/testsuite/gcc.target/mips/octeon-seq-4.c
+++ b/gcc/testsuite/gcc.target/mips/octeon-seq-4.c
@@ -6,6 +6,8 @@
unsigned
m (unsigned e);
+extern void h ();
+
NOMIPS16 void
f (unsigned i)
{
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-1.c b/gcc/testsuite/gcc.target/mips/oddspreg-1.c
new file mode 100644
index 00000000000..a9c69573693
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-1.c
@@ -0,0 +1,13 @@
+/* Check that we enable odd-numbered single precision registers. */
+/* { dg-options "-mabi=32 -modd-spreg -mhard-float" } */
+
+#if _MIPS_SPFPSET != 32
+#error "Incorrect number of single-precision registers reported"
+#endif
+
+void
+foo ()
+{
+ register float foo asm ("$f1");
+ asm volatile ("" : "=f" (foo));
+}
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-2.c b/gcc/testsuite/gcc.target/mips/oddspreg-2.c
new file mode 100644
index 00000000000..e2e0a2660bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-2.c
@@ -0,0 +1,10 @@
+/* Check that we disable odd-numbered single precision registers. */
+/* { dg-skip-if "needs asm output" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */
+/* { dg-options "-mabi=32 -mno-odd-spreg -mhard-float" } */
+
+void
+foo ()
+{
+ register float foo asm ("$f1"); /* { dg-error "isn't suitable for" } */
+ asm volatile ("" : "=f" (foo));
+}
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-3.c b/gcc/testsuite/gcc.target/mips/oddspreg-3.c
new file mode 100644
index 00000000000..f287eb66e92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-3.c
@@ -0,0 +1,10 @@
+/* Check that we disable odd-numbered single precision registers. */
+/* { dg-skip-if "needs asm output" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */
+/* { dg-options "-mabi=32 -mfp32 -march=loongson3a -mhard-float" } */
+
+void
+foo ()
+{
+ register float foo asm ("$f1"); /* { dg-error "isn't suitable for" } */
+ asm volatile ("" : "=f" (foo));
+}
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-4.c b/gcc/testsuite/gcc.target/mips/oddspreg-4.c
new file mode 100644
index 00000000000..723424a39bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-4.c
@@ -0,0 +1,15 @@
+/* Check that we disable odd-numbered single precision registers and can
+ still generate code. */
+/* { dg-options "-mabi=32 -mno-odd-spreg -mhard-float" } */
+
+#if _MIPS_SPFPSET != 16
+#error "Incorrect number of single-precision registers reported"
+#endif
+
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ return b;
+}
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-5.c b/gcc/testsuite/gcc.target/mips/oddspreg-5.c
new file mode 100644
index 00000000000..8d7d884580c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-5.c
@@ -0,0 +1,11 @@
+/* Check that -mno-odd-spreg is not supported with -mabi=64. */
+/* { dg-options "-mabi=64 -mno-odd-spreg -mhard-float" } */
+/* { dg-error "unsupported combination" "" { target *-*-* } 0 } */
+
+float a;
+float
+foo ()
+{
+ float b = a + 1.0f;
+ return b;
+}
diff --git a/gcc/testsuite/gcc.target/mips/oddspreg-6.c b/gcc/testsuite/gcc.target/mips/oddspreg-6.c
new file mode 100644
index 00000000000..955dea90140
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/oddspreg-6.c
@@ -0,0 +1,10 @@
+/* Check that we disable odd-numbered single precision registers for FPXX. */
+/* { dg-skip-if "needs asm output" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */
+/* { dg-options "-mabi=32 -mfpxx -mhard-float" } */
+
+void
+foo ()
+{
+ register float foo asm ("$f1"); /* { dg-error "isn't suitable for" } */
+ asm volatile ("" : "=f" (foo));
+}
diff --git a/gcc/testsuite/gcc.target/mips/pr37362.c b/gcc/testsuite/gcc.target/mips/pr37362.c
index 848d879d325..2ad4e8dbc17 100644
--- a/gcc/testsuite/gcc.target/mips/pr37362.c
+++ b/gcc/testsuite/gcc.target/mips/pr37362.c
@@ -1,5 +1,5 @@
/* mips*-sde-elf doesn't have 128-bit long doubles. */
-/* { dg-do compile { target { ! { mips*-sde-elf mips*-mti-elf } } } } */
+/* { dg-do compile { target { ! { mips*-sde-elf mips*-mti-elf mips*-img-elf } } } } */
/* { dg-options "-march=mips64r2 -mabi=n32" } */
typedef float TFtype __attribute__((mode(TF)));
diff --git a/gcc/testsuite/gcc.target/mips/timode-1.c b/gcc/testsuite/gcc.target/mips/timode-1.c
index 606fee0cb1a..be3d317cb69 100644
--- a/gcc/testsuite/gcc.target/mips/timode-1.c
+++ b/gcc/testsuite/gcc.target/mips/timode-1.c
@@ -1,4 +1,4 @@
-/* { dg-options "-mgp64" } */
+/* { dg-options "isa_rev<=5 -mgp64" } */
/* { dg-skip-if "we deliberately use calls when optimizing for size" { *-*-* } { "-Os" } { "" } } */
typedef int int128_t __attribute__((mode(TI)));
typedef unsigned int uint128_t __attribute__((mode(TI)));
diff --git a/gcc/testsuite/gcc.target/mips/truncate-3.c b/gcc/testsuite/gcc.target/mips/truncate-3.c
index fcb69e4b44f..988e46eb78d 100644
--- a/gcc/testsuite/gcc.target/mips/truncate-3.c
+++ b/gcc/testsuite/gcc.target/mips/truncate-3.c
@@ -3,6 +3,9 @@
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-not "\tandi?\t" } } */
+extern void g (int);
+
+int
f (long long d)
{
long long c = d & 0xffffffffff;
diff --git a/gcc/testsuite/gcc.target/mips/umips-lwp-1.c b/gcc/testsuite/gcc.target/mips/umips-lwp-1.c
index 0cdb1b7f2bc..8354bf7c43f 100644
--- a/gcc/testsuite/gcc.target/mips/umips-lwp-1.c
+++ b/gcc/testsuite/gcc.target/mips/umips-lwp-1.c
@@ -6,7 +6,7 @@ foo (int *r4)
{
int r5 = r4[0];
int r6 = r4[1];
- r4[2] = r5 * r5;
+ r4[2] = (r5 << 1) + r6;
{
register int r5asm asm ("$5") = r5;
register int r6asm asm ("$6") = r6;
diff --git a/gcc/testsuite/gcc.target/mips/umips-lwp-2.c b/gcc/testsuite/gcc.target/mips/umips-lwp-2.c
index ea3f3960742..6622cf1402a 100644
--- a/gcc/testsuite/gcc.target/mips/umips-lwp-2.c
+++ b/gcc/testsuite/gcc.target/mips/umips-lwp-2.c
@@ -6,7 +6,7 @@ foo (int *r4)
{
int r5 = r4[0];
int r6 = r4[1];
- r4[2] = r6 * r6;
+ r4[2] = (r6 << 1) + r5;
{
register int r5asm asm ("$5") = r5;
register int r6asm asm ("$6") = r6;
diff --git a/gcc/testsuite/gcc.target/mips/umips-lwp-3.c b/gcc/testsuite/gcc.target/mips/umips-lwp-3.c
index 2cb37510feb..46c51e3d0c6 100644
--- a/gcc/testsuite/gcc.target/mips/umips-lwp-3.c
+++ b/gcc/testsuite/gcc.target/mips/umips-lwp-3.c
@@ -6,7 +6,7 @@ foo (int *r4)
{
int r5 = r4[511];
int r6 = r4[512];
- r4[2] = r5 * r5;
+ r4[2] = (r5 << 1) + r6;
{
register int r5asm asm ("$5") = r5;
register int r6asm asm ("$6") = r6;
diff --git a/gcc/testsuite/gcc.target/mips/umips-lwp-4.c b/gcc/testsuite/gcc.target/mips/umips-lwp-4.c
index b8a86b4ed90..dd107ad8952 100644
--- a/gcc/testsuite/gcc.target/mips/umips-lwp-4.c
+++ b/gcc/testsuite/gcc.target/mips/umips-lwp-4.c
@@ -6,7 +6,7 @@ foo (int *r4)
{
int r5 = r4[511];
int r6 = r4[512];
- r4[2] = r6 * r6;
+ r4[2] = (r6 << 1) + r5;
{
register int r5asm asm ("$5") = r5;
register int r6asm asm ("$6") = r6;
diff --git a/gcc/testsuite/gcc.target/mips/umips-store16-1.c b/gcc/testsuite/gcc.target/mips/umips-store16-1.c
new file mode 100644
index 00000000000..6377e8569d6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/umips-store16-1.c
@@ -0,0 +1,30 @@
+/* { dg-options "(-mmicromips)" } */
+/* { dg-do assemble } */
+
+register unsigned int global asm ("$16");
+
+extern void exit (int) __attribute__((noreturn));
+
+MICROMIPS void
+test_sb (unsigned char *ptr, void (*f) (void))
+{
+ ptr[0] = global;
+ f ();
+ exit (0);
+}
+
+MICROMIPS void
+test_sh (unsigned short *ptr, void (*f) (void))
+{
+ ptr[0] = global;
+ f ();
+ exit (0);
+}
+
+MICROMIPS void
+test_sw (unsigned int *ptr, void (*f) (void))
+{
+ ptr[0] = global;
+ f ();
+ exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/mips/umips-store16-2.c b/gcc/testsuite/gcc.target/mips/umips-store16-2.c
new file mode 100644
index 00000000000..0748edb5692
--- /dev/null
+++ b/gcc/testsuite/gcc.target/mips/umips-store16-2.c
@@ -0,0 +1,22 @@
+/* { dg-options "(-mmicromips) -dp" } */
+
+MICROMIPS void
+f1 (unsigned char *ptr)
+{
+ *ptr = 0;
+}
+
+MICROMIPS void
+f2 (unsigned short *ptr)
+{
+ *ptr = 0;
+}
+
+MICROMIPS void
+f3 (unsigned int *ptr)
+{
+ *ptr = 0;
+}
+/* { dg-final { scan-assembler "\tsb\t\\\$0,0\\(\\\$\[0-9\]+\\)\[^\n\]*length = 2" } } */
+/* { dg-final { scan-assembler "\tsh\t\\\$0,0\\(\\\$\[0-9\]+\\)\[^\n\]*length = 2" } } */
+/* { dg-final { scan-assembler "\tsw\t\\\$0,0\\(\\\$\[0-9\]+\\)\[^\n\]*length = 2" } } */
diff --git a/gcc/testsuite/gcc.target/mips/unaligned-1.c b/gcc/testsuite/gcc.target/mips/unaligned-1.c
index 938f52d21f2..4888ca8b51f 100644
--- a/gcc/testsuite/gcc.target/mips/unaligned-1.c
+++ b/gcc/testsuite/gcc.target/mips/unaligned-1.c
@@ -1,4 +1,4 @@
-/* { dg-options "-mgp64" } */
+/* { dg-options "isa_rev<=5 -mgp64" } */
/* { dg-skip-if "code quality test" { *-*-* } { "-O0" } { "" } } */
/* { dg-final { scan-assembler-times "\tsdl\t" 1 } } */
/* { dg-final { scan-assembler-times "\tsdr\t" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
new file mode 100644
index 00000000000..ab85e9160a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-1.c
@@ -0,0 +1,35 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort();
+
+#define N 16
+
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[] __attribute__((aligned(16)))
+ = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
+signed char cc[] __attribute__((aligned(16)))
+ = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+int main ()
+{
+ signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
+ int i;
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != cd[i])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
new file mode 100644
index 00000000000..170649df608
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-10.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
new file mode 100644
index 00000000000..699b5baf404
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-11.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+int hey;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector int va, vb, vc, vd, tmp;
+ vector unsigned int threes = vec_splat_u32(3);
+ for (i = 0; i < N; i+=4) {
+ vb = vec_vsx_ld (0, &cb[i]);
+ vc = vec_vsx_ld (0, &cc[i]);
+ vd = vec_vsx_ld (0, &cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ hey = tmp[3];
+ vec_vsx_st (tmp, 0, &ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (hey != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
new file mode 100644
index 00000000000..529d03e64c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-12.c
@@ -0,0 +1,56 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include "altivec.h"
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+int hey;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector int va, vb, vc, vd, tmp;
+ vector unsigned int threes = vec_splat_u32(3);
+ for (i = 0; i < N; i+=4) {
+ vb = vec_vsx_ld (0, &cb[i]);
+ vc = vec_vsx_ld (0, &cc[i]);
+ vd = vec_vsx_ld (0, &cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ hey = tmp[3];
+ vec_vsx_st (tmp, 0, &ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (hey != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
new file mode 100644
index 00000000000..787b02e6427
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
@@ -0,0 +1,54 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+ if (x != ca[N-1])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
new file mode 100644
index 00000000000..7ca6ad5ccaf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "stxsdx" } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 } } */
+
+/* The only xxpermdi expected is for the vec_splats. */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
new file mode 100644
index 00000000000..172e4bd4cb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
@@ -0,0 +1,51 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "xxspltw" } } */
+
+/* Currently the analyze_swaps phase cannot optimize this loop because
+ of the presence of an UNSPEC_VSX_CVDPSPN. At such time as this is
+ handled, we need to add a 'scan-assembler-not "xxpermdi"' directive to
+ this test. */
+#include <altivec.h>
+void abort();
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
+
+ cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][1]), ca[i][1]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][2]), ca[i][2]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(cb[i][1][3]), ca[i][3]);
+
+ cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][1]), ca[i][1]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][2]), ca[i][2]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(cb[i][2][3]), ca[i][3]);
+
+ cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][1]), ca[i][1]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][2]), ca[i][2]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(cb[i][3][3]), ca[i][3]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c
new file mode 100644
index 00000000000..2b7f73c3715
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-16.c
@@ -0,0 +1,57 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "vspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector float brow;
+
+ for (i = 0; i < N; i++) {
+
+ brow = cb[i][0];
+ cc[i][0] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][1];
+ cc[i][1] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][1] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][2];
+ cc[i][2] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][2] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+
+ brow = cb[i][3];
+ cc[i][3] = vec_mul(vec_splats(brow[0]), ca[i][0]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[1]), ca[i][1]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[2]), ca[i][2]);
+ cc[i][3] = vec_madd(cc[i][0],vec_splats(brow[3]), ca[i][3]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
new file mode 100644
index 00000000000..7a9cfbf954e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-17.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O1" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "xxpermdi" } } */
+
+/* Verify that we don't try to do permute removal in the presence of
+ vec_ste. This used to ICE. */
+#include <altivec.h>
+
+void f (void *p)
+{
+ vector unsigned int u32 = vec_vsx_ld (1, (const unsigned int *)p);
+ vec_ste (u32, 1, (unsigned int *)p);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c
new file mode 100644
index 00000000000..6ce041ab519
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-2.c
@@ -0,0 +1,41 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 256
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = i - 128;
+ cc[i] = i/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != i - i/2 - 64)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
new file mode 100644
index 00000000000..35dacd4b578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i, ii;
+ for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
+ cb[i] = ii - 128;
+ cc[i] = ii/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i, ii;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i) {
+ ii = i % 128;
+ if (ca[i] != ii - ii/2 - 64)
+ abort ();
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
new file mode 100644
index 00000000000..61fe99b357b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-4.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = (cb[i] + cc[i]) * cd[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != -2 * i - 1955)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
new file mode 100644
index 00000000000..b367fb6b514
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-5.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = ((cb[i] + cc[i]) * cd[i]) >> 3;
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != (-2 * i - 1955) >> 3)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != (1955 + 2 * i) >> 3)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
new file mode 100644
index 00000000000..f7084529ce8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-6.c
@@ -0,0 +1,32 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort();
+
+#define N 16
+
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[] __attribute__((aligned(16)))
+ = {8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7};
+signed char cc[] __attribute__((aligned(16)))
+ = {1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+int main ()
+{
+ signed char cd[] = {7, 6, 4, 3, 1, 0, 0, -1, -1, -2, -2, -3, -3, -4, -4, -5};
+ int i;
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != cd[i])
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
new file mode 100644
index 00000000000..27a31b711ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-7.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 256
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = i - 128;
+ cc[i] = i/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != i - i/2 - 64)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
new file mode 100644
index 00000000000..7264d2586b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-8.c
@@ -0,0 +1,40 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+signed char ca[N] __attribute__((aligned(16)));
+signed char cb[N] __attribute__((aligned(16)));
+signed char cc[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = cb[i] - cc[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i, ii;
+ for (i = 0, ii = 0; i < N; ++i, ii = (ii + 1) % 128) {
+ cb[i] = ii - 128;
+ cc[i] = ii/2 - 64;
+ }
+}
+
+int main ()
+{
+ int i, ii;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i) {
+ ii = i % 128;
+ if (ca[i] != ii - ii/2 - 64)
+ abort ();
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c b/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
new file mode 100644
index 00000000000..cdca070e3d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-9.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+void abort ();
+
+#define N 4096
+int ca[N] __attribute__((aligned(16)));
+int cb[N] __attribute__((aligned(16)));
+int cc[N] __attribute__((aligned(16)));
+int cd[N] __attribute__((aligned(16)));
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ ca[i] = (cb[i] + cc[i]) * cd[i];
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i % 2 ? 1 : -1;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (i % 2 == 1 && ca[i] != -2 * i - 1955)
+ abort ();
+ else if (i % 2 == 0 && ca[i] != 1955 + 2 * i)
+ abort ();
+ return 0;
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 0f47c7f622f..8995fb0b378 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -955,6 +955,17 @@ proc check_effective_target_nomips16 { } {
}]
}
+# Return true if the target is a MIPS target that does not produce
+# micromips code.
+
+proc check_effective_target_nomicromips { } {
+ return [check_no_compiler_messages nomicromips object {
+ #ifdef __mips_micromips
+ #error MICROMIPS
+ #endif
+ }]
+}
+
# Add the options needed for MIPS16 function attributes. At the moment,
# we don't support MIPS16 PIC.
@@ -1324,6 +1335,42 @@ proc check_sse_hw_available { } {
}]
}
+# Return 1 if the target supports executing MSA instructions, 0
+# otherwise. Cache the result.
+
+proc check_msa_hw_available { } {
+ return [check_cached_effective_target msa_hw_available {
+ # If this is not the right target then we can skip the test.
+ if { !([istarget mips*-*-*]) } {
+ expr 0
+ } else {
+ check_runtime_nocache msa_hw_available {
+ #if !defined(__mips_msa)
+ #error "MSA NOT AVAIL"
+ #else
+ #if !(((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2))
+ #error "MSA NOT AVAIL FOR ISA REV < 2"
+ #endif
+ #if !defined(__mips_hard_float)
+ #error "MSA HARD_FLOAT REQUIRED"
+ #endif
+ #if __mips_fpr != 64
+ #error "MSA 64 FPR REQUIRED"
+ #endif
+ #include <msa.h>
+
+ int main()
+ {
+ v8i16 v = __builtin_msa_ldi_h (0);
+ v[0] = 0;
+ return v[0];
+ }
+ #endif
+ } "-mmsa"
+ }
+ }]
+}
+
# Return 1 if the target supports executing SSE2 instructions, 0
# otherwise. Cache the result.
@@ -1393,6 +1440,24 @@ proc check_effective_target_sse2_runtime { } {
return 0
}
+# Return 1 if the target supports running MSA executables, 0 otherwise.
+
+proc check_effective_target_msa_runtime { } {
+ if { [check_effective_target_mips_msa]
+ && [check_msa_hw_available] } {
+ return 1
+ }
+ return 0
+}
+
+# Return 1 if MSA is available and neither MIPS16 nor microMIPS code is
+# being generated.
+
+proc check_effective_target_mips_msa_nomips16_nomicromips { } {
+  return [expr { [check_effective_target_mips_msa]
+		 && [check_effective_target_nomips16]
+		 && [check_effective_target_nomicromips] }]
+}
+
# Return 1 if the target supports running AVX executables, 0 otherwise.
proc check_effective_target_avx_runtime { } {
@@ -2122,7 +2187,8 @@ proc check_effective_target_vect_int { } {
|| [istarget aarch64*-*-*]
|| [check_effective_target_arm32]
|| ([istarget mips*-*-*]
- && [check_effective_target_mips_loongson]) } {
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mips_loongson])) } {
set et_vect_int_saved 1
}
}
@@ -2146,7 +2212,9 @@ proc check_effective_target_vect_intfloat_cvt { } {
&& ![istarget powerpc-*-linux*paired*])
|| [istarget x86_64-*-*]
|| ([istarget arm*-*-*]
- && [check_effective_target_arm_neon_ok])} {
+ && [check_effective_target_arm_neon_ok])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_intfloat_cvt_saved 1
}
}
@@ -2185,7 +2253,9 @@ proc check_effective_target_vect_uintfloat_cvt { } {
|| [istarget x86_64-*-*]
|| [istarget aarch64*-*-*]
|| ([istarget arm*-*-*]
- && [check_effective_target_arm_neon_ok])} {
+ && [check_effective_target_arm_neon_ok])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_uintfloat_cvt_saved 1
}
}
@@ -2210,7 +2280,9 @@ proc check_effective_target_vect_floatint_cvt { } {
&& ![istarget powerpc-*-linux*paired*])
|| [istarget x86_64-*-*]
|| ([istarget arm*-*-*]
- && [check_effective_target_arm_neon_ok])} {
+ && [check_effective_target_arm_neon_ok])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_floatint_cvt_saved 1
}
}
@@ -2232,7 +2304,9 @@ proc check_effective_target_vect_floatuint_cvt { } {
if { ([istarget powerpc*-*-*]
&& ![istarget powerpc-*-linux*paired*])
|| ([istarget arm*-*-*]
- && [check_effective_target_arm_neon_ok])} {
+ && [check_effective_target_arm_neon_ok])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_floatuint_cvt_saved 1
}
}
@@ -2946,6 +3020,33 @@ proc check_effective_target_mips_loongson { } {
}]
}
+# Return 1 if an MSA program can be compiled to an object file.
+proc check_effective_target_mips_msa { } {
+ return [check_no_compiler_messages msa object {
+ #if !defined(__mips_msa)
+ #error "MSA NOT AVAIL"
+ #else
+ #if !(((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2))
+ #error "MSA NOT AVAIL FOR ISA REV < 2"
+ #endif
+ #if !defined(__mips_hard_float)
+ #error "MSA HARD_FLOAT REQUIRED"
+ #endif
+ #if __mips_fpr != 64
+ #error "MSA 64 FPR REQUIRED"
+ #endif
+ #include <msa.h>
+
+ int main()
+ {
+ v8i16 v = __builtin_msa_ldi_h (1);
+
+ return v[0];
+ }
+ #endif
+ } "-mmsa" ]
+}
+
# Return 1 if this is an ARM target that adheres to the ABI for the ARM
# Architecture.
@@ -3339,7 +3440,8 @@ proc check_effective_target_vect_shift { } {
|| [istarget aarch64*-*-*]
|| [check_effective_target_arm32]
|| ([istarget mips*-*-*]
- && [check_effective_target_mips_loongson]) } {
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mips_loongson])) } {
set et_vect_shift_saved 1
}
}
@@ -3359,7 +3461,9 @@ proc check_effective_target_vect_shift_char { } {
set et_vect_shift_char_saved 0
if { ([istarget powerpc*-*-*]
&& ![istarget powerpc-*-linux*paired*])
- || [check_effective_target_arm32] } {
+ || [check_effective_target_arm32]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_shift_char_saved 1
}
}
@@ -3379,7 +3483,9 @@ proc check_effective_target_vect_long { } {
&& [check_effective_target_ilp32])
|| [istarget x86_64-*-*]
|| [check_effective_target_arm32]
- || ([istarget sparc*-*-*] && [check_effective_target_ilp32]) } {
+ || ([istarget sparc*-*-*] && [check_effective_target_ilp32])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set answer 1
} else {
set answer 0
@@ -3408,6 +3514,8 @@ proc check_effective_target_vect_float { } {
|| [istarget x86_64-*-*]
|| [istarget ia64-*-*]
|| [istarget aarch64*-*-*]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips])
|| [check_effective_target_arm32] } {
set et_vect_float_saved 1
}
@@ -3442,6 +3550,9 @@ proc check_effective_target_vect_double { } {
}
} elseif { [istarget spu-*-*] } {
set et_vect_double_saved 1
+ } elseif { [istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips] } {
+ set et_vect_double_saved 1
}
}
@@ -3461,7 +3572,9 @@ proc check_effective_target_vect_long_long { } {
} else {
set et_vect_long_long_saved 0
if { [istarget i?86-*-*]
- || [istarget x86_64-*-*] } {
+ || [istarget x86_64-*-*]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_long_long_saved 1
}
}
@@ -3487,7 +3600,8 @@ proc check_effective_target_vect_no_int_max { } {
|| [istarget spu-*-*]
|| [istarget alpha*-*-*]
|| ([istarget mips*-*-*]
- && [check_effective_target_mips_loongson]) } {
+ && (![check_effective_target_mips_msa_nomips16_nomicromips])
+ && [check_effective_target_mips_loongson]) } {
set et_vect_no_int_max_saved 1
}
}
@@ -3553,8 +3667,9 @@ proc check_effective_target_vect_perm { } {
|| [istarget i?86-*-*]
|| [istarget x86_64-*-*]
|| ([istarget mips*-*-*]
- && [check_effective_target_mpaired_single]) } {
- set et_vect_perm_saved 1
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mpaired_single])) } {
+ set et_vect_perm_saved 1
}
}
verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
@@ -3578,7 +3693,9 @@ proc check_effective_target_vect_perm_byte { } {
|| ([istarget aarch64*-*-*]
&& [is-effective-target aarch64_little_endian])
|| [istarget powerpc*-*-*]
- || [istarget spu-*-*] } {
+ || [istarget spu-*-*]
+	 || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_perm_byte_saved 1
}
}
@@ -3603,7 +3720,9 @@ proc check_effective_target_vect_perm_short { } {
|| ([istarget aarch64*-*-*]
&& [is-effective-target aarch64_little_endian])
|| [istarget powerpc*-*-*]
- || [istarget spu-*-*] } {
+ || [istarget spu-*-*]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_perm_short_saved 1
}
}
@@ -3981,7 +4100,9 @@ proc check_effective_target_vect_pack_trunc { } {
|| [istarget aarch64*-*-*]
|| [istarget spu-*-*]
|| ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]
- && [check_effective_target_arm_little_endian]) } {
+ && [check_effective_target_arm_little_endian])
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa]) } {
set et_vect_pack_trunc_saved 1
}
}
@@ -4007,6 +4128,8 @@ proc check_effective_target_vect_unpack { } {
|| [istarget spu-*-*]
|| [istarget ia64-*-*]
|| [istarget aarch64*-*-*]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips])
|| ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]
&& [check_effective_target_arm_little_endian]) } {
set et_vect_unpack_saved 1
@@ -4051,7 +4174,8 @@ proc check_effective_target_vect_no_align { } {
|| [istarget ia64-*-*]
|| [check_effective_target_arm_vect_no_misalign]
|| ([istarget mips*-*-*]
- && [check_effective_target_mips_loongson]) } {
+ && ![check_effective_target_mips_msa]
+ && [check_effective_target_mips_loongson]) } {
set et_vect_no_align_saved 1
}
}
@@ -4072,7 +4196,8 @@ proc check_effective_target_vect_hw_misalign { } {
set et_vect_hw_misalign_saved 0
if { ([istarget x86_64-*-*]
|| [istarget aarch64*-*-*]
- || [istarget i?86-*-*]) } {
+ || [istarget i?86-*-*])
+	  || ([istarget mips*-*-*]
+	      && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_hw_misalign_saved 1
}
}
@@ -4247,6 +4372,8 @@ proc check_effective_target_vect_condition { } {
|| [istarget i?86-*-*]
|| [istarget spu-*-*]
|| [istarget x86_64-*-*]
+	 || ([istarget mips*-*-*]
+	     && [check_effective_target_mips_msa_nomips16_nomicromips])
|| ([istarget arm*-*-*] && [check_effective_target_arm_neon_ok]) } {
set et_vect_cond_saved 1
}
@@ -4268,7 +4395,9 @@ proc check_effective_target_vect_cond_mixed { } {
set et_vect_cond_mixed_saved 0
if { [istarget i?86-*-*]
|| [istarget x86_64-*-*]
- || [istarget powerpc*-*-*] } {
+ || [istarget powerpc*-*-*]
+	 || ([istarget mips*-*-*]
+	     && [check_effective_target_mips_msa_nomips16_nomicromips]) } {
set et_vect_cond_mixed_saved 1
}
}
@@ -4316,7 +4445,8 @@ proc check_effective_target_vect_short_mult { } {
|| [istarget aarch64*-*-*]
|| [check_effective_target_arm32]
|| ([istarget mips*-*-*]
- && [check_effective_target_mips_loongson]) } {
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mips_loongson])) } {
set et_vect_short_mult_saved 1
}
}
@@ -4340,6 +4470,8 @@ proc check_effective_target_vect_int_mult { } {
|| [istarget x86_64-*-*]
|| [istarget ia64-*-*]
|| [istarget aarch64*-*-*]
+ || ([istarget mips*-*-*]
+ && [check_effective_target_mips_msa_nomips16_nomicromips])
|| [check_effective_target_arm32] } {
set et_vect_int_mult_saved 1
}
@@ -4366,7 +4498,8 @@ proc check_effective_target_vect_extract_even_odd { } {
|| [istarget ia64-*-*]
|| [istarget spu-*-*]
|| ([istarget mips*-*-*]
- && [check_effective_target_mpaired_single]) } {
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mpaired_single])) } {
set et_vect_extract_even_odd_saved 1
}
}
@@ -4392,7 +4525,8 @@ proc check_effective_target_vect_interleave { } {
|| [istarget ia64-*-*]
|| [istarget spu-*-*]
|| ([istarget mips*-*-*]
- && [check_effective_target_mpaired_single]) } {
+ && ([check_effective_target_mips_msa_nomips16_nomicromips]
+ || [check_effective_target_mpaired_single])) } {
set et_vect_interleave_saved 1
}
}
@@ -5647,6 +5781,7 @@ proc check_effective_target_mempcpy {} {
proc check_vect_support_and_set_flags { } {
global DEFAULT_VECTCFLAGS
+ global MULTI_VECTCFLAGS
global dg-do-what-default
if [istarget powerpc-*paired*] {
@@ -5687,14 +5822,25 @@ proc check_vect_support_and_set_flags { } {
} else {
set dg-do-what-default compile
}
- } elseif { [istarget mips*-*-*]
- && ([check_effective_target_mpaired_single]
+ } elseif { [istarget mips*-*-*] } {
+ if { 0 && ([check_effective_target_mpaired_single]
|| [check_effective_target_mips_loongson])
- && [check_effective_target_nomips16] } {
- if { [check_effective_target_mpaired_single] } {
- lappend DEFAULT_VECTCFLAGS "-mpaired-single"
+ && [check_effective_target_nomips16]
+ && [check_effective_target_mpaired_single] } {
+ lappend MULTI_VECTCFLAGS "-mpaired-single"
+ set dg-do-what-default run
+ }
+ if { ([check_effective_target_mips_msa_nomips16_nomicromips]) } {
+ lappend MULTI_VECTCFLAGS "-mmsa"
+
+ if { [check_effective_target_msa_runtime] } {
+ set dg-do-what-default run
+ } else {
+ set dg-do-what-default compile
+ }
+ } else {
+ return 0
}
- set dg-do-what-default run
} elseif [istarget sparc*-*-*] {
lappend DEFAULT_VECTCFLAGS "-mcpu=ultrasparc" "-mvis"
if [check_effective_target_ultrasparc_hw] {
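For illustration only, not part of the patch: once the mips_msa_nomips16_nomicromips
effective target exists, a test can be gated on it with the usual dejagnu selector.
The test body and names below are hypothetical; only the effective-target keyword
and the -mmsa option come from this patch.

/* { dg-do compile { target mips_msa_nomips16_nomicromips } } */
/* { dg-options "-mmsa -O2 -ftree-vectorize" } */

int a[64], b[64], c[64];

/* A trivially vectorizable loop; with MSA enabled the vectorizer should be
   able to use 128-bit vector adds here.  */
void
add_arrays (void)
{
  int i;
  for (i = 0; i < 64; i++)
    a[i] = b[i] + c[i];
}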
diff --git a/gcc/toplev.c b/gcc/toplev.c
index 21e07bc8e9b..15f39c52dbf 100644
--- a/gcc/toplev.c
+++ b/gcc/toplev.c
@@ -2004,7 +2004,7 @@ toplev_main (int argc, char **argv)
enough to default flags appropriately. */
decode_options (&global_options, &global_options_set,
save_decoded_options, save_decoded_options_count,
- UNKNOWN_LOCATION, global_dc);
+ UNKNOWN_LOCATION, global_dc, true);
handle_common_deferred_options ();
diff --git a/gcc/tree-core.h b/gcc/tree-core.h
index bb89feee1be..77dca96830a 100644
--- a/gcc/tree-core.h
+++ b/gcc/tree-core.h
@@ -766,6 +766,16 @@ struct GTY(()) tree_base {
int length;
/* SSA version number. This field is only used with SSA_NAME. */
unsigned int version;
+
+ /* The following two fields are used for MEM_REF and TARGET_MEM_REF
+     expression trees and specify known data non-dependences.  Two
+     memory references in a function are known not to alias if their
+     dependence_info.clique values are equal and their
+     dependence_info.base values are distinct.  */
+ struct {
+ unsigned short clique;
+ unsigned short base;
+ } dependence_info;
} GTY((skip(""))) u;
};
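A minimal sketch of how these fields are meant to be read (illustration only;
the helper name is made up, but the MR_DEPENDENCE_CLIQUE / MR_DEPENDENCE_BASE
accessors are the ones the rest of this patch uses in tree-data-ref.c and
tree-ssa-alias.c):

/* Two MEM_REF/TARGET_MEM_REF trees are known not to alias when they carry
   the same non-zero clique but distinct bases; clique 0 means no dependence
   information is recorded.  */
static bool
known_independent_p (tree a, tree b)
{
  return (MR_DEPENDENCE_CLIQUE (a) != 0
	  && MR_DEPENDENCE_CLIQUE (a) == MR_DEPENDENCE_CLIQUE (b)
	  && MR_DEPENDENCE_BASE (a) != MR_DEPENDENCE_BASE (b));
}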
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 808daf70b39..bd681d70b73 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -1000,9 +1000,12 @@ dr_analyze_indices (struct data_reference *dr, loop_p nest, loop_p loop)
guaranteed.
As a band-aid, mark the access so we can special-case
it in dr_may_alias_p. */
+ tree old = ref;
ref = fold_build2_loc (EXPR_LOCATION (ref),
MEM_REF, TREE_TYPE (ref),
base, memoff);
+ MR_DEPENDENCE_CLIQUE (ref) = MR_DEPENDENCE_CLIQUE (old);
+ MR_DEPENDENCE_BASE (ref) = MR_DEPENDENCE_BASE (old);
access_fns.safe_push (access_fn);
}
}
@@ -1409,6 +1412,12 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
return false;
}
+ if ((TREE_CODE (addr_a) == MEM_REF || TREE_CODE (addr_a) == TARGET_MEM_REF)
+ && (TREE_CODE (addr_b) == MEM_REF || TREE_CODE (addr_b) == TARGET_MEM_REF)
+ && MR_DEPENDENCE_CLIQUE (addr_a) == MR_DEPENDENCE_CLIQUE (addr_b)
+ && MR_DEPENDENCE_BASE (addr_a) != MR_DEPENDENCE_BASE (addr_b))
+ return false;
+
/* If we had an evolution in a pointer-based MEM_REF BASE_OBJECT we
do not know the size of the base-object. So we cannot do any
offset/overlap based analysis but have to rely on points-to
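A source-level example of the situation these checks target (illustrative only;
whether the restrict machinery actually assigns cliques here depends on parts
of the patch not shown in this hunk):

void
scale (int *__restrict a, int *__restrict b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = 2 * b[i];	/* Accesses through a and b can be placed in the same
			   clique with distinct bases, letting the data-ref
			   analysis above report them as independent.  */
}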
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index 7ae5d64927b..aa5d0f1cb6f 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -787,6 +787,28 @@ is_parm (tree decl)
return (TREE_CODE (decl) == PARM_DECL);
}
+/* Remap the dependence CLIQUE from the source to the destination function
+ as specified in ID. */
+
+static unsigned short
+remap_dependence_clique (copy_body_data *id, unsigned short clique)
+{
+ if (clique == 0)
+ return 0;
+ if (!id->dependence_map)
+ id->dependence_map = pointer_map_create ();
+ void **newc = pointer_map_contains (id->dependence_map,
+ (void *)(uintptr_t)clique);
+ if (!newc)
+ {
+ newc = pointer_map_insert (id->dependence_map,
+ (void *)(uintptr_t)clique);
+ *newc = (void *)(uintptr_t)++cfun->last_clique;
+ }
+
+ return (uintptr_t)*newc;
+}
+
/* Remap the GIMPLE operand pointed to by *TP. DATA is really a
'struct walk_stmt_info *'. DATA->INFO is a 'copy_body_data *'.
WALK_SUBTREES is used to indicate walk_gimple_op whether to keep
@@ -886,6 +908,12 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data)
TREE_THIS_VOLATILE (*tp) = TREE_THIS_VOLATILE (old);
TREE_SIDE_EFFECTS (*tp) = TREE_SIDE_EFFECTS (old);
TREE_NO_WARNING (*tp) = TREE_NO_WARNING (old);
+ if (MR_DEPENDENCE_CLIQUE (old) != 0)
+ {
+ MR_DEPENDENCE_CLIQUE (*tp)
+ = remap_dependence_clique (id, MR_DEPENDENCE_CLIQUE (old));
+ MR_DEPENDENCE_BASE (*tp) = MR_DEPENDENCE_BASE (old);
+ }
/* We cannot propagate the TREE_THIS_NOTRAP flag if we have
remapped a parameter as the property might be valid only
for the parameter itself. */
@@ -1139,6 +1167,12 @@ copy_tree_body_r (tree *tp, int *walk_subtrees, void *data)
TREE_THIS_VOLATILE (*tp) = TREE_THIS_VOLATILE (old);
TREE_SIDE_EFFECTS (*tp) = TREE_SIDE_EFFECTS (old);
TREE_NO_WARNING (*tp) = TREE_NO_WARNING (old);
+ if (MR_DEPENDENCE_CLIQUE (old) != 0)
+ {
+ MR_DEPENDENCE_CLIQUE (*tp)
+ = remap_dependence_clique (id, MR_DEPENDENCE_CLIQUE (old));
+ MR_DEPENDENCE_BASE (*tp) = MR_DEPENDENCE_BASE (old);
+ }
/* We cannot propagate the TREE_THIS_NOTRAP flag if we have
remapped a parameter as the property might be valid only
for the parameter itself. */
@@ -2598,6 +2632,11 @@ copy_cfg_body (copy_body_data * id, gcov_type count, int frequency_scale,
pointer_map_destroy (id->eh_map);
id->eh_map = NULL;
}
+ if (id->dependence_map)
+ {
+ pointer_map_destroy (id->dependence_map);
+ id->dependence_map = NULL;
+ }
return new_fndecl;
}
@@ -4953,6 +4992,11 @@ copy_gimple_seq_and_replace_locals (gimple_seq seq)
pointer_map_destroy (id.decl_map);
if (id.debug_map)
pointer_map_destroy (id.debug_map);
+ if (id.dependence_map)
+ {
+ pointer_map_destroy (id.dependence_map);
+ id.dependence_map = NULL;
+ }
return copy;
}
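A worked example of the clique remapping above, with numbers chosen only for
illustration (they follow directly from the code):

/* Suppose the caller's cfun->last_clique is 5 before inlining and the callee
   body carries cliques 2 and 7.  Then:

     remap_dependence_clique (id, 2)   assigns and returns 6
     remap_dependence_clique (id, 7)   assigns and returns 7
     remap_dependence_clique (id, 2)   returns 6 again from the cached map
     remap_dependence_clique (id, 0)   returns 0 (no dependence info)

   so cliques copied from the inlined body never collide with cliques already
   in use in the caller.  */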
diff --git a/gcc/tree-inline.h b/gcc/tree-inline.h
index 13c551666dd..8ad47f10172 100644
--- a/gcc/tree-inline.h
+++ b/gcc/tree-inline.h
@@ -135,6 +135,10 @@ struct copy_body_data
/* Cilk keywords currently need to replace some variables that
ordinary nested functions do not. */
bool remap_var_for_cilk;
+
+  /* A map from the inlined function's dependence-info cliques to
+     their equivalents in the function into which it is being inlined.  */
+ struct pointer_map_t *dependence_map;
};
/* Weights of constructions for estimate_num_insns. */
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index c3ec6c02d83..68530c9eb1d 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -1078,7 +1078,9 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
/* Same value types ignoring qualifiers. */
&& (TYPE_MAIN_VARIANT (TREE_TYPE (node))
== TYPE_MAIN_VARIANT
- (TREE_TYPE (TREE_TYPE (TREE_OPERAND (node, 1))))))
+ (TREE_TYPE (TREE_TYPE (TREE_OPERAND (node, 1)))))
+ && (!(flags & TDF_ALIAS)
+ || MR_DEPENDENCE_CLIQUE (node) == 0))
{
if (TREE_CODE (TREE_OPERAND (node, 0)) != ADDR_EXPR)
{
@@ -1109,6 +1111,14 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
dump_generic_node (buffer, TREE_OPERAND (node, 1),
spc, flags, false);
}
+ if ((flags & TDF_ALIAS)
+ && MR_DEPENDENCE_CLIQUE (node) != 0)
+ {
+ pp_string (buffer, " clique ");
+ pp_unsigned_wide_integer (buffer, MR_DEPENDENCE_CLIQUE (node));
+ pp_string (buffer, " base ");
+ pp_unsigned_wide_integer (buffer, MR_DEPENDENCE_BASE (node));
+ }
pp_right_bracket (buffer);
}
break;
@@ -1434,7 +1444,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
/* Same value types ignoring qualifiers. */
&& (TYPE_MAIN_VARIANT (TREE_TYPE (op0))
== TYPE_MAIN_VARIANT
- (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 1))))))))
+ (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 1)))))
+ && MR_DEPENDENCE_CLIQUE (op0) == 0)))
{
op0 = TREE_OPERAND (op0, 0);
str = "->";
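For reference, and only as an approximation since the exact rendering depends
on the dump flags: with an -alias dump (TDF_ALIAS set), a MEM_REF carrying
dependence info would now print roughly as

  _3 = MEM[(int *)p_1 clique 1 base 2];

where previously the clique/base pair was not shown and the reference could
even be folded into plain pointer syntax.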
diff --git a/gcc/tree-profile.c b/gcc/tree-profile.c
index d3fe66f2052..9504da45cd2 100644
--- a/gcc/tree-profile.c
+++ b/gcc/tree-profile.c
@@ -264,6 +264,9 @@ static tree GTY(()) gcov_lipo_strict_inclusion = NULL_TREE;
/* extern gcov_unsigned_t __gcov_lipo_comdat_algorithm */
static tree GTY(()) gcov_lipo_comdat_algorithm = NULL_TREE;
+/* extern gcov_unsigned_t __gcov_lipo_sampling_period */
+static tree GTY(()) gcov_lipo_sampling_period = NULL_TREE;
+
/* Insert STMT_IF around given sequence of consecutive statements in the
same basic block starting with STMT_START, ending with STMT_END.
PROB is the probability of the taken branch. */
@@ -508,6 +511,13 @@ tree_init_dyn_ipa_parameters (void)
get_gcov_unsigned_t ());
init_comdat_decl (gcov_lipo_comdat_algorithm,
PARAM_LIPO_COMDAT_ALGORITHM);
+ gcov_lipo_sampling_period = build_decl (
+ UNKNOWN_LOCATION,
+ VAR_DECL,
+ get_identifier ("__gcov_lipo_sampling_period"),
+ get_gcov_unsigned_t ());
+ init_comdat_decl (gcov_lipo_sampling_period,
+ PARAM_LIPO_SAMPLING_PERIOD);
}
}
@@ -854,8 +864,20 @@ gimple_gen_edge_profiler (int edgeno, edge e)
{
gimple call;
tree tree_edgeno = build_int_cst (gcov_type_node, edgeno);
- tree tree_uid = build_int_cst (gcov_type_node,
+
+ tree tree_uid;
+ if (PARAM_VALUE (PARAM_PROFILE_FUNC_INTERNAL_ID))
+ {
+ tree_uid = build_int_cst (gcov_type_node,
current_function_funcdef_no);
+ }
+ else
+ {
+ gcc_assert (coverage_node_map_initialized_p ());
+
+ tree_uid = build_int_cst
+ (gcov_type_node, cgraph_get_node (current_function_decl)->profile_id);
+ }
tree callback_fn_type
= build_function_type_list (void_type_node,
gcov_type_node,
diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
index de3fd05985b..9a02de39932 100644
--- a/gcc/tree-ssa-alias.c
+++ b/gcc/tree-ssa-alias.c
@@ -1405,7 +1405,36 @@ refs_may_alias_p_1 (ao_ref *ref1, ao_ref *ref2, bool tbaa_p)
ao_ref_alias_set (ref1),
ao_ref_base_alias_set (ref1),
tbaa_p);
- else if (ind1_p && ind2_p)
+
+ /* Handle restrict based accesses.
+ ??? ao_ref_base strips inner MEM_REF [&decl], recover from that
+ here. */
+ tree rbase1 = base1;
+ tree rbase2 = base2;
+ if (var1_p)
+ {
+ rbase1 = ref1->ref;
+ if (rbase1)
+ while (handled_component_p (rbase1))
+ rbase1 = TREE_OPERAND (rbase1, 0);
+ }
+ if (var2_p)
+ {
+ rbase2 = ref2->ref;
+ if (rbase2)
+ while (handled_component_p (rbase2))
+ rbase2 = TREE_OPERAND (rbase2, 0);
+ }
+ if (rbase1 && rbase2
+ && (TREE_CODE (base1) == MEM_REF || TREE_CODE (base1) == TARGET_MEM_REF)
+ && (TREE_CODE (base2) == MEM_REF || TREE_CODE (base2) == TARGET_MEM_REF)
+ /* If the accesses are in the same restrict clique... */
+ && MR_DEPENDENCE_CLIQUE (base1) == MR_DEPENDENCE_CLIQUE (base2)
+ /* But based on different pointers they do not alias. */
+ && MR_DEPENDENCE_BASE (base1) != MR_DEPENDENCE_BASE (base2))
+ return false;
+
+ if (ind1_p && ind2_p)
return indirect_refs_may_alias_p (ref1->ref, base1,
offset1, max_size1,
ao_ref_alias_set (ref1), -1,
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 78f036ebd06..c5a5dd48ac3 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -5863,6 +5863,108 @@ iv_ca_prune (struct ivopts_data *data, struct iv_ca *ivs,
return best_cost;
}
+/* Check if CAND_IDX is a candidate other than OLD_CAND and has
+ cheaper local cost for USE than BEST_CP. Return pointer to
+ the corresponding cost_pair, otherwise just return BEST_CP. */
+
+static struct cost_pair*
+cheaper_cost_with_cand (struct ivopts_data *data, struct iv_use *use,
+ unsigned int cand_idx, struct iv_cand *old_cand,
+ struct cost_pair *best_cp)
+{
+ struct iv_cand *cand;
+ struct cost_pair *cp;
+
+ gcc_assert (old_cand != NULL && best_cp != NULL);
+ if (cand_idx == old_cand->id)
+ return best_cp;
+
+ cand = iv_cand (data, cand_idx);
+ cp = get_use_iv_cost (data, use, cand);
+ if (cp != NULL && cheaper_cost_pair (cp, best_cp))
+ return cp;
+
+ return best_cp;
+}
+
+/* Try breaking the locally optimal fixed point for IVS by replacing
+   candidates that are used by more than one iv use.  For each such
+   candidate, this function tries to represent the iv uses under it
+   using other candidates with lower local cost, then tries to prune
+   the new set.  If the new set has lower cost, it returns the new
+   cost after recording the candidate replacement in list DELTA.  */
+
+static comp_cost
+iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
+ struct iv_ca_delta **delta)
+{
+ bitmap_iterator bi, bj;
+ unsigned int i, j, k;
+ struct iv_use *use;
+ struct iv_cand *cand;
+ comp_cost orig_cost, acost;
+ struct iv_ca_delta *act_delta, *tmp_delta;
+ struct cost_pair *old_cp, *best_cp = NULL;
+
+ *delta = NULL;
+ orig_cost = iv_ca_cost (ivs);
+
+ EXECUTE_IF_SET_IN_BITMAP (ivs->cands, 0, i, bi)
+ {
+ if (ivs->n_cand_uses[i] == 1
+ || ivs->n_cand_uses[i] > ALWAYS_PRUNE_CAND_SET_BOUND)
+ continue;
+
+ cand = iv_cand (data, i);
+
+ act_delta = NULL;
+ /* Represent uses under current candidate using other ones with
+ lower local cost. */
+ for (j = 0; j < ivs->upto; j++)
+ {
+ use = iv_use (data, j);
+ old_cp = iv_ca_cand_for_use (ivs, use);
+
+ if (old_cp->cand != cand)
+ continue;
+
+ best_cp = old_cp;
+ if (data->consider_all_candidates)
+ for (k = 0; k < n_iv_cands (data); k++)
+ best_cp = cheaper_cost_with_cand (data, use, k,
+ old_cp->cand, best_cp);
+ else
+ EXECUTE_IF_SET_IN_BITMAP (use->related_cands, 0, k, bj)
+ best_cp = cheaper_cost_with_cand (data, use, k,
+ old_cp->cand, best_cp);
+
+ if (best_cp == old_cp)
+ continue;
+
+ act_delta = iv_ca_delta_add (use, old_cp, best_cp, act_delta);
+ }
+ /* No need for further prune. */
+ if (!act_delta)
+ continue;
+
+ /* Prune the new candidate set. */
+ iv_ca_delta_commit (data, ivs, act_delta, true);
+ acost = iv_ca_prune (data, ivs, NULL, &tmp_delta);
+ iv_ca_delta_commit (data, ivs, act_delta, false);
+ act_delta = iv_ca_delta_join (act_delta, tmp_delta);
+
+ if (compare_costs (acost, orig_cost) < 0)
+ {
+ *delta = act_delta;
+ return acost;
+ }
+ else
+ iv_ca_delta_free (&act_delta);
+ }
+
+ return orig_cost;
+}
+
/* Tries to extend the sets IVS in the best possible way in order
to express the USE. If ORIGINALP is true, prefer candidates from
the original set of IVs, otherwise favor important candidates not
@@ -6005,10 +6107,13 @@ get_initial_solution (struct ivopts_data *data, bool originalp)
return ivs;
}
-/* Tries to improve set of induction variables IVS. */
+/* Tries to improve set of induction variables IVS. TRY_REPLACE_P
+ points to a bool variable, this function tries to break local
+ optimal fixed-point by replacing candidates in IVS if it's true. */
static bool
-try_improve_iv_set (struct ivopts_data *data, struct iv_ca *ivs)
+try_improve_iv_set (struct ivopts_data *data,
+ struct iv_ca *ivs, bool *try_replace_p)
{
unsigned i, n_ivs;
comp_cost acost, best_cost = iv_ca_cost (ivs);
@@ -6052,7 +6157,20 @@ try_improve_iv_set (struct ivopts_data *data, struct iv_ca *ivs)
/* Try removing the candidates from the set instead. */
best_cost = iv_ca_prune (data, ivs, NULL, &best_delta);
- /* Nothing more we can do. */
+ if (!best_delta && *try_replace_p)
+ {
+ *try_replace_p = false;
+ /* So far the candidate-selecting algorithm tends to choose fewer IVs
+ so that it can handle cases in which loops have many variables
+ but the best choice is often to use only one general biv. One
+ weakness is that it can't handle the opposite case, in which different
+ candidates should be chosen with respect to each use. To solve
+ the problem, we replace candidates in the manner described in the
+ comment before iv_ca_replace, thus giving the general algorithm a chance
+ to break the local optimal fixed-point in these cases. */
+ best_cost = iv_ca_replace (data, ivs, &best_delta);
+ }
+
if (!best_delta)
return false;
}
@@ -6071,6 +6189,7 @@ static struct iv_ca *
find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
{
struct iv_ca *set;
+ bool try_replace_p = true;
/* Get the initial solution. */
set = get_initial_solution (data, originalp);
@@ -6087,7 +6206,7 @@ find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp)
iv_ca_dump (data, dump_file, set);
}
- while (try_improve_iv_set (data, set))
+ while (try_improve_iv_set (data, set, &try_replace_p))
{
if (dump_file && (dump_flags & TDF_DETAILS))
{
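A hedged illustration of the case iv_ca_replace targets (hypothetical example, not taken from the patch or its testsuite): when different uses prefer different candidates, the greedy selection can settle on one shared general IV, and replacing that shared candidate as described above can lower the overall cost.

  /* Hypothetical: the byte and the short stream have different strides,
     so on some targets a separate address candidate per use is cheaper
     than addressing both arrays through a single shared counter.  */
  void
  widen_copy (short *dst, const char *src, long n)
  {
    long i;
    for (i = 0; i < n; i++)
      dst[i] = src[i];
  }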
diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
index 897b8f51895..8fb72b6a50c 100644
--- a/gcc/tree-ssa-loop-niter.c
+++ b/gcc/tree-ssa-loop-niter.c
@@ -2727,6 +2727,7 @@ record_nonwrapping_iv (struct loop *loop, tree base, tree step, gimple stmt,
tree niter_bound, extreme, delta;
tree type = TREE_TYPE (base), unsigned_type;
double_int max;
+ tree orig_base = base;
if (TREE_CODE (step) != INTEGER_CST || integer_zerop (step))
return;
@@ -2750,7 +2751,14 @@ record_nonwrapping_iv (struct loop *loop, tree base, tree step, gimple stmt,
if (tree_int_cst_sign_bit (step))
{
+ double_int min, max;
extreme = fold_convert (unsigned_type, low);
+ if (TREE_CODE (orig_base) == SSA_NAME
+ && TREE_CODE (high) == INTEGER_CST
+ && INTEGRAL_TYPE_P (TREE_TYPE (orig_base))
+ && get_range_info (orig_base, &min, &max) == VR_RANGE
+ && max.slt (TREE_INT_CST (high)))
+ base = double_int_to_tree (unsigned_type, max);
if (TREE_CODE (base) != INTEGER_CST)
base = fold_convert (unsigned_type, high);
delta = fold_build2 (MINUS_EXPR, unsigned_type, base, extreme);
@@ -2758,8 +2766,15 @@ record_nonwrapping_iv (struct loop *loop, tree base, tree step, gimple stmt,
}
else
{
+ double_int min, max;
extreme = fold_convert (unsigned_type, high);
- if (TREE_CODE (base) != INTEGER_CST)
+ if (TREE_CODE (orig_base) == SSA_NAME
+ && TREE_CODE (low) == INTEGER_CST
+ && INTEGRAL_TYPE_P (TREE_TYPE (orig_base))
+ && get_range_info (orig_base, &min, &max) == VR_RANGE
+ && min.sgt (TREE_INT_CST (low)))
+ base = double_int_to_tree (unsigned_type, min);
+ else if (TREE_CODE (base) != INTEGER_CST)
base = fold_convert (unsigned_type, low);
delta = fold_build2 (MINUS_EXPR, unsigned_type, extreme, base);
}
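A sketch of the situation the record_nonwrapping_iv hunks help with (hypothetical example, assuming VRP has recorded a range for n): when the starting value of the IV is an SSA name whose recorded VR_RANGE is tighter than the type bound, that range limit is now used to form the delta, giving a smaller bound on the number of iterations.

  /* Hypothetical: the guard gives n a known upper bound of 1000, so the
     decreasing IV i can be bounded using 1000 rather than the type maximum.  */
  void
  clear_prefix (int *a, int n)
  {
    int i;
    if (n > 1000)
      return;
    for (i = n; i > 0; i--)
      a[i - 1] = 0;
  }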
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index abc99ba377a..f44667b5789 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -53,6 +53,10 @@
#include "splay-tree.h"
#include "params.h"
#include "alias.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "tree-pretty-print.h"
+#include "gimple-walk.h"
/* The idea behind this analyzer is to generate set constraints from the
program, then solve the resulting constraints in order to generate the
@@ -273,12 +277,19 @@ struct variable_info
/* True if this field has only restrict qualified pointers. */
unsigned int only_restrict_pointers : 1;
+ /* True if this represents a heap var created for a restrict qualified
+ pointer. */
+ unsigned int is_restrict_var : 1;
+
/* True if this represents a global variable. */
unsigned int is_global_var : 1;
/* True if this represents a IPA function info. */
unsigned int is_fn_info : 1;
+ /* ??? Store somewhere better. */
+ unsigned short ruid;
+
/* The ID of the variable for the next field in this structure
or zero for the last field in this structure. */
unsigned next;
@@ -370,6 +381,7 @@ new_var_info (tree t, const char *name)
ret->is_heap_var = false;
ret->may_have_pointers = true;
ret->only_restrict_pointers = false;
+ ret->is_restrict_var = false;
ret->is_global_var = (t == NULL_TREE);
ret->is_fn_info = false;
if (t && DECL_P (t))
@@ -3782,6 +3794,7 @@ static varinfo_t
make_constraint_from_restrict (varinfo_t lhs, const char *name)
{
varinfo_t vi = make_heapvar (name);
+ vi->is_restrict_var = 1;
vi->is_global_var = 1;
vi->may_have_pointers = 1;
make_constraint_from (lhs, vi->id);
@@ -5760,7 +5773,11 @@ create_variable_info_for (tree decl, const char *name)
&& TYPE_RESTRICT (TREE_TYPE (decl)))
|| vi->only_restrict_pointers)
{
- make_constraint_from_global_restrict (vi, "GLOBAL_RESTRICT");
+ varinfo_t rvi
+ = make_constraint_from_global_restrict (vi, "GLOBAL_RESTRICT");
+ /* ??? For now exclude reads from globals as restrict sources
+ if those are not (indirectly) from incoming parameters. */
+ rvi->is_restrict_var = false;
continue;
}
@@ -5870,6 +5887,7 @@ intra_create_variable_infos (void)
tree heapvar = build_fake_var_decl (TREE_TYPE (TREE_TYPE (t)));
DECL_EXTERNAL (heapvar) = 1;
vi = create_variable_info_for_1 (heapvar, "PARM_NOALIAS");
+ vi->is_restrict_var = 1;
insert_vi_for_tree (heapvar, vi);
lhsc.var = p->id;
lhsc.type = SCALAR;
@@ -6953,6 +6971,186 @@ delete_points_to_sets (void)
obstack_free (&final_solutions_obstack, NULL);
}
+/* Mark "other" loads and stores as belonging to CLIQUE and with
+ base zero. */
+
+static bool
+visit_loadstore (gimple, tree base, tree ref, void *clique_)
+{
+ unsigned short clique = (uintptr_t)clique_;
+ if (TREE_CODE (base) == MEM_REF
+ || TREE_CODE (base) == TARGET_MEM_REF)
+ {
+ tree ptr = TREE_OPERAND (base, 0);
+ if (TREE_CODE (ptr) == SSA_NAME)
+ {
+ /* ??? We need to make sure 'ptr' doesn't include any of
+ the restrict tags in its points-to set. */
+ return false;
+ }
+
+ /* For now let decls through. */
+
+ /* Do not overwrite existing cliques (that includes clique, base
+ pairs we just set). */
+ if (MR_DEPENDENCE_CLIQUE (base) == 0)
+ {
+ MR_DEPENDENCE_CLIQUE (base) = clique;
+ MR_DEPENDENCE_BASE (base) = 0;
+ }
+ }
+
+ /* For plain decl accesses see whether they are accesses to globals
+ and rewrite them to MEM_REFs with { clique, 0 }. */
+ if (TREE_CODE (base) == VAR_DECL
+ && is_global_var (base)
+ /* ??? We can't rewrite a plain decl with the walk_stmt_load_store
+ ops callback. */
+ && base != ref)
+ {
+ tree *basep = &ref;
+ while (handled_component_p (*basep))
+ basep = &TREE_OPERAND (*basep, 0);
+ gcc_assert (TREE_CODE (*basep) == VAR_DECL);
+ tree ptr = build_fold_addr_expr (*basep);
+ tree zero = build_int_cst (TREE_TYPE (ptr), 0);
+ *basep = build2 (MEM_REF, TREE_TYPE (*basep), ptr, zero);
+ MR_DEPENDENCE_CLIQUE (*basep) = clique;
+ MR_DEPENDENCE_BASE (*basep) = 0;
+ }
+
+ return false;
+}
+
+/* If REF is a MEM_REF then assign a clique, base pair to it, updating
+ CLIQUE, *RESTRICT_VAR and LAST_RUID. Return whether dependence info
+ was assigned to REF. */
+
+static bool
+maybe_set_dependence_info (tree ref, tree ptr,
+ unsigned short &clique, varinfo_t restrict_var,
+ unsigned short &last_ruid)
+{
+ while (handled_component_p (ref))
+ ref = TREE_OPERAND (ref, 0);
+ if ((TREE_CODE (ref) == MEM_REF
+ || TREE_CODE (ref) == TARGET_MEM_REF)
+ && TREE_OPERAND (ref, 0) == ptr)
+ {
+ /* Do not overwrite existing cliques. This avoids overwriting dependence
+ info copied from a function with restrict parameters when it is
+ inlined into a function with restrict parameters. This usually means we
+ prefer to be precise in innermost loops. */
+ if (MR_DEPENDENCE_CLIQUE (ref) == 0)
+ {
+ if (clique == 0)
+ clique = ++cfun->last_clique;
+ if (restrict_var->ruid == 0)
+ restrict_var->ruid = ++last_ruid;
+ MR_DEPENDENCE_CLIQUE (ref) = clique;
+ MR_DEPENDENCE_BASE (ref) = restrict_var->ruid;
+ return true;
+ }
+ }
+ return false;
+}
+
+/* Compute the set of independent memory references based on restrict
+ tags and their conservative propagation to the points-to sets. */
+
+static void
+compute_dependence_clique (void)
+{
+ unsigned short clique = 0;
+ unsigned short last_ruid = 0;
+ for (unsigned i = 0; i < num_ssa_names; ++i)
+ {
+ tree ptr = ssa_name (i);
+ if (!ptr || !POINTER_TYPE_P (TREE_TYPE (ptr)))
+ continue;
+
+ /* Avoid all this when ptr is not dereferenced? */
+ tree p = ptr;
+ if (SSA_NAME_IS_DEFAULT_DEF (ptr)
+ && (TREE_CODE (SSA_NAME_VAR (ptr)) == PARM_DECL
+ || TREE_CODE (SSA_NAME_VAR (ptr)) == RESULT_DECL))
+ p = SSA_NAME_VAR (ptr);
+ varinfo_t vi = lookup_vi_for_tree (p);
+ if (!vi)
+ continue;
+ vi = get_varinfo (find (vi->id));
+ bitmap_iterator bi;
+ unsigned j;
+ varinfo_t restrict_var = NULL;
+ EXECUTE_IF_SET_IN_BITMAP (vi->solution, 0, j, bi)
+ {
+ varinfo_t oi = get_varinfo (j);
+ if (oi->is_restrict_var)
+ {
+ if (restrict_var)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "found restrict pointed-to "
+ "for ");
+ print_generic_expr (dump_file, ptr, 0);
+ fprintf (dump_file, " but not exclusively\n");
+ }
+ restrict_var = NULL;
+ break;
+ }
+ restrict_var = oi;
+ }
+ /* NULL is the only other valid points-to entry. */
+ else if (oi->id != nothing_id)
+ {
+ restrict_var = NULL;
+ break;
+ }
+ }
+ /* Ok, found that ptr must(!) point to a single(!) restrict
+ variable. */
+ /* ??? PTA isn't really a proper propagation engine to compute
+ this property.
+ ??? We could handle merging of two restricts by unifying them. */
+ if (restrict_var)
+ {
+ /* Now look at possible dereferences of ptr. */
+ imm_use_iterator ui;
+ gimple use_stmt;
+ FOR_EACH_IMM_USE_STMT (use_stmt, ui, ptr)
+ {
+ /* ??? Calls and asms. */
+ if (!gimple_assign_single_p (use_stmt))
+ continue;
+ maybe_set_dependence_info (gimple_assign_lhs (use_stmt), ptr,
+ clique, restrict_var, last_ruid);
+ maybe_set_dependence_info (gimple_assign_rhs1 (use_stmt), ptr,
+ clique, restrict_var, last_ruid);
+ }
+ }
+ }
+
+ if (clique == 0)
+ return;
+
+ /* Assign the BASE id zero to all accesses not based on a restrict
+ pointer. That way they get disambiguated against restrict
+ accesses but not against each other. */
+ /* ??? For restricts derived from globals (thus not incoming
+ parameters) we can't restrict scoping properly thus the following
+ is too aggressive there. For now we have excluded those globals from
+ getting into the MR_DEPENDENCE machinery. */
+ basic_block bb;
+ FOR_EACH_BB_FN (bb, cfun)
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple stmt = gsi_stmt (gsi);
+ walk_stmt_load_store_ops (stmt, (void *)(uintptr_t)clique,
+ visit_loadstore, visit_loadstore);
+ }
+}
/* Compute points-to information for every SSA_NAME pointer in the
current function and compute the transitive closure of escaped
@@ -6984,6 +7182,9 @@ compute_may_aliases (void)
if (dump_file)
dump_alias_info (dump_file);
+ /* Compute restrict-based memory disambiguations. */
+ compute_dependence_clique ();
+
/* Deallocate memory used by aliasing data structures and the internal
points-to solution. */
delete_points_to_sets ();
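To make the new bookkeeping concrete, a minimal sketch (illustrative only; the concrete numbers are arbitrary): for a function with two restrict parameters, intra_create_variable_infos creates one PARM_NOALIAS heap variable per parameter with is_restrict_var set, compute_dependence_clique then sees that each pointer points exclusively to its own restrict tag and maybe_set_dependence_info tags its dereferences, while visit_loadstore gives every remaining load and store the pair { clique, 0 }.

  int g;

  void
  f (int * __restrict p, int * __restrict q)
  {
    *p = 1;    /* MEM_REF tagged, say, { clique 1, base 1 }.  */
    *q = 2;    /* MEM_REF tagged { clique 1, base 2 }.  */
    g = *p;    /* The store to the global is rewritten to a MEM_REF with
                  { clique 1, base 0 }; the load *p keeps { 1, 1 }.  */
  }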
diff --git a/gcc/tree-streamer-in.c b/gcc/tree-streamer-in.c
index 1839f579e96..4dad86cad15 100644
--- a/gcc/tree-streamer-in.c
+++ b/gcc/tree-streamer-in.c
@@ -496,7 +496,18 @@ unpack_value_fields (struct data_in *data_in, struct bitpack_d *bp, tree expr)
unpack_ts_type_common_value_fields (bp, expr);
if (CODE_CONTAINS_STRUCT (code, TS_EXP))
- SET_EXPR_LOCATION (expr, stream_input_location (bp, data_in));
+ {
+ SET_EXPR_LOCATION (expr, stream_input_location (bp, data_in));
+ if (code == MEM_REF
+ || code == TARGET_MEM_REF)
+ {
+ MR_DEPENDENCE_CLIQUE (expr)
+ = (unsigned)bp_unpack_value (bp, sizeof (short) * 8);
+ if (MR_DEPENDENCE_CLIQUE (expr) != 0)
+ MR_DEPENDENCE_BASE (expr)
+ = (unsigned)bp_unpack_value (bp, sizeof (short) * 8);
+ }
+ }
if (CODE_CONTAINS_STRUCT (code, TS_BLOCK))
unpack_ts_block_value_fields (data_in, bp, expr);
diff --git a/gcc/tree-streamer-out.c b/gcc/tree-streamer-out.c
index 90dec0a1ce6..ff8f701351f 100644
--- a/gcc/tree-streamer-out.c
+++ b/gcc/tree-streamer-out.c
@@ -454,7 +454,16 @@ streamer_pack_tree_bitfields (struct output_block *ob,
pack_ts_type_common_value_fields (bp, expr);
if (CODE_CONTAINS_STRUCT (code, TS_EXP))
- stream_output_location (ob, bp, EXPR_LOCATION (expr));
+ {
+ stream_output_location (ob, bp, EXPR_LOCATION (expr));
+ if (code == MEM_REF
+ || code == TARGET_MEM_REF)
+ {
+ bp_pack_value (bp, MR_DEPENDENCE_CLIQUE (expr), sizeof (short) * 8);
+ if (MR_DEPENDENCE_CLIQUE (expr) != 0)
+ bp_pack_value (bp, MR_DEPENDENCE_BASE (expr), sizeof (short) * 8);
+ }
+ }
if (CODE_CONTAINS_STRUCT (code, TS_BLOCK))
pack_ts_block_value_fields (ob, bp, expr);
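Worked reading of the two streamer hunks above: a MEM_REF with clique 3 and base 7 is packed as two fields of sizeof (short) * 8 bits each (16 on typical hosts), first 3 and then 7; a MEM_REF with clique 0 is packed as a single zero field, and the reader, seeing the zero, skips the base field, so writer and reader stay in sync.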
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 49303b1b58a..7af32d1ff6a 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -4391,13 +4391,14 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
enum machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_store_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_store_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2\n");
+ "the size of the group of accesses"
+ " is not a power of 2 or not eqaul to 3\n");
return false;
}
@@ -4406,23 +4407,76 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
unsigned int i, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt / 2; i++)
+
+ if (count == 3)
{
- sel[i * 2] = i;
- sel[i * 2 + 1] = i + nelt;
+ unsigned int j0 = 0, j1 = 0, j2 = 0;
+ unsigned int i, j;
+
+ for (j = 0; j < 3; j++)
+ {
+ int nelt0 = ((3 - j) * nelt) % 3;
+ int nelt1 = ((3 - j) * nelt + 1) % 3;
+ int nelt2 = ((3 - j) * nelt + 2) % 3;
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = j0++;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = nelt + j1++;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = 0;
+ }
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf (MSG_MISSED_OPTIMIZATION,
+ "permutaion op not supported by target.\n");
+ return false;
+ }
+
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = 3 * i + nelt0;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = 3 * i + nelt1;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = nelt + j2++;
+ }
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf (MSG_MISSED_OPTIMIZATION,
+ "permutaion op not supported by target.\n");
+ return false;
+ }
+ }
+ return true;
}
- if (can_vec_perm_p (mode, false, sel))
+ else
{
- for (i = 0; i < nelt; i++)
- sel[i] += nelt / 2;
- if (can_vec_perm_p (mode, false, sel))
- return true;
+ /* If count is not equal to 3 then only a power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
+
+ for (i = 0; i < nelt / 2; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nelt;
+ }
+ if (can_vec_perm_p (mode, false, sel))
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] += nelt / 2;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
}
}
if (dump_enabled_p ())
dump_printf (MSG_MISSED_OPTIMIZATION,
- "interleave op not supported by target.\n");
+ "permutaion op not supported by target.\n");
return false;
}
@@ -4442,9 +4496,9 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_store_chain.
Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
- a power of 2, generate interleave_high/low stmts to reorder the data
- correctly for the stores. Return the final references for stores in
- RESULT_CHAIN.
+ a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
+ the data correctly for the stores. Return the final references for stores
+ in RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to
@@ -4511,7 +4565,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
tree perm_mask_low, perm_mask_high;
- unsigned int i, n;
+ tree data_ref;
+ tree perm3_mask_low, perm3_mask_high;
+ unsigned int i, n, log_length = exact_log2 (length);
unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
@@ -4519,47 +4575,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0, n = nelt / 2; i < n; i++)
+ if (length == 3)
{
- sel[i * 2] = i;
- sel[i * 2 + 1] = i + nelt;
- }
- perm_mask_high = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_high != NULL);
+ unsigned int j0 = 0, j1 = 0, j2 = 0;
- for (i = 0; i < nelt; i++)
- sel[i] += nelt / 2;
- perm_mask_low = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_low != NULL);
+ for (j = 0; j < 3; j++)
+ {
+ int nelt0 = ((3 - j) * nelt) % 3;
+ int nelt1 = ((3 - j) * nelt + 1) % 3;
+ int nelt2 = ((3 - j) * nelt + 2) % 3;
- for (i = 0, n = exact_log2 (length); i < n; i++)
- {
- for (j = 0; j < length/2; j++)
- {
- vect1 = dr_chain[j];
- vect2 = dr_chain[j+length/2];
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = j0++;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = nelt + j1++;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = 0;
+ }
+ perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_low != NULL);
+
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = 3 * i + nelt0;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = 3 * i + nelt1;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = nelt + j2++;
+ }
+ perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_high != NULL);
+
+ vect1 = dr_chain[0];
+ vect2 = dr_chain[1];
/* Create interleaving stmt:
- high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
- high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
- perm_stmt
- = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
- vect1, vect2, perm_mask_high);
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {j, nelt, *, j + 1, nelt + j + 1, *,
+ j + 2, nelt + j + 2, *, ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect1, vect2,
+ perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j] = high;
+ vect1 = data_ref;
+ vect2 = dr_chain[2];
/* Create interleaving stmt:
- low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
- nelt*3/2+1, ...}> */
- low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
- perm_stmt
- = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
- vect1, vect2, perm_mask_low);
+ high = VEC_PERM_EXPR <vect1, vect2,
+ {0, 1, nelt + j, 3, 4, nelt + j + 1,
+ 6, 7, nelt + j + 2, ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect1, vect2,
+ perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j+1] = low;
+ (*result_chain)[j] = data_ref;
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
+ }
+ else
+ {
+ /* If length is not equal to 3 then only a power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0, n = nelt / 2; i < n; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nelt;
+ }
+ perm_mask_high = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_high != NULL);
+
+ for (i = 0; i < nelt; i++)
+ sel[i] += nelt / 2;
+ perm_mask_low = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_low != NULL);
+
+ for (i = 0, n = log_length; i < n; i++)
+ {
+ for (j = 0; j < length/2; j++)
+ {
+ vect1 = dr_chain[j];
+ vect2 = dr_chain[j+length/2];
+
+ /* Create interleaving stmt:
+ high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
+ ...}> */
+ high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+ perm_stmt
+ = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
+ vect1, vect2, perm_mask_high);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2*j] = high;
+
+ /* Create interleaving stmt:
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
+ ...}> */
+ low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+ perm_stmt
+ = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
+ vect1, vect2, perm_mask_low);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2*j+1] = low;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
}
}
@@ -4838,36 +4963,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
enum machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_load_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_load_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2\n");
+ "the size of the group of accesses"
+ " is not a power of 2 or not equal to 3\n");
return false;
}
/* Check that the permutation is supported. */
if (VECTOR_MODE_P (mode))
{
- unsigned int i, nelt = GET_MODE_NUNITS (mode);
+ unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt; i++)
- sel[i] = i * 2;
- if (can_vec_perm_p (mode, false, sel))
+ if (count == 3)
{
+ unsigned int k;
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ }
+ return true;
+ }
+ else
+ {
+ /* If count is not equal to 3 then only a power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
for (i = 0; i < nelt; i++)
- sel[i] = i * 2 + 1;
+ sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel))
- return true;
- }
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] = i * 2 + 1;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
+ }
}
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "extract even/odd not supported by target\n");
+ "extract even/odd not supported by target\n");
return false;
}
@@ -4885,8 +5050,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_load_chain.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
- a power of 2, generate extract_even/odd stmts to reorder the input data
- correctly. Return the final references for loads in RESULT_CHAIN.
+ a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+ the input data correctly. Return the final references for loads in
+ RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4967,6 +5133,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
{
tree data_ref, first_vect, second_vect;
tree perm_mask_even, perm_mask_odd;
+ tree perm3_mask_low, perm3_mask_high;
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
unsigned int i, j, log_length = exact_log2 (length);
@@ -4977,44 +5144,437 @@ vect_permute_load_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2;
- perm_mask_even = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_even != NULL);
-
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2 + 1;
- perm_mask_odd = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_odd != NULL);
-
- for (i = 0; i < log_length; i++)
+ if (length == 3)
{
- for (j = 0; j < length; j += 2)
- {
- first_vect = dr_chain[j];
- second_vect = dr_chain[j+1];
+ unsigned int k;
- /* data_ref = permute_even (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_low != NULL);
+
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+ perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask_high != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ /* Create interleaving stmt (low part of):
+ low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+ ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_even);
+ perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2] = data_ref;
- /* data_ref = permute_odd (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ /* Create interleaving stmt (high part of):
+ high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+ ...}> */
+ first_vect = data_ref;
+ second_vect = dr_chain[2];
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
- perm_mask_odd);
+ perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2+length/2] = data_ref;
+ (*result_chain)[k] = data_ref;
+ }
+ }
+ else
+ {
+ /* If length is not equal to 3 then only a power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2;
+ perm_mask_even = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_even != NULL);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2 + 1;
+ perm_mask_odd = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask_odd != NULL);
+
+ for (i = 0; i < log_length; i++)
+ {
+ for (j = 0; j < length; j += 2)
+ {
+ first_vect = dr_chain[j];
+ second_vect = dr_chain[j+1];
+
+ /* data_ref = permute_even (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_even);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2] = data_ref;
+
+ /* data_ref = permute_odd (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, second_vect,
+ perm_mask_odd);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2+length/2] = data_ref;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
}
}
+/* Function vect_shift_permute_load_chain.
+
+ Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
+ sequence of stmts to reorder the input data accordingly.
+ Return the final references for loads in RESULT_CHAIN.
+ Return true if successful, false otherwise.
+
+ E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
+ The input is 3 vectors each containing 8 elements. We assign a
+ number to each element, the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+
+ The output sequence should be:
+
+ 1st vec: 0 3 6 9 12 15 18 21
+ 2nd vec: 1 4 7 10 13 16 19 22
+ 3rd vec: 2 5 8 11 14 17 20 23
+
+ We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
+
+ First we shuffle all 3 vectors to get correct elements order:
+
+ 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
+ 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
+ 3rd vec: (16 19 22) (17 20 23) (18 21)
+
+ Next we unite and shift vector 3 times:
+
+ 1st step:
+ shift right by 6 the concatenation of:
+ "1st vec" and "2nd vec"
+ ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
+ "2nd vec" and "3rd vec"
+ ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
+ "3rd vec" and "1st vec"
+ (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
+ 2nd vec: (10 13) (16 19 22) (17 20 23)
+ 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
+
+ 2nd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "3rd vec"
+ ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
+ "2nd vec" and "1st vec"
+ (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
+ "3rd vec" and "2nd vec"
+ (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
+ 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
+
+ 3rd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "1st vec"
+ ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
+ shift right by 3 the concatenation of:
+ "2nd vec" and "2nd vec"
+ (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
+ | New vectors |
+
+ So that now all vectors are READY:
+ 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
+ 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22)
+
+ This algorithm is faster than the one in vect_permute_load_chain if:
+ 1. "shift of a concatenation" is faster than general permutation.
+ This is usually so.
+ 2. The TARGET machine can't execute vector instructions in parallel.
+ This is because each step of the algorithm depends on the previous one.
+ The algorithm in vect_permute_load_chain is much more parallel.
+
+ The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
+*/
+
+static bool
+vect_shift_permute_load_chain (vec<tree> dr_chain,
+ unsigned int length,
+ gimple stmt,
+ gimple_stmt_iterator *gsi,
+ vec<tree> *result_chain)
+{
+ tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
+ tree perm2_mask1, perm2_mask2, perm3_mask;
+ tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
+ gimple perm_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ unsigned int i;
+ unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+ result_chain->quick_grow (length);
+ memcpy (result_chain->address (), dr_chain.address (),
+ length * sizeof (tree));
+
+ if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+ {
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2 + 1;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask1 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask1 != NULL);
+
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2 + 1;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask2 = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm2_mask2 != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {4 5 6 7 8 9 10 11}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = nelt / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to select vector from 2.
+ For vector length 8 it is {0 1 2 3 12 13 14 15}. */
+ for (i = 0; i < nelt / 2; i++)
+ sel[i] = i;
+ for (i = nelt / 2; i < nelt; i++)
+ sel[i] = nelt + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "select is not supported by target\n");
+ return false;
+ }
+ select_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (select_mask != NULL);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ first_vect, first_vect,
+ perm2_mask1);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[0] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ second_vect, second_vect,
+ perm2_mask2);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[1],
+ select_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+
+ return true;
+ }
+ if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
+ {
+ unsigned int k = 0, l = 0;
+
+ /* Generating permutation constant to get all elements in right order.
+ For vector length 8 it is {0 3 6 1 4 7 2 5}. */
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * k + (l % 3) >= nelt)
+ {
+ k = 0;
+ l += (3 - (nelt % 3));
+ }
+ sel[i] = 3 * k + (l % 3);
+ k++;
+ }
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (perm3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {6 7 8 9 10 11 12 13}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift1_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + 1 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift2_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift2_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {3 4 5 6 7 8 9 10}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift3_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift3_mask != NULL);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift4_mask = vect_gen_perm_mask (vectype, sel);
+ gcc_assert (shift4_mask != NULL);
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ dr_chain[k], dr_chain[k],
+ perm3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[k % 3],
+ vect[(k + 1) % 3],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect_shift[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect_shift[(4 - k) % 3],
+ vect_shift[(3 - k) % 3],
+ shift2_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[k] = data_ref;
+ }
+
+ (*result_chain)[3 - (nelt % 3)] = vect[2];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[0], vect[0],
+ shift3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[nelt % 3] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
+ perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+ vect[1], vect[1],
+ shift4_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+ return true;
+ }
+ return false;
+}
/* Function vect_transform_grouped_load.
@@ -5027,13 +5587,23 @@ void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
gimple_stmt_iterator *gsi)
{
+ enum machine_mode mode;
vec<tree> result_chain = vNULL;
/* DR_CHAIN contains input data-refs that are a part of the interleaving.
RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
vectors, that are ready for vector computation. */
result_chain.create (size);
- vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
+
+ /* If the reassociation width for the vector type is 2 or greater, the target
+ machine can execute 2 or more vector instructions in parallel. Otherwise
+ try to get the chain for the load group using vect_shift_permute_load_chain. */
+ mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
+ if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
+ || exact_log2 (size) != -1
+ || !vect_shift_permute_load_chain (dr_chain, size, stmt,
+ gsi, &result_chain))
+ vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
vect_record_grouped_load_vectors (stmt, result_chain);
result_chain.release ();
}
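The selector arithmetic in vect_shift_permute_load_chain is easy to check with a standalone sketch (illustrative, not GCC code); for nelt == 8 it reproduces the {0 3 6 1 4 7 2 5} constant quoted in the comment above.

  #include <stdio.h>

  /* Recompute the perm3 selector from vect_shift_permute_load_chain
     for an 8-element vector.  */
  int
  main (void)
  {
    unsigned int nelt = 8, i, k = 0, l = 0;
    unsigned char sel[8];

    for (i = 0; i < nelt; i++)
      {
        if (3 * k + (l % 3) >= nelt)
          {
            k = 0;
            l += 3 - (nelt % 3);
          }
        sel[i] = 3 * k + (l % 3);
        k++;
      }
    for (i = 0; i < nelt; i++)
      printf ("%d ", sel[i]);   /* Prints: 0 3 6 1 4 7 2 5  */
    printf ("\n");
    return 0;
  }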
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7c4575d391f..7d536613655 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -975,9 +975,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
include the cost of the permutes. */
if (!store_lanes_p && group_size > 1)
{
- /* Uses a high and low interleave operation for each needed permute. */
-
- int nstmts = ncopies * exact_log2 (group_size) * group_size;
+ /* Uses high and low interleave or shuffle operations for each
+ needed permute. */
+ int nstmts = ncopies * ceil_log2 (group_size) * group_size;
inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
stmt_info, 0, vect_body);
@@ -1092,10 +1092,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
include the cost of the permutes. */
if (!load_lanes_p && group_size > 1)
{
- /* Uses an even and odd extract operations for each needed permute. */
- int nstmts = ncopies * exact_log2 (group_size) * group_size;
- inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm,
- stmt_info, 0, vect_body);
+ /* Uses even and odd extract operations or shuffle operations
+ for each needed permute. */
+ int nstmts = ncopies * ceil_log2 (group_size) * group_size;
+ inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+ stmt_info, 0, vect_body);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
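A quick worked example for the two cost hunks above: for a group of 3 accesses, exact_log2 (3) is -1, which would have produced a negative statement count, while ceil_log2 (3) is 2, so the permute cost becomes ncopies * 2 * 3 = 6 * ncopies vec_perm statements, roughly matching the two shuffle masks the three-element code paths emit per output vector.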
diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index 2f93c7e9593..fb3e05d85c2 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -9730,7 +9730,7 @@ vrp_finalize (void)
substitute_and_fold (op_with_constant_singleton_value_range,
vrp_fold_stmt, false);
- if (warn_array_bounds)
+ if (warn_array_bounds && first_pass_instance)
check_all_array_refs ();
/* We must identify jump threading opportunities before we release
diff --git a/gcc/tree.h b/gcc/tree.h
index f13366f9e29..4c465f437cd 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1080,6 +1080,11 @@ extern void protected_set_expr_location (tree, location_t);
#define TMR_STEP(NODE) (TREE_OPERAND (TARGET_MEM_REF_CHECK (NODE), 3))
#define TMR_INDEX2(NODE) (TREE_OPERAND (TARGET_MEM_REF_CHECK (NODE), 4))
+#define MR_DEPENDENCE_CLIQUE(NODE) \
+ (TREE_CHECK2 (NODE, MEM_REF, TARGET_MEM_REF)->base.u.dependence_info.clique)
+#define MR_DEPENDENCE_BASE(NODE) \
+ (TREE_CHECK2 (NODE, MEM_REF, TARGET_MEM_REF)->base.u.dependence_info.base)
+
/* The operands of a BIND_EXPR. */
#define BIND_EXPR_VARS(NODE) (TREE_OPERAND (BIND_EXPR_CHECK (NODE), 0))
#define BIND_EXPR_BODY(NODE) (TREE_OPERAND (BIND_EXPR_CHECK (NODE), 1))
diff --git a/gcc/value-prof.c b/gcc/value-prof.c
index 15c2d233b59..d74c112ee88 100644
--- a/gcc/value-prof.c
+++ b/gcc/value-prof.c
@@ -697,20 +697,12 @@ check_ic_counter (gimple stmt, gcov_type *count1, gcov_type *count2,
if (*count1 + *count2 > all)
{
- /* If (COUNT1 + COUNT2) is greater than ALL by less than around 10% then
- just fix COUNT2 up so that (COUNT1 + COUNT2) equals ALL. */
- if ((*count1 + *count2 - all) < (all >> 3))
- *count2 = all - *count1;
- else
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, locus,
- "Corrupted topn ic value profile: top two "
- "targets's total count (%ld) exceeds bb count "
- "(%ld)",
- (long)(*count1 + *count2), (long)all);
- return true;
- }
+ /* If (COUNT1 + COUNT2) is greater than ALL, we will fix it. This is
+ not necessarily a corrupted profile; it may be caused by the sample
+ scaling. We will scale down both COUNT1 and COUNT2. */
+ double factor = (double) all / (*count1 + *count2);
+ *count1 *= factor;
+ *count2 *= factor;
}
return false;
}
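Worked example of the new scaling in check_ic_counter: with *count1 = 70, *count2 = 60 and all = 100, factor is 100.0 / 130, so the counts become 53 and 46 after truncation and their sum no longer exceeds the block count; the previous code would have rejected this profile outright because the excess (30) is larger than all >> 3.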
diff --git a/gcc/var-tracking.c b/gcc/var-tracking.c
index 65d82854c0a..555344144a2 100644
--- a/gcc/var-tracking.c
+++ b/gcc/var-tracking.c
@@ -5997,7 +5997,8 @@ add_stores (rtx loc, const_rtx expr, void *cuip)
{
cselib_val *oval = cselib_lookup (oloc, GET_MODE (oloc), 0, VOIDmode);
- gcc_assert (oval != v);
+ if (oval == v)
+ return;
gcc_assert (REG_P (oloc) || MEM_P (oloc));
if (oval && !cselib_preserved_value_p (oval))
diff --git a/gcc/varasm.c b/gcc/varasm.c
index 9e8afa9a2ec..163814053b1 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -169,7 +169,7 @@ bool in_cold_section_p;
and there was actually code that went into the cold section. A
pseudo function name is needed for the cold section of code for some
debugging tools that perform symbolization. */
-tree cold_partition_name = NULL_TREE;
+tree cold_function_name = NULL_TREE;
/* A linked list of all the unnamed sections. */
static GTY(()) section *unnamed_sections;
@@ -1626,7 +1626,7 @@ assemble_start_function (tree decl, const char *fnname)
ASM_GENERATE_INTERNAL_LABEL (tmp_label, "LCOLDE", const_labelno);
crtl->subsections.cold_section_end_label = ggc_strdup (tmp_label);
const_labelno++;
- cold_partition_name = NULL_TREE;
+ cold_function_name = NULL_TREE;
}
else
{
@@ -1759,10 +1759,10 @@ assemble_end_function (tree decl, const char *fnname ATTRIBUTE_UNUSED)
save_text_section = in_section;
switch_to_section (unlikely_text_section ());
- if (cold_partition_name != NULL_TREE)
- ASM_DECLARE_FUNCTION_SIZE (asm_out_file,
- IDENTIFIER_POINTER (cold_partition_name),
- decl);
+ if (cold_function_name != NULL_TREE)
+ ASM_DECLARE_FUNCTION_SIZE (asm_out_file,
+ IDENTIFIER_POINTER (cold_function_name),
+ decl);
ASM_OUTPUT_LABEL (asm_out_file, crtl->subsections.cold_section_end_label);
if (first_function_block_is_cold)
switch_to_section (text_section);
diff --git a/gcc/varasm.h b/gcc/varasm.h
index 2e59d5f47e0..d2ceec1d2d2 100644
--- a/gcc/varasm.h
+++ b/gcc/varasm.h
@@ -25,7 +25,7 @@ along with GCC; see the file COPYING3. If not see
and there was actually code that went into the cold section. A
pseudo function name is needed for the cold section of code for some
debugging tools that perform symbolization. */
-extern tree cold_partition_name;
+extern tree cold_function_name;
extern tree tree_output_constant_def (tree);
extern void make_decl_rtl (tree);
diff --git a/gcc/web.c b/gcc/web.c
index d67151c7139..7ee39a19737 100644
--- a/gcc/web.c
+++ b/gcc/web.c
@@ -53,17 +53,17 @@ along with GCC; see the file COPYING3. If not see
/* Find the root of unionfind tree (the representative of set). */
-struct web_entry *
-unionfind_root (struct web_entry *element)
+web_entry_base *
+web_entry_base::unionfind_root ()
{
- struct web_entry *element1 = element, *element2;
+ web_entry_base *element = this, *element1 = this, *element2;
- while (element->pred)
- element = element->pred;
- while (element1->pred)
+ while (element->pred ())
+ element = element->pred ();
+ while (element1->pred ())
{
- element2 = element1->pred;
- element1->pred = element;
+ element2 = element1->pred ();
+ element1->set_pred (element);
element1 = element2;
}
return element;
@@ -74,23 +74,32 @@ unionfind_root (struct web_entry *element)
nothing is done. Otherwise, return false. */
bool
-unionfind_union (struct web_entry *first, struct web_entry *second)
+unionfind_union (web_entry_base *first, web_entry_base *second)
{
- first = unionfind_root (first);
- second = unionfind_root (second);
+ first = first->unionfind_root ();
+ second = second->unionfind_root ();
if (first == second)
return true;
- second->pred = first;
+ second->set_pred (first);
return false;
}
+class web_entry : public web_entry_base
+{
+ private:
+ rtx reg_pvt;
+
+ public:
+ rtx reg () { return reg_pvt; }
+ void set_reg (rtx r) { reg_pvt = r; }
+};
+
/* For INSN, union all defs and uses that are linked by match_dup.
FUN is the function that does the union. */
static void
-union_match_dups (rtx insn, struct web_entry *def_entry,
- struct web_entry *use_entry,
- bool (*fun) (struct web_entry *, struct web_entry *))
+union_match_dups (rtx insn, web_entry *def_entry, web_entry *use_entry,
+ bool (*fun) (web_entry_base *, web_entry_base *))
{
struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
df_ref *use_link = DF_INSN_INFO_USES (insn_info);
@@ -167,9 +176,9 @@ union_match_dups (rtx insn, struct web_entry *def_entry,
the values 0 and 1 are reserved for use by entry_register. */
void
-union_defs (df_ref use, struct web_entry *def_entry,
- unsigned int *used, struct web_entry *use_entry,
- bool (*fun) (struct web_entry *, struct web_entry *))
+union_defs (df_ref use, web_entry *def_entry,
+ unsigned int *used, web_entry *use_entry,
+ bool (*fun) (web_entry_base *, web_entry_base *))
{
struct df_insn_info *insn_info = DF_REF_INSN_INFO (use);
struct df_link *link = DF_REF_CHAIN (use);
@@ -270,15 +279,15 @@ union_defs (df_ref use, struct web_entry *def_entry,
/* Find the corresponding register for the given entry. */
static rtx
-entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
+entry_register (web_entry *entry, df_ref ref, unsigned int *used)
{
- struct web_entry *root;
+ web_entry *root;
rtx reg, newreg;
/* Find the corresponding web and see if it has been visited. */
- root = unionfind_root (entry);
- if (root->reg)
- return root->reg;
+ root = (web_entry *)entry->unionfind_root ();
+ if (root->reg ())
+ return root->reg ();
/* We are seeing this web for the first time, do the assignment. */
reg = DF_REF_REAL_REG (ref);
@@ -302,7 +311,7 @@ entry_register (struct web_entry *entry, df_ref ref, unsigned int *used)
REGNO (newreg));
}
- root->reg = newreg;
+ root->set_reg (newreg);
return newreg;
}
@@ -336,8 +345,8 @@ gate_handle_web (void)
static unsigned int
web_main (void)
{
- struct web_entry *def_entry;
- struct web_entry *use_entry;
+ web_entry *def_entry;
+ web_entry *use_entry;
unsigned int max = max_reg_num ();
unsigned int *used;
basic_block bb;
@@ -374,9 +383,9 @@ web_main (void)
}
/* Record the number of uses and defs at the beginning of the optimization. */
- def_entry = XCNEWVEC (struct web_entry, DF_DEFS_TABLE_SIZE ());
+ def_entry = XCNEWVEC (web_entry, DF_DEFS_TABLE_SIZE ());
used = XCNEWVEC (unsigned, max);
- use_entry = XCNEWVEC (struct web_entry, uses_num);
+ use_entry = XCNEWVEC (web_entry, uses_num);
/* Produce the web. */
FOR_ALL_BB_FN (bb, cfun)
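For readers unfamiliar with the idiom, a minimal standalone sketch (plain C, not the GCC classes) of the path compression that web_entry_base::unionfind_root performs: one walk finds the root, a second walk re-points every visited node directly at it.

  struct node
  {
    struct node *pred;   /* NULL for a set representative.  */
  };

  static struct node *
  find_root (struct node *element)
  {
    struct node *root = element, *next;

    while (root->pred)
      root = root->pred;
    /* Path compression: make every node on the walked path point at ROOT.  */
    while (element->pred)
      {
        next = element->pred;
        element->pred = root;
        element = next;
      }
    return root;
  }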
diff --git a/libgcc/config.host b/libgcc/config.host
index e520441c582..896a31d17e5 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -812,8 +812,10 @@ mips*-sde-elf*)
;;
mipsisa32-*-elf* | mipsisa32el-*-elf* | \
mipsisa32r2-*-elf* | mipsisa32r2el-*-elf* | \
+mipsisa32r6-*-elf* | mipsisa32r6el-*-elf* | \
mipsisa64-*-elf* | mipsisa64el-*-elf* | \
-mipsisa64r2-*-elf* | mipsisa64r2el-*-elf*)
+mipsisa64r2-*-elf* | mipsisa64r2el-*-elf* | \
+mipsisa64r6-*-elf* | mipsisa64r6el-*-elf*)
tmake_file="$tmake_file mips/t-elf mips/t-crtstuff mips/t-mips16"
extra_parts="$extra_parts crti.o crtn.o"
;;
@@ -1245,7 +1247,7 @@ i[34567]86-*-linux* | x86_64-*-linux* | \
i[34567]86-*-kfreebsd*-gnu | x86_64-*-kfreebsd*-gnu | \
i[34567]86-*-knetbsd*-gnu | \
i[34567]86-*-gnu*)
- tmake_file="${tmake_file} t-tls i386/t-linux"
+ tmake_file="${tmake_file} t-tls i386/t-linux t-slibgcc-libgcc"
if test "$libgcc_cv_cfi" = "yes"; then
tmake_file="${tmake_file} t-stack i386/t-stack-i386"
fi
diff --git a/libgcc/config/i386/cpuinfo.c b/libgcc/config/i386/cpuinfo.c
index 6ff7502bb9a..3cfda027097 100644
--- a/libgcc/config/i386/cpuinfo.c
+++ b/libgcc/config/i386/cpuinfo.c
@@ -34,6 +34,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#endif
int __cpu_indicator_init (void)
+#if !defined(SHARED)
+ __attribute__ ((visibility("hidden")))
+#endif
__attribute__ ((constructor CONSTRUCTOR_PRIORITY));
/* Processor Vendor and Models. */
@@ -99,13 +102,16 @@ enum processor_features
FEATURE_FMA
};
+#if !defined(SHARED)
+ __attribute__ ((visibility("hidden")))
+#endif
struct __processor_model
{
unsigned int __cpu_vendor;
unsigned int __cpu_type;
unsigned int __cpu_subtype;
unsigned int __cpu_features[1];
-} __cpu_model;
+} __cpu_model = { };
/* Get the specific type of AMD CPU. */
@@ -321,6 +327,9 @@ __get_cpuid_output (unsigned int __level,
needs to be called explicitly there. */
int __attribute__ ((constructor CONSTRUCTOR_PRIORITY))
+#if !defined(SHARED)
+ __attribute__ ((visibility("hidden")))
+#endif
__cpu_indicator_init (void)
{
unsigned int eax, ebx, ecx, edx;
@@ -403,3 +412,8 @@ __cpu_indicator_init (void)
return 0;
}
+
+#if defined SHARED && defined USE_ELF_SYMVER
+__asm__ (".symver __cpu_indicator_init, __cpu_indicator_init@GCC_4.8.0");
+__asm__ (".symver __cpu_model, __cpu_model@GCC_4.8.0");
+#endif
diff --git a/libgcc/config/i386/t-linux b/libgcc/config/i386/t-linux
index 4f47f7bfa59..12aab16b6f1 100644
--- a/libgcc/config/i386/t-linux
+++ b/libgcc/config/i386/t-linux
@@ -3,4 +3,8 @@
# t-slibgcc-elf-ver and t-linux
SHLIB_MAPFILES = libgcc-std.ver $(srcdir)/config/i386/libgcc-glibc.ver
-HOST_LIBGCC2_CFLAGS += -mlong-double-80
+# Work around gold bug:
+# https://sourceware.org/bugzilla/show_bug.cgi?id=18703
+SHLIB_LDFLAGS += -fuse-ld=bfd
+
+HOST_LIBGCC2_CFLAGS += -mlong-double-80 -DUSE_ELF_SYMVER
diff --git a/libgcc/config/mips/mips16.S b/libgcc/config/mips/mips16.S
index 6a43a9839e7..1783d1178dc 100644
--- a/libgcc/config/mips/mips16.S
+++ b/libgcc/config/mips/mips16.S
@@ -21,10 +21,27 @@ a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-#ifdef __mips_micromips
- /* DO NOTHING */
+#include "auto-host.h"
+
+#if defined(__mips_micromips) || defined(__mips_soft_float) \
+ || __mips_isa_rev >= 6
+ /* Do nothing because this code is only needed when linking
+ against mips16 hard-float objects. Neither micromips code
+ nor soft-float nor MIPS R6 code can be linked against mips16
+ hard-float objects so we do not need these routines when
+ building libgcc for those cases. */
#else
+#if defined(HAVE_AS_MODULE)
+#if __mips_fpr == 32
+ .module fp=32
+#elif __mips_fpr == 0
+ .module fp=xx
+#elif __mips_fpr == 64
+ .module fp=64
+#endif
+#endif
+
/* This file contains mips16 floating point support functions. These
functions are called by mips16 code to handle floating point when
-msoft-float is not used. They accept the arguments and return
@@ -148,8 +165,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
/* The high 32 bits of $2 correspond to the second word in memory;
i.e. the imaginary part. */
#define MOVE_SC_RET(D, T) MERGE_GPR##D ($2, $f1, $f0); jr T
-#elif __mips_fpr == 64
-#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f1)
#else
#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f2)
#endif
@@ -170,16 +185,29 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define MOVE_DF_BYTE8(D) dm##D##c1 $5,$f13
#define MOVE_DF_RET(D, T) DELAY##D (T, dm##D##c1 $2,$f0)
#define MOVE_DC_RET(D, T) dm##D##c1 $3,$f1; MOVE_DF_RET (D, T)
-#elif __mips_fpr == 64 && defined(__MIPSEB__)
+#elif __mips_fpr != 32 && __mips_isa_rev >= 2 && defined(__MIPSEB__)
#define MOVE_DF_BYTE0(D) m##D##c1 $5,$f12; m##D##hc1 $4,$f12
#define MOVE_DF_BYTE8(D) m##D##c1 $7,$f14; m##D##hc1 $6,$f14
#define MOVE_DF_RET(D, T) m##D##c1 $3,$f0; DELAY##D (T, m##D##hc1 $2,$f0)
-#define MOVE_DC_RET(D, T) m##D##c1 $5,$f1; m##D##hc1 $4,$f1; MOVE_DF_RET (D, T)
-#elif __mips_fpr == 64
+#define MOVE_DC_RET(D, T) m##D##c1 $5,$f2; m##D##hc1 $4,$f2; MOVE_DF_RET (D, T)
+#elif __mips_fpr != 32 && __mips_isa_rev >= 2
#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f12; m##D##hc1 $5,$f12
#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f14; m##D##hc1 $7,$f14
#define MOVE_DF_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##hc1 $3,$f0)
-#define MOVE_DC_RET(D, T) m##D##c1 $4,$f1; m##D##hc1 $5,$f1; MOVE_DF_RET (D, T)
+#define MOVE_DC_RET(D, T) m##D##c1 $4,$f2; m##D##hc1 $5,$f2; MOVE_DF_RET (D, T)
+#elif __mips_fpr == 0
+#define MOVE_DF_BYTE0t sw $4, 0($29); sw $5, 4($29); ldc1 $f12, 0($29)
+#define MOVE_DF_BYTE0f sdc1 $f12, 0($29); lw $4, 0($29); lw $5, 4($29)
+#define MOVE_DF_BYTE0(D) MOVE_DF_BYTE0##D
+#define MOVE_DF_BYTE8t sw $6, 8($29); sw $7, 12($29); ldc1 $f14, 8($29)
+#define MOVE_DF_BYTE8f sdc1 $f14, 8($29); lw $6, 8($29); lw $7, 12($29)
+#define MOVE_DF_BYTE8(D) MOVE_DF_BYTE8##D
+#define MOVE_DF_RETt(T) sw $2, 0($29); sw $3, 4($29); DELAYt (T, ldc1 $f0, 0($29))
+#define MOVE_DF_RETf(T) sdc1 $f0, 0($29); lw $2, 0($29); DELAYf (T, lw $3, 4($29))
+#define MOVE_DF_RET(D, T) MOVE_DF_RET##D(T)
+#define MOVE_DC_RETt(T) sw $4, 8($29); sw $5, 12($29); ldc1 $f2, 8($29); MOVE_DF_RETt(T)
+#define MOVE_DC_RETf(T) sdc1 $f2, 8($29); lw $4, 8($29); lw $5, 12($29); MOVE_DF_RETf(T)
+#define MOVE_DC_RET(D, T) MOVE_DC_RET##D(T)
#elif defined(__MIPSEB__)
/* FPRs are little-endian. */
#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f13; m##D##c1 $5,$f12
@@ -749,4 +777,4 @@ CALL_STUB_RET (__mips16_call_stub_dc_10, 10, DC)
#endif /* !__mips_single_float */
#endif
-#endif /* __mips_micromips */
+#endif /* defined(__mips_micromips) || defined(__mips_soft_float) || __mips_isa_rev >= 6 */
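The new FPXX (__mips_fpr == 0) variants above spill through the stack with sw/ldc1 or sdc1/lw instead of touching high FPR halves directly, and each macro picks its "t" (to FPR) or "f" (from FPR) sequence by pasting the D argument onto the macro name. A stripped-down C++ illustration of that token-pasting dispatch pattern, with invented names (the real macros expand to MIPS assembly, not C++):

  #include <cstdio>

  // Two flavor-specific expansions, selected by pasting the flavor letter,
  // in the spirit of MOVE_DF_RETt / MOVE_DF_RETf above.
  #define MOVE_RET_t(T) std::printf ("flavor t, arg %s\n", T)
  #define MOVE_RET_f(T) std::printf ("flavor f, arg %s\n", T)
  #define MOVE_RET(D, T) MOVE_RET_##D (T)

  int main ()
  {
    MOVE_RET (t, "$31");  // expands to the "t" variant
    MOVE_RET (f, "$31");  // expands to the "f" variant
  }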
diff --git a/libgcc/crtstuff.c b/libgcc/crtstuff.c
index 5664347f4da..29394cdbeb0 100644
--- a/libgcc/crtstuff.c
+++ b/libgcc/crtstuff.c
@@ -52,6 +52,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
identified the set of defines that need to go into auto-target.h,
this will have to do. */
#include "auto-host.h"
+#undef caddr_t
#undef pid_t
#undef rlim_t
#undef ssize_t
diff --git a/libgcc/libgcov-driver.c b/libgcc/libgcov-driver.c
index 8361d7ebbe0..3c569f171a8 100644
--- a/libgcc/libgcov-driver.c
+++ b/libgcc/libgcov-driver.c
@@ -998,6 +998,8 @@ gcov_exit_merge_summary (const struct gcov_info *gi_ptr, struct gcov_summary *pr
return 0;
}
+__attribute__((weak)) gcov_unsigned_t __gcov_lipo_sampling_period;
+
/* Sort N entries in VALUE_ARRAY in descending order.
Each entry in VALUE_ARRAY has two values. The sorting
is based on the second value. */
@@ -1069,6 +1071,42 @@ gcov_sort_topn_counter_arrays (const struct gcov_info *gi_ptr)
}
}
+/* Scale the LIPO sampled profile counters.  */
+static void
+gcov_scaling_lipo_counters (const struct gcov_info *gi_ptr)
+{
+  unsigned int i, j, k;
+ int f_ix;
+ const struct gcov_fn_info *gfi_ptr;
+ const struct gcov_ctr_info *ci_ptr;
+
+ if (__gcov_lipo_sampling_period <= 1)
+ return;
+
+ for (f_ix = 0; (unsigned)f_ix != gi_ptr->n_functions; f_ix++)
+ {
+ gfi_ptr = gi_ptr->functions[f_ix];
+ ci_ptr = gfi_ptr->ctrs;
+ for (i = 0; i < GCOV_COUNTERS; i++)
+ {
+ if (!gcov_counter_active (gi_ptr, i))
+ continue;
+ if (i == GCOV_COUNTER_ICALL_TOPNV)
+ {
+ for (j = 0; j < ci_ptr->num; j += GCOV_ICALL_TOPN_NCOUNTS)
+ for (k = 2; k < GCOV_ICALL_TOPN_NCOUNTS; k += 2)
+ ci_ptr->values[j+k] *= __gcov_lipo_sampling_period;
+ }
+ if (i == GCOV_COUNTER_DIRECT_CALL)
+ {
+ for (j = 0; j < ci_ptr->num; j += 2)
+ ci_ptr->values[j+1] *= __gcov_lipo_sampling_period;
+ }
+ ci_ptr++;
+ }
+ }
+}
+
/* Open a gcda file specified by GI_FILENAME.
Return -1 on error. Return 0 on success. */
@@ -1116,6 +1154,7 @@ gcov_exit_dump_gcov (struct gcov_info *gi_ptr, struct gcov_filename_aux *gf,
sum_buffer = 0;
gcov_sort_topn_counter_arrays (gi_ptr);
+ gcov_scaling_lipo_counters (gi_ptr);
error = gcov_exit_open_gcda_file (gi_ptr, gf);
if (error == -1)
@@ -1360,6 +1399,13 @@ __gcov_init (struct gcov_info *info)
if (env_value_int >= 1)
__gcov_sampling_period = env_value_int;
}
+ env_value_str = getenv ("GCOV_LIPO_SAMPLING_PERIOD");
+ if (env_value_str)
+ {
+ int env_value_int = atoi(env_value_str);
+ if (env_value_int >= 0)
+ __gcov_lipo_sampling_period = env_value_int;
+ }
gcov_sampling_period_initialized = 1;
}
#endif
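gcov_scaling_lipo_counters multiplies each sampled indirect-call top-N and direct-call counter by __gcov_lipo_sampling_period at dump time, so the values written to the .gcda file approximate unsampled counts, and __gcov_init lets GCOV_LIPO_SAMPLING_PERIOD override the period. A compact sketch of the same arithmetic with made-up data and no gcov types:

  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>

  int main ()
  {
    // Hypothetical stand-ins for __gcov_lipo_sampling_period and one
    // direct-call counter pair (callee id, sampled count).
    unsigned period = 100;
    if (const char *env = std::getenv ("GCOV_LIPO_SAMPLING_PERIOD"))
      {
        int v = std::atoi (env);
        if (v >= 0)
          period = v;
      }

    std::int64_t counters[2] = { 42 /* global id */, 7 /* sampled hits */ };

    // Scale only the count slot, as gcov_scaling_lipo_counters does for
    // GCOV_COUNTER_DIRECT_CALL (values[j + 1] *= period).
    if (period > 1)
      counters[1] *= period;

    std::printf ("id %lld, estimated calls %lld\n",
                 (long long) counters[0], (long long) counters[1]);
  }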
diff --git a/libgcc/libgcov-profiler.c b/libgcc/libgcov-profiler.c
index 3057b6157f9..7552adac8a3 100644
--- a/libgcc/libgcov-profiler.c
+++ b/libgcc/libgcov-profiler.c
@@ -221,13 +221,29 @@ __gcov_indirect_call_profiler_atomic_v2 (gcov_type value, void* cur_func)
the descriptors to see if they point to the same function. */
if (cur_func == __gcov_indirect_call_callee
|| (VTABLE_USES_DESCRIPTORS && __gcov_indirect_call_callee
- && *(void **) cur_func == *(void **) __gcov_indirect_call_callee))
+ && *(void **) cur_func == *(void **) __gcov_indirect_call_callee))
__gcov_one_value_profiler_body_atomic (__gcov_indirect_call_counters, value);
}
#endif
+/*
+#if defined(L_gcov_direct_call_profiler) || defined(L_gcov_indirect_call_topn_profiler)
+__attribute__ ((weak)) gcov_unsigned_t __gcov_lipo_sampling_period;
+#endif
+*/
+
+extern gcov_unsigned_t __gcov_lipo_sampling_period;
+
#ifdef L_gcov_indirect_call_topn_profiler
+
+#include "gthr.h"
+
+#ifdef __GTHREAD_MUTEX_INIT
+__thread int in_profiler;
+ATTRIBUTE_HIDDEN __gthread_mutex_t __indir_topn_val_mx = __GTHREAD_MUTEX_INIT;
+#endif
+
/* Tries to keep track the most frequent N values in the counters where
N is specified by parameter TOPN_VAL. To track top N values, 2*N counter
entries are used.
@@ -252,10 +268,18 @@ __gcov_topn_value_profiler_body (gcov_type *counters, gcov_type value,
/* There are 2*topn_val values tracked, each value takes two slots in the
counter array */
- for ( i = 0; i < (topn_val << 2); i += 2)
+#ifdef __GTHREAD_MUTEX_INIT
+ /* If this is reentry, return. */
+ if (in_profiler == 1)
+ return;
+
+ in_profiler = 1;
+ __gthread_mutex_lock (&__indir_topn_val_mx);
+#endif
+ for (i = 0; i < topn_val << 2; i += 2)
{
entry = &value_array[i];
- if ( entry[0] == value)
+ if (entry[0] == value)
{
entry[1]++ ;
found = 1;
@@ -271,7 +295,13 @@ __gcov_topn_value_profiler_body (gcov_type *counters, gcov_type value,
}
if (found)
- return;
+ {
+#ifdef __GTHREAD_MUTEX_INIT
+      in_profiler = 0;
+ __gthread_mutex_unlock (&__indir_topn_val_mx);
+#endif
+ return;
+ }
/* lfu_entry is either an empty entry or an entry
with lowest count, which will be evicted. */
@@ -280,56 +310,49 @@ __gcov_topn_value_profiler_body (gcov_type *counters, gcov_type value,
#define GCOV_ICALL_COUNTER_CLEAR_THRESHOLD 3000
- /* Too many evictions -- time to clear bottom entries to
+ /* Too many evictions -- time to clear bottom entries to
avoid hot values bumping each other out. */
- if ( !have_zero_count
- && ++*num_eviction >= GCOV_ICALL_COUNTER_CLEAR_THRESHOLD)
+ if (!have_zero_count
+ && ++*num_eviction >= GCOV_ICALL_COUNTER_CLEAR_THRESHOLD)
{
unsigned i, j;
- gcov_type *p, minv;
- gcov_type* tmp_cnts
- = (gcov_type *)alloca (topn_val * sizeof(gcov_type));
+ gcov_type **p;
+ gcov_type **tmp_cnts
+ = (gcov_type **)alloca (topn_val * sizeof(gcov_type *));
*num_eviction = 0;
- for ( i = 0; i < topn_val; i++ )
- tmp_cnts[i] = 0;
-
/* Find the largest topn_val values from the group of
- 2*topn_val values and put them into tmp_cnts. */
+ 2*topn_val values and put the addresses into tmp_cnts. */
+ for (i = 0; i < topn_val; i++)
+ tmp_cnts[i] = &value_array[i * 2 + 1];
- for ( i = 0; i < 2 * topn_val; i += 2 )
+ for (i = topn_val * 2; i < topn_val << 2; i += 2)
{
- p = 0;
- for ( j = 0; j < topn_val; j++ )
- {
- if ( !p || tmp_cnts[j] < *p )
- p = &tmp_cnts[j];
- }
- if ( value_array[i + 1] > *p )
- *p = value_array[i + 1];
+ p = &tmp_cnts[0];
+ for (j = 1; j < topn_val; j++)
+ if (*tmp_cnts[j] > **p)
+ p = &tmp_cnts[j];
+ if (value_array[i + 1] < **p)
+ *p = &value_array[i + 1];
}
- minv = tmp_cnts[0];
- for ( j = 1; j < topn_val; j++ )
- {
- if (tmp_cnts[j] < minv)
- minv = tmp_cnts[j];
- }
- /* Zero out low value entries */
- for ( i = 0; i < 2 * topn_val; i += 2 )
+ /* Zero out low value entries. */
+ for (i = 0; i < topn_val; i++)
{
- if (value_array[i + 1] < minv)
- {
- value_array[i] = 0;
- value_array[i + 1] = 0;
- }
+ *tmp_cnts[i] = 0;
+ *(tmp_cnts[i] - 1) = 0;
}
}
+
+#ifdef __GTHREAD_MUTEX_INIT
+ in_profiler = 0;
+ __gthread_mutex_unlock (&__indir_topn_val_mx);
+#endif
}
#if defined(HAVE_CC_TLS) && !defined (USE_EMUTLS)
-__thread
+__thread
#endif
gcov_type *__gcov_indirect_call_topn_counters ATTRIBUTE_HIDDEN;
@@ -338,6 +361,11 @@ __thread
#endif
void *__gcov_indirect_call_topn_callee ATTRIBUTE_HIDDEN;
+#if defined(HAVE_CC_TLS) && !defined (USE_EMUTLS)
+__thread
+#endif
+gcov_unsigned_t __gcov_indirect_call_sampling_counter ATTRIBUTE_HIDDEN;
+
#ifdef TARGET_VTABLE_USES_DESCRIPTORS
#define VTABLE_USES_DESCRIPTORS 1
#else
@@ -355,12 +383,16 @@ __gcov_indirect_call_topn_profiler (void *cur_func,
the descriptors to see if they point to the same function. */
if (cur_func == callee_func
|| (VTABLE_USES_DESCRIPTORS && callee_func
- && *(void **) cur_func == *(void **) callee_func))
+ && *(void **) cur_func == *(void **) callee_func))
{
- gcov_type global_id
- = ((struct gcov_info *) cur_module_gcov_info)->mod_info->ident;
- global_id = GEN_FUNC_GLOBAL_ID (global_id, cur_func_id);
- __gcov_topn_value_profiler_body (counter, global_id, GCOV_ICALL_TOPN_VAL);
+ if (++__gcov_indirect_call_sampling_counter >= __gcov_lipo_sampling_period)
+ {
+ __gcov_indirect_call_sampling_counter = 0;
+ gcov_type global_id
+ = ((struct gcov_info *) cur_module_gcov_info)->mod_info->ident;
+ global_id = GEN_FUNC_GLOBAL_ID (global_id, cur_func_id);
+ __gcov_topn_value_profiler_body (counter, global_id, GCOV_ICALL_TOPN_VAL);
+ }
__gcov_indirect_call_topn_callee = 0;
}
}
@@ -376,7 +408,13 @@ gcov_type *__gcov_direct_call_counters ATTRIBUTE_HIDDEN;
__thread
#endif
void *__gcov_direct_call_callee ATTRIBUTE_HIDDEN;
+#if defined(HAVE_CC_TLS) && !defined (USE_EMUTLS)
+__thread
+#endif
+gcov_unsigned_t __gcov_direct_call_sampling_counter ATTRIBUTE_HIDDEN;
+
/* Direct call profiler. */
+
void
__gcov_direct_call_profiler (void *cur_func,
void *cur_module_gcov_info,
@@ -384,11 +422,15 @@ __gcov_direct_call_profiler (void *cur_func,
{
if (cur_func == __gcov_direct_call_callee)
{
- gcov_type global_id
- = ((struct gcov_info *) cur_module_gcov_info)->mod_info->ident;
- global_id = GEN_FUNC_GLOBAL_ID (global_id, cur_func_id);
- __gcov_direct_call_counters[0] = global_id;
- __gcov_direct_call_counters[1]++;
+ if (++__gcov_direct_call_sampling_counter >= __gcov_lipo_sampling_period)
+ {
+ __gcov_direct_call_sampling_counter = 0;
+ gcov_type global_id
+ = ((struct gcov_info *) cur_module_gcov_info)->mod_info->ident;
+ global_id = GEN_FUNC_GLOBAL_ID (global_id, cur_func_id);
+ __gcov_direct_call_counters[0] = global_id;
+ __gcov_direct_call_counters[1]++;
+ }
__gcov_direct_call_callee = 0;
}
}
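With these changes __gcov_indirect_call_topn_profiler and __gcov_direct_call_profiler record an event only when a per-thread sampling counter reaches __gcov_lipo_sampling_period, and __gcov_topn_value_profiler_body serializes its update with a mutex guarded by a __thread reentrancy flag. A minimal, library-free sketch of just the sampling gate (names invented, single-threaded):

  #include <cstdio>

  // Stand-ins for __gcov_lipo_sampling_period and
  // __gcov_direct_call_sampling_counter.
  static unsigned sampling_period = 4;
  static thread_local unsigned sampling_counter = 0;

  static unsigned long long recorded_events = 0;

  // Record the call only every sampling_period-th time, mirroring the
  // "++counter >= period" gate in __gcov_direct_call_profiler.
  static void
  profile_call ()
  {
    if (++sampling_counter >= sampling_period)
      {
        sampling_counter = 0;
        ++recorded_events;
      }
  }

  int main ()
  {
    for (int i = 0; i < 1000; ++i)
      profile_call ();
    // With period 4, roughly 1000 / 4 events are recorded.
    std::printf ("%llu events recorded\n", recorded_events);
  }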
diff --git a/libgomp/config.h.in b/libgomp/config.h.in
index 14c7e2a9a1b..9c26e6d98e2 100644
--- a/libgomp/config.h.in
+++ b/libgomp/config.h.in
@@ -76,6 +76,9 @@
/* Define to 1 if the target supports thread-local storage. */
#undef HAVE_TLS
+/* Define to 1 if the target uses emutls for thread-local storage. */
+#undef USE_EMUTLS
+
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
diff --git a/libgomp/configure b/libgomp/configure
index 39bb5cdeec6..784e69f7be2 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -15535,6 +15535,37 @@ $as_echo "#define HAVE_TLS 1" >>confdefs.h
fi
+# See if we have emulated thread-local storage.
+
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the thread-local storage support is from emutls" >&5
+$as_echo_n "checking whether the thread-local storage support is from emutls... " >&6; }
+if test "${gcc_cv_use_emutls+set}" = set; then :
+ $as_echo_n "(cached) " >&6
+else
+
+ gcc_cv_use_emutls=no
+ echo '__thread int a; int b; int main() { return a = b; }' > conftest.c
+ if { ac_try='${CC-cc} -Werror -S -o conftest.s conftest.c 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ if grep __emutls_get_address conftest.s > /dev/null; then
+ gcc_cv_use_emutls=yes
+ fi
+ fi
+ rm -f conftest.*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_use_emutls" >&5
+$as_echo "$gcc_cv_use_emutls" >&6; }
+ if test "$gcc_cv_use_emutls" = "yes" ; then
+
+$as_echo "#define USE_EMUTLS 1" >>confdefs.h
+
+ fi
+
# See what sort of export controls are available.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the target supports hidden visibility" >&5
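The new configure check compiles a one-line __thread test case with -S and greps the assembly for __emutls_get_address, which the compiler emits on targets that lower thread-local storage through libgcc's emulation layer; USE_EMUTLS is defined only if that symbol appears. The probe source written to conftest.c is effectively:

  // Equivalent of the conftest.c generated above; on an emutls target,
  // compiling this with "g++ -S" references __emutls_get_address.
  __thread int a;
  int b;
  int main () { return a = b; }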
diff --git a/libgomp/configure.ac b/libgomp/configure.ac
index 43632f74d9d..adf3177e0e1 100644
--- a/libgomp/configure.ac
+++ b/libgomp/configure.ac
@@ -245,6 +245,9 @@ fi
# See if we support thread-local storage.
GCC_CHECK_TLS
+# See if we have emulated thread-local storage.
+GCC_CHECK_EMUTLS
+
# See what sort of export controls are available.
LIBGOMP_CHECK_ATTRIBUTE_VISIBILITY
LIBGOMP_CHECK_ATTRIBUTE_DLLEXPORT
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a1482ccfbf4..b694356f67e 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -471,7 +471,7 @@ enum gomp_cancel_kind
/* ... and here is that TLS data. */
-#ifdef HAVE_TLS
+#if defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct gomp_thread gomp_tls_data;
static inline struct gomp_thread *gomp_thread (void)
{
diff --git a/libgomp/team.c b/libgomp/team.c
index e6a6d8ff679..594127ca132 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -37,7 +37,7 @@ pthread_key_t gomp_thread_destructor;
/* This is the libgomp per-thread data structure. */
-#ifdef HAVE_TLS
+#if defined HAVE_TLS || defined USE_EMUTLS
__thread struct gomp_thread gomp_tls_data;
#else
pthread_key_t gomp_tls_key;
@@ -70,7 +70,7 @@ gomp_thread_start (void *xdata)
void (*local_fn) (void *);
void *local_data;
-#ifdef HAVE_TLS
+#if defined HAVE_TLS || defined USE_EMUTLS
thr = &gomp_tls_data;
#else
struct gomp_thread local_thr;
@@ -916,7 +916,7 @@ gomp_team_end (void)
static void __attribute__((constructor))
initialize_team (void)
{
-#ifndef HAVE_TLS
+#if !defined HAVE_TLS && !defined USE_EMUTLS
static struct gomp_thread initial_thread_tls_data;
pthread_key_create (&gomp_tls_key, NULL);
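With HAVE_TLS || USE_EMUTLS, libgomp now keeps gomp_tls_data in a __thread variable whenever either native TLS or emutls is available and only falls back to a pthread key otherwise. A self-contained sketch of that two-path pattern, with a simplified struct and invented names (the macro names are reused here purely as compile-time switches; real libgomp also registers a key destructor, and this sketch leaks the fallback allocation):

  #include <pthread.h>
  #include <cstdio>

  struct my_thread_data { int team_id; };

  #if defined HAVE_TLS || defined USE_EMUTLS
  // Fast path: one instance per thread, resolved by the TLS machinery.
  static __thread my_thread_data tls_data;
  static my_thread_data *get_thread () { return &tls_data; }
  #else
  // Fallback: a per-thread slot keyed off a pthread key.
  static pthread_key_t tls_key;
  static my_thread_data *
  get_thread ()
  {
    void *p = pthread_getspecific (tls_key);
    if (!p)
      {
        p = new my_thread_data ();
        pthread_setspecific (tls_key, p);
      }
    return static_cast<my_thread_data *> (p);
  }
  #endif

  int main ()
  {
  #if !defined HAVE_TLS && !defined USE_EMUTLS
    pthread_key_create (&tls_key, NULL);
  #endif
    get_thread ()->team_id = 1;
    std::printf ("team %d\n", get_thread ()->team_id);
  }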
diff --git a/libjava/classpath/config.sub b/libjava/classpath/config.sub
index 59bb593f109..113c5ab5b13 100755
--- a/libjava/classpath/config.sub
+++ b/libjava/classpath/config.sub
@@ -287,8 +287,10 @@ case $basic_machine in
| mips64vr5900 | mips64vr5900el \
| mipsisa32 | mipsisa32el \
| mipsisa32r2 | mipsisa32r2el \
+ | mipsisa32r6 | mipsisa32r6el \
| mipsisa64 | mipsisa64el \
| mipsisa64r2 | mipsisa64r2el \
+ | mipsisa64r6 | mipsisa64r6el \
| mipsisa64sb1 | mipsisa64sb1el \
| mipsisa64sr71k | mipsisa64sr71kel \
| mipstx39 | mipstx39el \
@@ -403,8 +405,10 @@ case $basic_machine in
| mips64vr5900-* | mips64vr5900el-* \
| mipsisa32-* | mipsisa32el-* \
| mipsisa32r2-* | mipsisa32r2el-* \
+ | mipsisa32r6-* | mipsisa32r6el-* \
| mipsisa64-* | mipsisa64el-* \
| mipsisa64r2-* | mipsisa64r2el-* \
+ | mipsisa64r6-* | mipsisa64r6el-* \
| mipsisa64sb1-* | mipsisa64sb1el-* \
| mipsisa64sr71k-* | mipsisa64sr71kel-* \
| mipstx39-* | mipstx39el-* \
diff --git a/libstdc++-v3/include/bits/algorithmfwd.h b/libstdc++-v3/include/bits/algorithmfwd.h
index aee1eec5b6b..c67ecfe4888 100644
--- a/libstdc++-v3/include/bits/algorithmfwd.h
+++ b/libstdc++-v3/include/bits/algorithmfwd.h
@@ -482,11 +482,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _FIter, typename _Tp>
_FIter
- remove(_FIter, _FIter, const _Tp&);
+ remove(_FIter, _FIter, const _Tp&)
+ __attribute__ ((warn_unused_result));
template<typename _FIter, typename _Predicate>
_FIter
- remove_if(_FIter, _FIter, _Predicate);
+ remove_if(_FIter, _FIter, _Predicate)
+ __attribute__ ((warn_unused_result));
template<typename _IIter, typename _OIter, typename _Tp>
_OIter
@@ -574,11 +576,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _FIter>
_FIter
- unique(_FIter, _FIter);
+ unique(_FIter, _FIter)
+ __attribute__ ((warn_unused_result));
template<typename _FIter, typename _BinaryPredicate>
_FIter
- unique(_FIter, _FIter, _BinaryPredicate);
+ unique(_FIter, _FIter, _BinaryPredicate)
+ __attribute__ ((warn_unused_result));
// unique_copy
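Tagging remove, remove_if and unique with warn_unused_result catches the common mistake of calling them and expecting the container to shrink: they only shift surviving elements forward, and the returned iterator marks the new logical end. A short usage example of the intended erase-remove pattern:

  #include <algorithm>
  #include <vector>
  #include <cstdio>

  int main ()
  {
    std::vector<int> v = { 1, 2, 2, 3, 3, 3 };

    // The iterator returned by std::unique must be used (here with erase)
    // or the call is pointless -- exactly what the new attribute flags.
    v.erase (std::unique (v.begin (), v.end ()), v.end ());

    for (int x : v)
      std::printf ("%d ", x);   // prints: 1 2 3
    std::printf ("\n");
  }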
diff --git a/libstdc++-v3/include/bits/atomic_base.h b/libstdc++-v3/include/bits/atomic_base.h
index 1fc0ebb7e40..ceead010ebe 100644
--- a/libstdc++-v3/include/bits/atomic_base.h
+++ b/libstdc++-v3/include/bits/atomic_base.h
@@ -355,7 +355,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
private:
typedef _ITp __int_type;
- __int_type _M_i;
+ static constexpr int _S_alignment =
+ sizeof(_ITp) > alignof(_ITp) ? sizeof(_ITp) : alignof(_ITp);
+
+ alignas(_S_alignment) __int_type _M_i;
public:
__atomic_base() noexcept = default;
diff --git a/libstdc++-v3/include/bits/move.h b/libstdc++-v3/include/bits/move.h
index dc4ac0fea58..512bb792158 100644
--- a/libstdc++-v3/include/bits/move.h
+++ b/libstdc++-v3/include/bits/move.h
@@ -39,10 +39,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Used, in C++03 mode too, by allocators, etc.
/**
- * @brief Same as C++11 std::addressof
+ * @brief Same as C++11 std::addressof, except it is constexpr in C++11.
* @ingroup utilities
*/
template<typename _Tp>
+#if __cplusplus >= 201103L
+ constexpr
+#endif
inline _Tp*
__addressof(_Tp& __r) _GLIBCXX_NOEXCEPT
{
diff --git a/libstdc++-v3/include/c_compatibility/complex.h b/libstdc++-v3/include/c_compatibility/complex.h
index d072b68a835..b15361ea0d5 100644
--- a/libstdc++-v3/include/c_compatibility/complex.h
+++ b/libstdc++-v3/include/c_compatibility/complex.h
@@ -26,21 +26,29 @@
* This is a Standard C++ Library header.
*/
-#include <bits/c++config.h>
+#ifndef _GLIBCXX_COMPLEX_H
+#define _GLIBCXX_COMPLEX_H 1
#if __cplusplus >= 201103L
# include <ccomplex>
+#else // C++98 and C++03
+
+// The C++ <complex> header is incompatible with the C99 <complex.h> header;
+// they cannot be included in a single translation unit portably. Notably,
+// C++11's <ccomplex> does not include C99's <complex.h>, and in C++11
+// <complex.h> is defined to provide only what C++11's <ccomplex> does, in a
+// different namespace.
+#ifdef _GLIBCXX_COMPLEX
+# error Cannot include both <complex> and C99's <complex.h>
#endif
-#if _GLIBCXX_HAVE_COMPLEX_H
-# include_next <complex.h>
-# ifdef _GLIBCXX_COMPLEX
-// See PR56111, keep the macro in C++03 if possible.
-# undef complex
-# endif
-#endif
+// Delegate to a system complex.h if we don't provide it as part of the C++
+// implementation.
+#include_next <complex.h>
-#ifndef _GLIBCXX_COMPLEX_H
-#define _GLIBCXX_COMPLEX_H 1
+// Provide a define indicating that a C99-style <complex.h> has been included.
+#define _GLIBCXX_C99_COMPLEX_H
+
+#endif // C++98 and C++03
#endif
diff --git a/libstdc++-v3/include/debug/array b/libstdc++-v3/include/debug/array
index ef01c981bd4..637ab51a348 100644
--- a/libstdc++-v3/include/debug/array
+++ b/libstdc++-v3/include/debug/array
@@ -150,15 +150,15 @@ namespace __debug
operator[](size_type __n) noexcept
{
__glibcxx_check_subscript(__n);
- return _AT_Type::_S_ref(_M_elems, __n);
+ return *_AT_Type::_S_ptr(_M_elems, __n);
}
constexpr const_reference
operator[](size_type __n) const noexcept
{
- return __n < _Nm ? _AT_Type::_S_ref(_M_elems, __n)
+ return __n < _Nm ? *_AT_Type::_S_ptr(_M_elems, __n)
: (_GLIBCXX_THROW_OR_ABORT(_Array_check_subscript<_Nm>(__n)),
- _AT_Type::_S_ref(_M_elems, 0));
+ *_AT_Type::_S_ptr(_M_elems, 0));
}
reference
@@ -169,7 +169,7 @@ namespace __debug
"(which is %zu) >= _Nm "
"(which is %zu)"),
__n, _Nm);
- return _AT_Type::_S_ref(_M_elems, __n);
+ return *_AT_Type::_S_ptr(_M_elems, __n);
}
constexpr const_reference
@@ -177,11 +177,11 @@ namespace __debug
{
// Result of conditional expression must be an lvalue so use
// boolean ? lvalue : (throw-expr, lvalue)
- return __n < _Nm ? _AT_Type::_S_ref(_M_elems, __n)
+ return __n < _Nm ? *_AT_Type::_S_ptr(_M_elems, __n)
: (std::__throw_out_of_range_fmt(__N("array::at: __n (which is %zu) "
">= _Nm (which is %zu)"),
__n, _Nm),
- _AT_Type::_S_ref(_M_elems, 0));
+ *_AT_Type::_S_ptr(_M_elems, 0));
}
reference
@@ -194,9 +194,9 @@ namespace __debug
constexpr const_reference
front() const noexcept
{
- return _Nm ? _AT_Type::_S_ref(_M_elems, 0)
+ return _Nm ? *_AT_Type::_S_ptr(_M_elems, 0)
: (_GLIBCXX_THROW_OR_ABORT(_Array_check_nonempty<_Nm>()),
- _AT_Type::_S_ref(_M_elems, 0));
+ *_AT_Type::_S_ptr(_M_elems, 0));
}
reference
@@ -209,18 +209,18 @@ namespace __debug
constexpr const_reference
back() const noexcept
{
- return _Nm ? _AT_Type::_S_ref(_M_elems, _Nm - 1)
+ return _Nm ? *_AT_Type::_S_ptr(_M_elems, _Nm - 1)
: (_GLIBCXX_THROW_OR_ABORT(_Array_check_nonempty<_Nm>()),
- _AT_Type::_S_ref(_M_elems, 0));
+ *_AT_Type::_S_ptr(_M_elems, 0));
}
pointer
data() noexcept
- { return std::__addressof(_AT_Type::_S_ref(_M_elems, 0)); }
+ { return _AT_Type::_S_ptr(_M_elems, 0); }
const_pointer
data() const noexcept
- { return std::__addressof(_AT_Type::_S_ref(_M_elems, 0)); }
+ { return _AT_Type::_S_ptr(_M_elems, 0); }
};
// Array comparisons.
@@ -269,8 +269,8 @@ namespace __debug
get(array<_Tp, _Nm>& __arr) noexcept
{
static_assert(_Int < _Nm, "index is out of bounds");
- return _GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
- _S_ref(__arr._M_elems, _Int);
+ return *_GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
+ _S_ptr(__arr._M_elems, _Int);
}
template<std::size_t _Int, typename _Tp, std::size_t _Nm>
@@ -286,8 +286,8 @@ namespace __debug
get(const array<_Tp, _Nm>& __arr) noexcept
{
static_assert(_Int < _Nm, "index is out of bounds");
- return _GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
- _S_ref(__arr._M_elems, _Int);
+ return *_GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
+ _S_ptr(__arr._M_elems, _Int);
}
} // namespace __debug
diff --git a/libstdc++-v3/include/std/array b/libstdc++-v3/include/std/array
index 67680d61e58..58557901a6b 100644
--- a/libstdc++-v3/include/std/array
+++ b/libstdc++-v3/include/std/array
@@ -48,9 +48,20 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
{
typedef _Tp _Type[_Nm];
- static constexpr _Tp&
- _S_ref(const _Type& __t, std::size_t __n) noexcept
- { return const_cast<_Tp&>(__t[__n]); }
+ static constexpr _Tp*
+ _S_ptr(const _Type& __t, std::size_t __n) noexcept
+#if __google_stl_debug_array
+ {
+ return __n < _Nm
+ ? const_cast<_Tp*>(std::__addressof(__t[__n]))
+ : (std::__throw_out_of_range_fmt(__N("array::_S_ptr: __n "
+ "(which is %zu) >= size() "
+ "(which is %zu)"),
+ __n, _Nm), nullptr);
+ }
+#else
+ { return const_cast<_Tp*>(std::__addressof(__t[__n])); }
+#endif
};
template<typename _Tp>
@@ -58,9 +69,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
{
struct _Type { };
- static constexpr _Tp&
- _S_ref(const _Type&, std::size_t) noexcept
- { return *static_cast<_Tp*>(nullptr); }
+ static constexpr _Tp*
+ _S_ptr(const _Type&, std::size_t) noexcept
+ { return static_cast<_Tp*>(nullptr); }
};
/**
@@ -170,11 +181,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
// Element access.
reference
operator[](size_type __n) noexcept
- { return _AT_Type::_S_ref(_M_elems, __n); }
+ { return *_AT_Type::_S_ptr(_M_elems, __n); }
constexpr const_reference
operator[](size_type __n) const noexcept
- { return _AT_Type::_S_ref(_M_elems, __n); }
+ { return *_AT_Type::_S_ptr(_M_elems, __n); }
reference
at(size_type __n)
@@ -183,7 +194,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
std::__throw_out_of_range_fmt(__N("array::at: __n (which is %zu) "
">= _Nm (which is %zu)"),
__n, _Nm);
- return _AT_Type::_S_ref(_M_elems, __n);
+ return *_AT_Type::_S_ptr(_M_elems, __n);
}
constexpr const_reference
@@ -191,11 +202,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
{
// Result of conditional expression must be an lvalue so use
// boolean ? lvalue : (throw-expr, lvalue)
- return __n < _Nm ? _AT_Type::_S_ref(_M_elems, __n)
+ return __n < _Nm ? *_AT_Type::_S_ptr(_M_elems, __n)
: (std::__throw_out_of_range_fmt(__N("array::at: __n (which is %zu) "
">= _Nm (which is %zu)"),
__n, _Nm),
- _AT_Type::_S_ref(_M_elems, 0));
+ *_AT_Type::_S_ptr(_M_elems, 0));
}
reference
@@ -204,7 +215,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
constexpr const_reference
front() const noexcept
- { return _AT_Type::_S_ref(_M_elems, 0); }
+ { return *_AT_Type::_S_ptr(_M_elems, 0); }
reference
back() noexcept
@@ -213,17 +224,17 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
constexpr const_reference
back() const noexcept
{
- return _Nm ? _AT_Type::_S_ref(_M_elems, _Nm - 1)
- : _AT_Type::_S_ref(_M_elems, 0);
+ return _Nm ? *_AT_Type::_S_ptr(_M_elems, _Nm - 1)
+ : *_AT_Type::_S_ptr(_M_elems, 0);
}
pointer
data() noexcept
- { return std::__addressof(_AT_Type::_S_ref(_M_elems, 0)); }
+ { return _AT_Type::_S_ptr(_M_elems, 0); }
const_pointer
data() const noexcept
- { return std::__addressof(_AT_Type::_S_ref(_M_elems, 0)); }
+ { return _AT_Type::_S_ptr(_M_elems, 0); }
};
// Array comparisons.
@@ -272,8 +283,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
get(array<_Tp, _Nm>& __arr) noexcept
{
static_assert(_Int < _Nm, "index is out of bounds");
- return _GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
- _S_ref(__arr._M_elems, _Int);
+ return *_GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
+ _S_ptr(__arr._M_elems, _Int);
}
template<std::size_t _Int, typename _Tp, std::size_t _Nm>
@@ -289,8 +300,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
get(const array<_Tp, _Nm>& __arr) noexcept
{
static_assert(_Int < _Nm, "index is out of bounds");
- return _GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
- _S_ref(__arr._M_elems, _Int);
+ return *_GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::
+ _S_ptr(__arr._M_elems, _Int);
}
_GLIBCXX_END_NAMESPACE_CONTAINER
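The array rework above routes every element access through a single _S_ptr helper, so the Google-local debug mode (__google_stl_debug_array) can range-check even operator[] by combining __throw_out_of_range_fmt with the comma operator inside a conditional expression, keeping the happy path usable in constant expressions. The same shape outside libstdc++ looks roughly like this (checked_ptr and its message are illustrative only):

  #include <array>
  #include <stdexcept>
  #include <cstdio>

  // Illustrative checked accessor in the style of the _S_ptr debug branch:
  // the error arm throws and then yields a value of the right type.
  template <typename T, std::size_t N>
  constexpr const T *
  checked_ptr (const std::array<T, N> &a, std::size_t n)
  {
    return n < N
           ? &a[n]
           : (throw std::out_of_range ("array index out of range"), nullptr);
  }

  int main ()
  {
    constexpr std::array<int, 3> a = { { 10, 20, 30 } };
    static_assert (*checked_ptr (a, 2) == 30, "in-range access is constexpr");

    try
      {
        (void) *checked_ptr (a, 7);   // out of range: throws at run time
      }
    catch (const std::out_of_range &e)
      {
        std::printf ("caught: %s\n", e.what ());
      }
  }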
diff --git a/libstdc++-v3/include/std/atomic b/libstdc++-v3/include/std/atomic
index ece75a4e4ba..371144628cd 100644
--- a/libstdc++-v3/include/std/atomic
+++ b/libstdc++-v3/include/std/atomic
@@ -161,7 +161,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
struct atomic
{
private:
- _Tp _M_i;
+ // Align 1/2/4/8/16-byte types to at least their size.
+ static constexpr int _S_min_alignment
+ = (sizeof(_Tp) & (sizeof(_Tp) - 1)) || sizeof(_Tp) > 16
+ ? 0 : sizeof(_Tp);
+
+ static constexpr int _S_alignment
+ = _S_min_alignment > alignof(_Tp) ? _S_min_alignment : alignof(_Tp);
+
+ alignas(_S_alignment) _Tp _M_i;
public:
atomic() noexcept = default;
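The generic std::atomic<_Tp> above now over-aligns its member to sizeof(_Tp) whenever the size is a power of two no larger than 16, so objects such as a 16-byte POD can be handled by the wide lock-free builtins. A standalone rendering of that alignment computation as a plain constexpr function (not libstdc++ internals), assuming the same 16-byte cutoff:

  #include <cstdio>

  // Same arithmetic as _S_min_alignment / _S_alignment in the hunk above:
  // power-of-two sizes up to 16 are rounded up to their own size.
  constexpr int
  atomic_alignment (int size, int align)
  {
    return ((size & (size - 1)) || size > 16
            ? align                          // non-power-of-two or large: natural alignment
            : (size > align ? size : align));
  }

  static_assert (atomic_alignment (8, 4) == 8,   "8-byte POD gets 8-byte alignment");
  static_assert (atomic_alignment (16, 1) == 16, "16-byte char array gets 16");
  static_assert (atomic_alignment (12, 4) == 4,  "non-power-of-two keeps alignof");

  int main ()
  {
    std::printf ("alignment for size 8, alignof 4: %d\n", atomic_alignment (8, 4));
  }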
diff --git a/libstdc++-v3/include/std/complex b/libstdc++-v3/include/std/complex
index 941e6b7d845..3f98505f730 100644
--- a/libstdc++-v3/include/std/complex
+++ b/libstdc++-v3/include/std/complex
@@ -44,8 +44,14 @@
#include <cmath>
#include <sstream>
-// Get rid of a macro possibly defined in <complex.h>
-#undef complex
+// The C++ <complex> header is incompatible with the C99 <complex.h> header;
+// they cannot be included in a single translation unit portably. Notably,
+// C++11's <ccomplex> does not include C99's <complex.h>, and in C++11
+// <complex.h> is defined to provide only what C++11's <ccomplex> does, in a
+// different namespace.
+#ifdef _GLIBCXX_C99_COMPLEX_H
+#error Cannot include both <complex> and C99's <complex.h>
+#endif
namespace std _GLIBCXX_VISIBILITY(default)
{
@@ -141,11 +147,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// DR 387. std::complex over-encapsulated.
_GLIBCXX_ABI_TAG_CXX11
constexpr _Tp
- real() { return _M_real; }
+ real() const { return _M_real; }
_GLIBCXX_ABI_TAG_CXX11
constexpr _Tp
- imag() { return _M_imag; }
+ imag() const { return _M_imag; }
#else
/// Return real part of complex number.
_Tp&
diff --git a/libstdc++-v3/testsuite/20_util/forward/c_neg.cc b/libstdc++-v3/testsuite/20_util/forward/c_neg.cc
index fb900e01cc2..ce414de01b7 100644
--- a/libstdc++-v3/testsuite/20_util/forward/c_neg.cc
+++ b/libstdc++-v3/testsuite/20_util/forward/c_neg.cc
@@ -18,7 +18,7 @@
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
-// { dg-error "static assertion failed" "" { target *-*-* } 89 }
+// { dg-error "static assertion failed" "" { target *-*-* } 92 }
#include <list>
diff --git a/libstdc++-v3/testsuite/20_util/forward/f_neg.cc b/libstdc++-v3/testsuite/20_util/forward/f_neg.cc
index c4914f0f776..4d50185448e 100644
--- a/libstdc++-v3/testsuite/20_util/forward/f_neg.cc
+++ b/libstdc++-v3/testsuite/20_util/forward/f_neg.cc
@@ -18,7 +18,7 @@
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
-// { dg-error "static assertion failed" "" { target *-*-* } 89 }
+// { dg-error "static assertion failed" "" { target *-*-* } 92 }
#include <utility>
diff --git a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc
index 979d7f6b5aa..64ecfc60480 100644
--- a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc
+++ b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc
@@ -28,6 +28,6 @@ int n1 = std::get<1>(a);
int n2 = std::get<1>(std::move(a));
int n3 = std::get<1>(ca);
-// { dg-error "static assertion failed" "" { target *-*-* } 274 }
-// { dg-error "static assertion failed" "" { target *-*-* } 283 }
-// { dg-error "static assertion failed" "" { target *-*-* } 291 }
+// { dg-error "static assertion failed" "" { target *-*-* } 285 }
+// { dg-error "static assertion failed" "" { target *-*-* } 294 }
+// { dg-error "static assertion failed" "" { target *-*-* } 302 }
diff --git a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc
index f80798c9108..0806ac16a35 100644
--- a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc
+++ b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc
@@ -23,4 +23,4 @@
typedef std::tuple_element<1, std::array<int, 1>>::type type;
-// { dg-error "static assertion failed" "" { target *-*-* } 320 }
+// { dg-error "static assertion failed" "" { target *-*-* } 331 }
diff --git a/libstdc++-v3/testsuite/25_algorithms/unique/11480.cc b/libstdc++-v3/testsuite/25_algorithms/unique/11480.cc
index b007feb935d..82745856de9 100644
--- a/libstdc++-v3/testsuite/25_algorithms/unique/11480.cc
+++ b/libstdc++-v3/testsuite/25_algorithms/unique/11480.cc
@@ -33,7 +33,7 @@ void test01()
{
bool test __attribute__((unused)) = true;
- std::unique(a, a+10, compare);
+ int* dummy __attribute__((unused)) = std::unique(a, a+10, compare);
VERIFY( compare_count == 9 );
}
diff --git a/libstdc++-v3/testsuite/26_numerics/complex/value_operations/constexpr2.cc b/libstdc++-v3/testsuite/26_numerics/complex/value_operations/constexpr2.cc
new file mode 100644
index 00000000000..9f157d37b00
--- /dev/null
+++ b/libstdc++-v3/testsuite/26_numerics/complex/value_operations/constexpr2.cc
@@ -0,0 +1,29 @@
+// { dg-do compile }
+// { dg-options "-std=gnu++14" }
+
+// Copyright (C) 2014 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#include <complex>
+
+int main()
+{
+ constexpr std::complex<int> c{};
+ constexpr auto r __attribute__((unused)) = real(c);
+ constexpr auto i __attribute__((unused)) = imag(c);
+}
+
diff --git a/libstdc++-v3/testsuite/29_atomics/atomic/62259.cc b/libstdc++-v3/testsuite/29_atomics/atomic/62259.cc
new file mode 100644
index 00000000000..42b45ec6aa3
--- /dev/null
+++ b/libstdc++-v3/testsuite/29_atomics/atomic/62259.cc
@@ -0,0 +1,59 @@
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-require-atomic-builtins "" }
+// { dg-require-cstdint "" }
+// { dg-options "-std=gnu++11" }
+// { dg-do compile }
+
+#include <atomic>
+#include <cstdint>
+
+using std::int32_t;
+using std::int64_t;
+
+// libstdc++/62259
+
+struct twoints {
+ int32_t a;
+ int32_t b;
+};
+
+static_assert( alignof(std::atomic<twoints>) == alignof(int64_t),
+ "std::atomic not suitably aligned" );
+
+// libstdc++/65147
+
+struct power_of_two_obj {
+ char c [8];
+};
+
+std::atomic<power_of_two_obj> obj1;
+
+static_assert( alignof(obj1) == alignof(int64_t),
+ "std::atomic not suitably aligned" );
+
+struct container_struct {
+ char c[1];
+ std::atomic<power_of_two_obj> ao;
+};
+
+container_struct obj2;
+
+static_assert( alignof(obj2.ao) == alignof(int64_t),
+ "std::atomic not suitably aligned" );
+
diff --git a/libstdc++-v3/testsuite/29_atomics/atomic/65147.cc b/libstdc++-v3/testsuite/29_atomics/atomic/65147.cc
new file mode 100644
index 00000000000..15c794e2b38
--- /dev/null
+++ b/libstdc++-v3/testsuite/29_atomics/atomic/65147.cc
@@ -0,0 +1,29 @@
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++11" }
+// { dg-do compile }
+
+#include <atomic>
+
+struct S16 {
+ char c[16];
+};
+
+static_assert( alignof(std::atomic<S16>) >= 16,
+ "atomic<S16> must be aligned to at least its size" );
+
diff --git a/libstdc++-v3/testsuite/29_atomics/atomic_integral/65147.cc b/libstdc++-v3/testsuite/29_atomics/atomic_integral/65147.cc
new file mode 100644
index 00000000000..1cc33e3b8d6
--- /dev/null
+++ b/libstdc++-v3/testsuite/29_atomics/atomic_integral/65147.cc
@@ -0,0 +1,33 @@
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++11" }
+// { dg-do compile }
+
+#include <atomic>
+
+static_assert( alignof(std::atomic<char>) >= sizeof(char),
+ "atomic<char> must be aligned to at least its size" );
+static_assert( alignof(std::atomic<short>) >= sizeof(short),
+ "atomic<short> must be aligned to at least its size" );
+static_assert( alignof(std::atomic<int>) >= sizeof(int),
+ "atomic<int> must be aligned to at least its size" );
+static_assert( alignof(std::atomic<long>) >= sizeof(long),
+ "atomic<long> must be aligned to at least its size" );
+static_assert( alignof(std::atomic<long long>) >= sizeof(long long),
+ "atomic<long long> must be aligned to at least its size" );
+