14 files changed, 1166 insertions, 167 deletions
diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index f8342ca722d..10893324d3f 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -65,8 +65,8 @@ AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  8A,  AARCH64_FL_FOR_ARCH
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, 0x50, 0x000, -1)
 
 /* Qualcomm ('Q') cores. */
-AARCH64_CORE("falkor",      falkor,    cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx,   0x51, 0xC00, -1)
-AARCH64_CORE("qdf24xx",     qdf24xx,   cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, qdf24xx,   0x51, 0xC00, -1)
+AARCH64_CORE("falkor",      falkor,    falkor,    8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_RDMA, qdf24xx,   0x51, 0xC00, -1)
+AARCH64_CORE("qdf24xx",     qdf24xx,   falkor,    8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO | AARCH64_FL_RDMA, qdf24xx,   0x51, 0xC00, -1)
 
 /* Samsung ('S') cores. */
 AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1,  0x53, 0x001, -1)
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index c0752ce3470..c4f059ab7c5 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -63,4 +63,8 @@ AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, 0, "fphp asimdhp")
 /* Enabling or disabling "rcpc" only changes "rcpc".  */
 AARCH64_OPT_EXTENSION("rcpc", AARCH64_FL_RCPC, 0, 0, "lrcpc")
 
+/* Enabling "rdma" also enables "fp", "simd".
+   Disabling "rdma" just disables "rdma".  */
+AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, AARCH64_FL_FP | AARCH64_FL_SIMD, 0, "rdma")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e397ff4afa7..beff28e2272 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -319,6 +319,7 @@ unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
 int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
+bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
@@ -326,6 +327,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 void aarch64_expand_call (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
+bool aarch64_float_const_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
 bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
 bool aarch64_gen_movmemqi (rtx *);
@@ -351,9 +353,9 @@ bool aarch64_pad_arg_upward (machine_mode, const_tree);
 bool aarch64_pad_reg_upward (machine_mode, const_tree, bool);
 bool aarch64_regno_ok_for_base_p (int, bool);
 bool aarch64_regno_ok_for_index_p (int, bool);
+bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
 					    bool high);
-bool aarch64_simd_imm_scalar_p (rtx x, machine_mode mode);
 bool aarch64_simd_imm_zero_p (rtx, machine_mode);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1cb6eeb3187..f74b68775cf 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -351,6 +351,35 @@
   }
 )
 
+(define_expand "xorsign<mode>3"
+  [(match_operand:VHSDF 0 "register_operand")
+   (match_operand:VHSDF 1 "register_operand")
+   (match_operand:VHSDF 2 "register_operand")]
+  "TARGET_SIMD"
+{
+
+  machine_mode imode = <V_cmp_result>mode;
+  rtx v_bitmask = gen_reg_rtx (imode);
+  rtx op1x = gen_reg_rtx (imode);
+  rtx op2x = gen_reg_rtx (imode);
+
+  rtx arg1 = lowpart_subreg (imode, operands[1], <MODE>mode);
+  rtx arg2 = lowpart_subreg (imode, operands[2], <MODE>mode);
+
+  int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
+
+  emit_move_insn (v_bitmask,
+		  aarch64_simd_gen_const_vector_dup (<V_cmp_result>mode,
+						     HOST_WIDE_INT_M1U << bits));
+
+  emit_insn (gen_and<v_cmp_result>3 (op2x, v_bitmask, arg2));
+  emit_insn (gen_xor<v_cmp_result>3 (op1x, arg1, op2x));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, op1x, imode));
+  DONE;
+}
+)
+
 (define_expand "copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
@@ -1033,6 +1062,18 @@
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
 )
 
+(define_insn "*aarch64_mla_elt_merge<mode>"
+  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+	(plus:VDQHS
+	  (mult:VDQHS (vec_duplicate:VDQHS
+		  (match_operand:<VEL> 1 "register_operand" "w"))
+		(match_operand:VDQHS 2 "register_operand" "w"))
+	  (match_operand:VDQHS 3 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
+  [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
+)
+
 (define_insn "aarch64_mls<mode>"
  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
        (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0")
@@ -1080,6 +1121,18 @@
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
 )
 
+(define_insn "*aarch64_mls_elt_merge<mode>"
+  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+	(minus:VDQHS
+	  (match_operand:VDQHS 1 "register_operand" "0")
+	  (mult:VDQHS (vec_duplicate:VDQHS
+		  (match_operand:<VEL> 2 "register_operand" "w"))
+		(match_operand:VDQHS 3 "register_operand" "w"))))]
+  "TARGET_SIMD"
+  "mls\t%0.<Vtype>, %3.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
+)
+
 ;; Max/Min operations.
 (define_insn "<su><maxmin><mode>3"
  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
@@ -5593,9 +5646,9 @@
   DONE;
 })
 
-;; Standard pattern name vec_init<mode>.
+;; Standard pattern name vec_init<mode><Vel>.
 
-(define_expand "vec_init<mode>"
+(define_expand "vec_init<mode><Vel>"
   [(match_operand:VALL_F16 0 "register_operand" "")
    (match_operand 1 "" "")]
   "TARGET_SIMD"
@@ -5650,9 +5703,9 @@
  "urecpe\\t%0.<Vtype>, %1.<Vtype>"
   [(set_attr "type" "neon_fp_recpe_<Vetype><q>")])
 
-;; Standard pattern name vec_extract<mode>.
+;; Standard pattern name vec_extract<mode><Vel>.
 
-(define_expand "vec_extract<mode>"
+(define_expand "vec_extract<mode><Vel>"
   [(match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "")
    (match_operand:VALL_F16 1 "register_operand" "")
    (match_operand:SI 2 "immediate_operand" "")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b8a4160d9de..28c4e0e6476 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -33,6 +33,7 @@
 #include "df.h"
 #include "tm_p.h"
 #include "stringpool.h"
+#include "attribs.h"
 #include "optabs.h"
 #include "regs.h"
 #include "emit-rtl.h"
@@ -147,6 +148,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 							 const_tree type,
 							 int misalignment,
 							 bool is_packed);
+static machine_mode
+aarch64_simd_container_mode (machine_mode mode, unsigned width);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -206,22 +209,6 @@ static const struct cpu_addrcost_table generic_addrcost_table =
   0 /* imm_offset  */
 };
 
-static const struct cpu_addrcost_table cortexa57_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0, /* imm_offset  */
-};
-
 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 {
     {
@@ -254,22 +241,6 @@ static const struct cpu_addrcost_table xgene1_addrcost_table =
   0, /* imm_offset  */
 };
 
-static const struct cpu_addrcost_table qdf24xx_addrcost_table =
-{
-    {
-      1, /* hi  */
-      0, /* si  */
-      0, /* di  */
-      1, /* ti  */
-    },
-  0, /* pre_modify  */
-  0, /* post_modify  */
-  0, /* register_offset  */
-  0, /* register_sextend  */
-  0, /* register_zextend  */
-  0 /* imm_offset  */
-};
-
 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 {
     {
@@ -390,13 +361,13 @@ static const struct cpu_vector_cost thunderx_vector_cost =
   3, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
   4, /* vec_int_stmt_cost  */
-  4, /* vec_fp_stmt_cost  */
+  1, /* vec_fp_stmt_cost  */
   4, /* vec_permute_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
   3, /* vec_align_load_cost  */
-  10, /* vec_unalign_load_cost  */
-  10, /* vec_unalign_store_cost  */
+  5, /* vec_unalign_load_cost  */
+  5, /* vec_unalign_store_cost  */
   1, /* vec_store_cost  */
   3, /* cond_taken_branch_cost  */
   3 /* cond_not_taken_branch_cost  */
@@ -488,20 +459,6 @@ static const struct cpu_branch_cost generic_branch_cost =
   3   /* Unpredictable.  */
 };
 
-/* Branch costs for Cortex-A57.  */
-static const struct cpu_branch_cost cortexa57_branch_cost =
-{
-  1,  /* Predictable.  */
-  3   /* Unpredictable.  */
-};
-
-/* Branch costs for Vulcan.  */
-static const struct cpu_branch_cost thunderx2t99_branch_cost =
-{
-  1,  /* Predictable.  */
-  3   /* Unpredictable.  */
-};
-
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
@@ -612,7 +569,7 @@ static const struct tune_params cortexa35_tunings =
   &generic_addrcost_table,
   &cortexa53_regmove_cost,
   &generic_vector_cost,
-  &cortexa57_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost  */
   1, /* issue_rate  */
@@ -638,7 +595,7 @@ static const struct tune_params cortexa53_tunings =
   &generic_addrcost_table,
   &cortexa53_regmove_cost,
   &generic_vector_cost,
-  &cortexa57_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost  */
   2, /* issue_rate  */
@@ -661,10 +618,10 @@ static const struct tune_params cortexa53_tunings =
 static const struct tune_params cortexa57_tunings =
 {
   &cortexa57_extra_costs,
-  &cortexa57_addrcost_table,
+  &generic_addrcost_table,
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
-  &cortexa57_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
@@ -687,10 +644,10 @@ static const struct tune_params cortexa57_tunings =
 static const struct tune_params cortexa72_tunings =
 {
   &cortexa57_extra_costs,
-  &cortexa57_addrcost_table,
+  &generic_addrcost_table,
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
-  &cortexa57_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
@@ -713,10 +670,10 @@ static const struct tune_params cortexa72_tunings =
 static const struct tune_params cortexa73_tunings =
 {
   &cortexa57_extra_costs,
-  &cortexa57_addrcost_table,
+  &generic_addrcost_table,
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
-  &cortexa57_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost.  */
   2, /* issue_rate.  */
@@ -842,7 +799,7 @@ static const struct tune_params xgene1_tunings =
 static const struct tune_params qdf24xx_tunings =
 {
   &qdf24xx_extra_costs,
-  &qdf24xx_addrcost_table,
+  &generic_addrcost_table,
   &qdf24xx_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
@@ -871,7 +828,7 @@ static const struct tune_params thunderx2t99_tunings =
   &thunderx2t99_addrcost_table,
   &thunderx2t99_regmove_cost,
   &thunderx2t99_vector_cost,
-  &thunderx2t99_branch_cost,
+  &generic_branch_cost,
   &generic_approx_modes,
   4, /* memmov_cost.  */
   4, /* issue_rate.  */
@@ -1876,6 +1833,31 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
       return 1;
     }
 
+  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
+     (with XXXX non-zero). In that case check to see if the move can be done in
+     a smaller mode.  */
+  val2 = val & 0xffffffff;
+  if (mode == DImode
+      && aarch64_move_imm (val2, SImode)
+      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
+    {
+      if (generate)
+	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+
+      /* Check if we have to emit a second instruction by checking to see
+         if any of the upper 32 bits of the original DI mode value is set.  */
+      if (val == val2)
+	return 1;
+
+      i = (val >> 48) ? 48 : 32;
+
+      if (generate)
+	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+				    GEN_INT ((val >> i) & 0xffff)));
+
+      return 2;
+    }
+
   if ((val >> 32) == 0 || mode == SImode)
     {
       if (generate)
@@ -3088,7 +3070,7 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
   reg = gen_rtx_REG (mode, regno);
   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
 			    plus_constant (Pmode, base_rtx, -adjustment));
-  mem = gen_rtx_MEM (mode, mem);
+  mem = gen_frame_mem (mode, mem);
 
   insn = emit_move_insn (mem, reg);
   RTX_FRAME_RELATED_P (insn) = 1;
@@ -3176,7 +3158,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
     {
       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
-      emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
+      emit_move_insn (reg1, gen_frame_mem (mode, mem));
     }
   else
     {
@@ -3252,8 +3234,6 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
 			   unsigned start, unsigned limit, bool skip_wb)
 {
   rtx_insn *insn;
-  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
-						 ? gen_frame_mem : gen_rtx_MEM);
   unsigned regno;
   unsigned regno2;
 
@@ -3274,8 +3254,8 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
 
       reg = gen_rtx_REG (mode, regno);
       offset = start_offset + cfun->machine->frame.reg_offset[regno];
-      mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
-					      offset));
+      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
+						offset));
 
       regno2 = aarch64_next_callee_save (regno + 1, limit);
 
@@ -3289,8 +3269,8 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
 	  rtx mem2;
 
 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
-	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
-						   offset));
+	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
+						     offset));
 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
 						    reg2));
 
@@ -3319,8 +3299,6 @@ aarch64_restore_callee_saves (machine_mode mode,
 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
 {
   rtx base_rtx = stack_pointer_rtx;
-  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
-						 ? gen_frame_mem : gen_rtx_MEM);
   unsigned regno;
   unsigned regno2;
   HOST_WIDE_INT offset;
@@ -3341,7 +3319,7 @@ aarch64_restore_callee_saves (machine_mode mode,
 
       reg = gen_rtx_REG (mode, regno);
       offset = start_offset + cfun->machine->frame.reg_offset[regno];
-      mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
+      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
 
       regno2 = aarch64_next_callee_save (regno + 1, limit);
 
@@ -3354,7 +3332,7 @@ aarch64_restore_callee_saves (machine_mode mode,
 	  rtx mem2;
 
 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
-	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
+	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
 
 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
@@ -4723,6 +4701,74 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
   return true;
 }
 
+/* Return the binary representation of floating point constant VALUE in INTVAL.
+   If the value cannot be converted, return false without setting INTVAL.
+   The conversion is done in the given MODE.  */
+bool
+aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
+{
+
+  /* We make a general exception for 0.  */
+  if (aarch64_float_const_zero_rtx_p (value))
+    {
+      *intval = 0;
+      return true;
+    }
+
+  machine_mode mode = GET_MODE (value);
+  if (GET_CODE (value) != CONST_DOUBLE
+      || !SCALAR_FLOAT_MODE_P (mode)
+      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
+      /* Only support up to DF mode.  */
+      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
+    return false;
+
+  unsigned HOST_WIDE_INT ival = 0;
+
+  long res[2];
+  real_to_target (res,
+		  CONST_DOUBLE_REAL_VALUE (value),
+		  REAL_MODE_FORMAT (mode));
+
+  if (mode == DFmode)
+    {
+      int order = BYTES_BIG_ENDIAN ? 1 : 0;
+      ival = zext_hwi (res[order], 32);
+      ival |= (zext_hwi (res[1 - order], 32) << 32);
+    }
+  else
+      ival = zext_hwi (res[0], 32);
+
+  *intval = ival;
+  return true;
+}
+
+/* Return TRUE if rtx X is an immediate constant that can be moved using a
+   single MOV(+MOVK) followed by an FMOV.  */
+bool
+aarch64_float_const_rtx_p (rtx x)
+{
+  machine_mode mode = GET_MODE (x);
+  if (mode == VOIDmode)
+    return false;
+
+  /* Determine whether it's cheaper to write float constants as
+     mov/movk pairs over ldr/adrp pairs.  */
+  unsigned HOST_WIDE_INT ival;
+
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode)
+      && aarch64_reinterpret_float_as_int (x, &ival))
+    {
+      machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
+      int num_instr = aarch64_internal_mov_immediate
+			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+      return num_instr < 3;
+    }
+
+  return false;
+}
+
 /* Return TRUE if rtx X is immediate constant 0.0 */
 bool
 aarch64_float_const_zero_rtx_p (rtx x)
@@ -4735,6 +4781,49 @@ aarch64_float_const_zero_rtx_p (rtx x)
   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
 }
 
+/* Return TRUE if rtx X is immediate constant that fits in a single
+   MOVI immediate operation.  */
+bool
+aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
+{
+  if (!TARGET_SIMD)
+     return false;
+
+  machine_mode vmode, imode;
+  unsigned HOST_WIDE_INT ival;
+
+  if (GET_CODE (x) == CONST_DOUBLE
+      && SCALAR_FLOAT_MODE_P (mode))
+    {
+      if (!aarch64_reinterpret_float_as_int (x, &ival))
+	return false;
+
+      /* We make a general exception for 0.  */
+      if (aarch64_float_const_zero_rtx_p (x))
+	return true;
+
+      imode = int_mode_for_mode (mode);
+    }
+  else if (GET_CODE (x) == CONST_INT
+	   && SCALAR_INT_MODE_P (mode))
+    {
+       imode = mode;
+       ival = INTVAL (x);
+    }
+  else
+    return false;
+
+   /* use a 64 bit mode for everything except for DI/DF mode, where we use
+     a 128 bit vector mode.  */
+  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
+
+  vmode = aarch64_simd_container_mode (imode, width);
+  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
+
+  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
+}
+
+
 /* Return the fixed registers used for condition codes.  */
 
 static bool
@@ -5929,12 +6018,6 @@ aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
       return NO_REGS;
     }
 
-  /* If it's an integer immediate that MOVI can't handle, then
-     FP_REGS is not an option, so we return NO_REGS instead.  */
-  if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
-      && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
-    return NO_REGS;
-
   /* Register eliminiation can result in a request for
      SP+constant->FP_REGS.  We cannot support such operations which
      use SP as source and an FP_REG as destination, so reject out
@@ -6884,6 +6967,25 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
       return true;
 
     case CONST_DOUBLE:
+
+      /* First determine number of instructions to do the move
+	  as an integer constant.  */
+      if (!aarch64_float_const_representable_p (x)
+	   && !aarch64_can_const_movi_rtx_p (x, mode)
+	   && aarch64_float_const_rtx_p (x))
+	{
+	  unsigned HOST_WIDE_INT ival;
+	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
+	  gcc_assert (succeed);
+
+	  machine_mode imode = mode == HFmode ? SImode
+					      : int_mode_for_mode (mode);
+	  int ncost = aarch64_internal_mov_immediate
+		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
+	  *cost += COSTS_N_INSNS (ncost);
+	  return true;
+	}
+
       if (speed)
 	{
 	  /* mov[df,sf]_aarch64.  */
@@ -10193,7 +10295,7 @@ aarch64_classify_symbol (rtx x, rtx offset)
 	  /* This is alright even in PIC code as the constant
 	     pool reference is always PC relative and within
 	     the same translation unit.  */
-	  if (CONSTANT_POOL_ADDRESS_P (x))
+	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
 	    return SYMBOL_SMALL_ABSOLUTE;
 	  else
 	    return SYMBOL_FORCE_TO_MEM;
@@ -10228,18 +10330,16 @@ aarch64_legitimate_pic_operand_p (rtx x)
 /* Return true if X holds either a quarter-precision or
      floating-point +0.0 constant.  */
 static bool
-aarch64_valid_floating_const (machine_mode mode, rtx x)
+aarch64_valid_floating_const (rtx x)
 {
   if (!CONST_DOUBLE_P (x))
     return false;
 
-  if (aarch64_float_const_zero_rtx_p (x))
+  /* This call determines which constants can be used in mov<mode>
+     as integer moves instead of constant loads.  */
+  if (aarch64_float_const_rtx_p (x))
     return true;
 
-  /* We only handle moving 0.0 to a TFmode register.  */
-  if (!(mode == SFmode || mode == DFmode))
-    return false;
-
   return aarch64_float_const_representable_p (x);
 }
 
@@ -10251,11 +10351,15 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x)
   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
     return false;
 
-  /* This could probably go away because
-     we now decompose CONST_INTs according to expand_mov_immediate.  */
+  /* For these cases we never want to use a literal load.
+     As such we have to prevent the compiler from forcing these
+     to memory.  */
   if ((GET_CODE (x) == CONST_VECTOR
        && aarch64_simd_valid_immediate (x, mode, false, NULL))
-      || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
+      || CONST_INT_P (x)
+      || aarch64_valid_floating_const (x)
+      || aarch64_can_const_movi_rtx_p (x, mode)
+      || aarch64_float_const_rtx_p (x))
 	return !targetm.cannot_force_const_mem (mode, x);
 
   if (GET_CODE (x) == HIGH
@@ -11538,23 +11642,6 @@ aarch64_mask_from_zextract_ops (rtx width, rtx pos)
 }
 
 bool
-aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
-{
-  HOST_WIDE_INT imm = INTVAL (x);
-  int i;
-
-  for (i = 0; i < 8; i++)
-    {
-      unsigned int byte = imm & 0xff;
-      if (byte != 0xff && byte != 0)
-       return false;
-      imm >>= 8;
-    }
-
-  return true;
-}
-
-bool
 aarch64_mov_operand_p (rtx x, machine_mode mode)
 {
   if (GET_CODE (x) == HIGH
@@ -12945,15 +13032,28 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
 }
 
 char*
-aarch64_output_scalar_simd_mov_immediate (rtx immediate,
-					  machine_mode mode)
+aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
 {
+
+  /* If a floating point number was passed and we desire to use it in an
+     integer mode do the conversion to integer.  */
+  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      unsigned HOST_WIDE_INT ival;
+      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
+	  gcc_unreachable ();
+      immediate = gen_int_mode (ival, mode);
+    }
+
   machine_mode vmode;
+  /* use a 64 bit mode for everything except for DI/DF mode, where we use
+     a 128 bit vector mode.  */
+  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
 
   gcc_assert (!VECTOR_MODE_P (mode));
-  vmode = aarch64_simd_container_mode (mode, 64);
+  vmode = aarch64_simd_container_mode (mode, width);
   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
-  return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
+  return aarch64_output_simd_mov_immediate (v_op, vmode, width);
 }
 
 /* Split operands into moves from op[1] + op[2] into op[0].  */
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 106cf3a5666..7f91edb5713 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -144,7 +144,8 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_CRC        (1 << 3)	/* Has CRC.  */
 /* ARMv8.1-A architecture extensions.  */
 #define AARCH64_FL_LSE	      (1 << 4)  /* Has Large System Extensions.  */
-#define AARCH64_FL_V8_1	      (1 << 5)  /* Has ARMv8.1-A extensions.  */
+#define AARCH64_FL_RDMA	      (1 << 5)  /* Has Round Double Multiply Add.  */
+#define AARCH64_FL_V8_1	      (1 << 6)  /* Has ARMv8.1-A extensions.  */
 /* ARMv8.2-A architecture extensions.  */
 #define AARCH64_FL_V8_2	      (1 << 8)  /* Has ARMv8.2-A features.  */
 #define AARCH64_FL_F16	      (1 << 9)  /* Has ARMv8.2-A FP16 extensions.  */
@@ -161,7 +162,8 @@ extern unsigned aarch64_architecture_version;
 /* Architecture flags that effect instruction selection.  */
 #define AARCH64_FL_FOR_ARCH8       (AARCH64_FL_FPSIMD)
 #define AARCH64_FL_FOR_ARCH8_1			       \
-  (AARCH64_FL_FOR_ARCH8 | AARCH64_FL_LSE | AARCH64_FL_CRC | AARCH64_FL_V8_1)
+  (AARCH64_FL_FOR_ARCH8 | AARCH64_FL_LSE | AARCH64_FL_CRC \
+   | AARCH64_FL_RDMA | AARCH64_FL_V8_1)
 #define AARCH64_FL_FOR_ARCH8_2			\
   (AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_V8_2)
 #define AARCH64_FL_FOR_ARCH8_3			\
@@ -174,7 +176,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_FP             (aarch64_isa_flags & AARCH64_FL_FP)
 #define AARCH64_ISA_SIMD           (aarch64_isa_flags & AARCH64_FL_SIMD)
 #define AARCH64_ISA_LSE		   (aarch64_isa_flags & AARCH64_FL_LSE)
-#define AARCH64_ISA_RDMA	   (aarch64_isa_flags & AARCH64_FL_V8_1)
+#define AARCH64_ISA_RDMA	   (aarch64_isa_flags & AARCH64_FL_RDMA)
 #define AARCH64_ISA_V8_2	   (aarch64_isa_flags & AARCH64_FL_V8_2)
 #define AARCH64_ISA_F16		   (aarch64_isa_flags & AARCH64_FL_F16)
 #define AARCH64_ISA_V8_3	   (aarch64_isa_flags & AARCH64_FL_V8_3)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f876a2b7208..64b60a903ed 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -181,6 +181,11 @@
 ;; will be disabled when !TARGET_FLOAT.
 (define_attr "fp" "no,yes" (const_string "no"))
 
+;; Attribute that specifies whether or not the instruction touches half
+;; precision fp registers.  When this is set to yes for an alternative,
+;; that alternative will be disabled when !TARGET_FP_F16INST.
+(define_attr "fp16" "no,yes" (const_string "no"))
+
 ;; Attribute that specifies whether or not the instruction touches simd
 ;; registers.  When this is set to yes for an alternative, that alternative
 ;; will be disabled when !TARGET_SIMD.
@@ -194,11 +199,14 @@
 ;; registers when -mgeneral-regs-only is specified.
 (define_attr "enabled" "no,yes"
   (cond [(ior
-	(and (eq_attr "fp" "yes")
-	     (eq (symbol_ref "TARGET_FLOAT") (const_int 0)))
-	(and (eq_attr "simd" "yes")
-	     (eq (symbol_ref "TARGET_SIMD") (const_int 0))))
-	     (const_string "no")
+	    (ior
+		(and (eq_attr "fp" "yes")
+		     (eq (symbol_ref "TARGET_FLOAT") (const_int 0)))
+		(and (eq_attr "simd" "yes")
+		     (eq (symbol_ref "TARGET_SIMD") (const_int 0))))
+	    (and (eq_attr "fp16" "yes")
+		 (eq (symbol_ref "TARGET_FP_F16INST") (const_int 0))))
+	    (const_string "no")
 	] (const_string "yes")))
 
 ;; Attribute that specifies whether we are dealing with a branch to a
@@ -223,6 +231,7 @@
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a57.md")
 (include "../arm/exynos-m1.md")
+(include "falkor.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
 (include "thunderx2t99.md")
@@ -920,8 +929,8 @@
 )
 
 (define_insn_and_split "*movsi_aarch64"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w,r,*w")
-	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r  ,*w, r,*w,w")
+	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w,Ds"))]
   "(register_operand (operands[0], SImode)
     || aarch64_reg_or_zero (operands[1], SImode))"
   "@
@@ -938,8 +947,9 @@
    adrp\\t%x0, %A1
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
-   fmov\\t%s0, %s1"
-   "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+   fmov\\t%s0, %s1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
+  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
    [(const_int 0)]
    "{
@@ -947,13 +957,14 @@
        DONE;
     }"
   [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-                     adr,adr,f_mcr,f_mrc,fmov")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes")]
+		    adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
+   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w,m,  m,r,r,  *w,r,*w,w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r,*w,m,  m,r,r,  *w,r,*w,w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,m, m,rZ,*w,Usa,Ush,rZ,w,*w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -961,6 +972,7 @@
    mov\\t%0, %x1
    mov\\t%x0, %1
    mov\\t%x0, %1
+   mov\\t%w0, %1
    #
    ldr\\t%x0, %1
    ldr\\t%d0, %1
@@ -971,7 +983,7 @@
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
-   movi\\t%d0, %1"
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
    "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode))
     && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
    [(const_int 0)]
@@ -979,10 +991,10 @@
        aarch64_expand_mov_immediate (operands[0], operands[1]);
        DONE;
     }"
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-                     adr,adr,f_mcr,f_mrc,fmov,neon_move")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
-   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,load1,\
+                     load1,store1,store1,adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "fp" "*,*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
+   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
 
 (define_insn "insv_imm<mode>"
@@ -1062,28 +1074,31 @@
 )
 
 (define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,m,w,m,rY,r"))]
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
     || aarch64_reg_or_fp_zero (operands[1], HFmode))"
   "@
    movi\\t%0.4h, #0
-   mov\\t%0.h[0], %w1
+   fmov\\t%h0, %w1
    umov\\t%w0, %1.h[0]
    mov\\t%0.h[0], %1.h[0]
+   fmov\\t%h0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%h0, %1
    str\\t%h1, %0
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+		     neon_move,f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")
+   (set_attr "fp16"   "*,yes,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
     || aarch64_reg_or_fp_zero (operands[1], SFmode))"
   "@
@@ -1092,19 +1107,22 @@
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
    ldr\\t%s0, %1
    str\\t%s1, %0
    ldr\\t%w0, %1
    str\\t%w1, %0
-   mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
-                     f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%w0, %w1
+   mov\\t%w0, %1"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
+		     f_loads,f_stores,load1,store1,mov_reg,\
+		     fconsts")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
 (define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:DF 1 "general_operand"      "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:DF 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
   "TARGET_FLOAT && (register_operand (operands[0], DFmode)
     || aarch64_reg_or_fp_zero (operands[1], DFmode))"
   "@
@@ -1113,14 +1131,37 @@
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    fmov\\t%d0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);
    ldr\\t%d0, %1
    str\\t%d1, %0
    ldr\\t%x0, %1
    str\\t%x1, %0
-   mov\\t%x0, %x1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,\
-                     f_loadd,f_stored,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%x0, %x1
+   mov\\t%x0, %1"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
+		     f_loadd,f_stored,load1,store1,mov_reg,\
+		     fconstd")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
+)
+
+(define_split
+  [(set (match_operand:GPF_HF 0 "nonimmediate_operand")
+	(match_operand:GPF_HF 1 "general_operand"))]
+  "can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  {
+    unsigned HOST_WIDE_INT ival;
+    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+      FAIL;
+
+    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
+    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
+    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+    DONE;
+  }
 )
 
 (define_insn "*movtf_aarch64"
@@ -3835,6 +3876,22 @@
   [(set_attr "type" "logics_reg,logics_imm")]
 )
 
+(define_split
+  [(set (reg:CC_NZ CC_REGNUM)
+	(compare:CC_NZ
+	 (and:GPI (match_operand:GPI 0 "register_operand")
+		  (match_operand:GPI 1 "aarch64_mov_imm_operand"))
+	 (const_int 0)))
+   (clobber (match_operand:SI 2 "register_operand"))]
+  ""
+  [(set (match_dup 2) (match_dup 1))
+   (set (reg:CC_NZ CC_REGNUM)
+	(compare:CC_NZ
+	 (and:GPI (match_dup 0)
+		  (match_dup 2))
+	 (const_int 0)))]
+)
+
 (define_insn "*and<mode>3nr_compare0_zextract"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -3870,6 +3927,26 @@
   [(set_attr "type" "logics_shift_imm")]
 )
 
+(define_split
+  [(set (reg:CC_NZ CC_REGNUM)
+	(compare:CC_NZ
+	 (and:GPI (SHIFT:GPI
+		   (match_operand:GPI 0 "register_operand")
+		   (match_operand:QI 1 "aarch64_shift_imm_<mode>"))
+		  (match_operand:GPI 2 "aarch64_mov_imm_operand"))
+	(const_int 0)))
+    (clobber (match_operand:SI 3 "register_operand"))]
+  ""
+  [(set (match_dup 3) (match_dup 2))
+   (set (reg:CC_NZ CC_REGNUM)
+	(compare:CC_NZ
+	 (and:GPI (SHIFT:GPI
+		   (match_dup 0)
+		   (match_dup 1))
+		  (match_dup 3))
+	 (const_int 0)))]
+)
+
 ;; -------------------------------------------------------------------
 ;; Shifts
 ;; -------------------------------------------------------------------
@@ -5102,6 +5179,42 @@
 }
 )
 
+;; For xorsign (x, y), we want to generate:
+;;
+;; LDR   d2, #1<<63
+;; AND   v3.8B, v1.8B, v2.8B
+;; EOR   v0.8B, v0.8B, v3.8B
+;;
+
+(define_expand "xorsign<mode>3"
+  [(match_operand:GPF 0 "register_operand")
+   (match_operand:GPF 1 "register_operand")
+   (match_operand:GPF 2 "register_operand")]
+  "TARGET_FLOAT && TARGET_SIMD"
+{
+
+  machine_mode imode = <V_cmp_result>mode;
+  rtx mask = gen_reg_rtx (imode);
+  rtx op1x = gen_reg_rtx (imode);
+  rtx op2x = gen_reg_rtx (imode);
+
+  int bits = GET_MODE_BITSIZE (<MODE>mode) - 1;
+  emit_move_insn (mask, GEN_INT (trunc_int_for_mode (HOST_WIDE_INT_M1U << bits,
+						     imode)));
+
+  emit_insn (gen_and<v_cmp_result>3 (op2x, mask,
+				     lowpart_subreg (imode, operands[2],
+						     <MODE>mode)));
+  emit_insn (gen_xor<v_cmp_result>3 (op1x,
+				     lowpart_subreg (imode, operands[1],
+						     <MODE>mode),
+				     op2x));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, op1x, imode));
+  DONE;
+}
+)
+
 ;; -------------------------------------------------------------------
 ;; Reload support
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0753da32f59..d7b30b0e5ee 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -12162,7 +12162,7 @@ vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
 
 /* ARMv8.1-A instrinsics.  */
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.1-a")
+#pragma GCC target ("+nothing+rdma")
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 88e840f2898..9ce3d4efaf3 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -176,6 +176,12 @@
   (and (match_code "const_double")
        (match_test "aarch64_float_const_representable_p (op)")))
 
+(define_constraint "Uvi"
+  "A floating point constant which can be used with a\
+   MOVI immediate operation."
+  (and (match_code "const_double")
+       (match_test "aarch64_can_const_movi_rtx_p (op, GET_MODE (op))")))
+
 (define_constraint "Dn"
   "@internal
  A constraint that matches vector of immediates."
@@ -220,9 +226,17 @@
 
 (define_constraint "Dd"
   "@internal
- A constraint that matches an immediate operand valid for AdvSIMD scalar."
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in DImode."
+ (and (match_code "const_int")
+      (match_test "aarch64_can_const_movi_rtx_p (op, DImode)")))
+
+(define_constraint "Ds"
+  "@internal
+ A constraint that matches an integer immediate operand valid\
+ for AdvSIMD scalar operations in SImode."
  (and (match_code "const_int")
-      (match_test "aarch64_simd_imm_scalar_p (op, GET_MODE (op))")))
+      (match_test "aarch64_can_const_movi_rtx_p (op, SImode)")))
 
 (define_address_constraint "Dp"
   "@internal
diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.c b/gcc/config/aarch64/cortex-a57-fma-steering.c
index 6d90acdd4a2..fa8c56aab02 100644
--- a/gcc/config/aarch64/cortex-a57-fma-steering.c
+++ b/gcc/config/aarch64/cortex-a57-fma-steering.c
@@ -973,10 +973,17 @@ func_fma_steering::analyze ()
 		break;
 	    }
 
-	  /* We didn't find a chain with a def for this instruction.  */
-	  gcc_assert (i < dest_op_info->n_chains);
-
-	  this->analyze_fma_fmul_insn (forest, chain, head);
+	  /* Due to implementation of regrename, dest register can slip away
+	     from regrename's analysis.  As a result, there is no chain for
+	     the destination register of insn.  We simply skip the insn even
+	     it is a fmul/fmac instruction.  This can happen when the dest
+	     register is also a source register of insn and one of the below
+	     conditions is satisfied:
+	       1) the source reg is setup in larger mode than this insn;
+	       2) the source reg is uninitialized;
+	       3) the source reg is passed in as parameter.  */
+	  if (i < dest_op_info->n_chains)
+	    this->analyze_fma_fmul_insn (forest, chain, head);
 	}
     }
   free (bb_dfs_preorder);
diff --git a/gcc/config/aarch64/falkor.md b/gcc/config/aarch64/falkor.md
new file mode 100644
index 00000000000..b422ab30c44
--- /dev/null
+++ b/gcc/config/aarch64/falkor.md
@@ -0,0 +1,681 @@
+;; Falkor pipeline description
+;; Copyright (C) 2017 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "falkor")
+
+;; Complex int instructions (e.g. multiply and divide) execute in the X
+;; pipeline.  Simple int instructions execute in the X, Y, and Z pipelines.
+
+(define_cpu_unit "falkor_x" "falkor")
+(define_cpu_unit "falkor_y" "falkor")
+(define_cpu_unit "falkor_z" "falkor")
+
+;; Branches execute in the B pipeline or in one of the int pipelines depending
+;; on how complex it is.  Simple int insns (like movz) can also execute here.
+
+(define_cpu_unit "falkor_b" "falkor")
+
+;; Vector and FP insns execute in the VX and VY pipelines.
+
+(define_automaton "falkor_vfp")
+
+(define_cpu_unit "falkor_vx" "falkor_vfp")
+(define_cpu_unit "falkor_vy" "falkor_vfp")
+
+;; Loads execute in the LD pipeline.
+;; Stores execute in the ST, SD, and VSD pipelines, for address, data, and
+;; vector data.
+
+(define_automaton "falkor_mem")
+
+(define_cpu_unit "falkor_ld" "falkor_mem")
+(define_cpu_unit "falkor_st" "falkor_mem")
+(define_cpu_unit "falkor_sd" "falkor_mem")
+(define_cpu_unit "falkor_vsd" "falkor_mem")
+
+;; The GTOV and VTOG pipelines are for general to vector reg moves, and vice
+;; versa.
+
+(define_cpu_unit "falkor_gtov" "falkor")
+(define_cpu_unit "falkor_vtog" "falkor")
+
+;; Common reservation combinations.
+
+(define_reservation "falkor_vxvy" "falkor_vx|falkor_vy")
+(define_reservation "falkor_zb"   "falkor_z|falkor_b")
+(define_reservation "falkor_xyz"  "falkor_x|falkor_y|falkor_z")
+(define_reservation "falkor_xyzb" "falkor_x|falkor_y|falkor_z|falkor_b")
+
+;; SIMD Floating-Point Instructions
+
+(define_insn_reservation "falkor_afp_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_neg_s,neon_fp_neg_d,neon_fp_abs_s,neon_fp_abs_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_1_vxvy_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_2_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_minmax_s,neon_fp_minmax_d,neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_d,neon_fp_compare_s,neon_fp_compare_d,neon_fp_round_s,neon_fp_round_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_2_vxvy_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_minmax_s_q,neon_fp_minmax_d_q,neon_fp_compare_s_q,neon_fp_compare_d_q,neon_fp_round_s_q,neon_fp_round_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_3_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_reduc_minmax_s_q,neon_fp_reduc_minmax_d_q,neon_fp_abd_s,neon_fp_abd_d,neon_fp_addsub_s,neon_fp_addsub_d,neon_fp_reduc_add_s,neon_fp_reduc_add_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_3_vxvy_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_abd_s_q,neon_fp_abd_d_q,neon_fp_addsub_s_q,neon_fp_addsub_d_q,neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_4_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_to_int_s,neon_fp_to_int_d,neon_int_to_fp_s,neon_int_to_fp_d,neon_fp_cvt_widen_h,neon_fp_cvt_widen_s"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_4_vxvy_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_to_int_s_q,neon_fp_to_int_d_q,neon_int_to_fp_s_q,neon_int_to_fp_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_5_vxvy_mul" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mul_s,neon_fp_mul_s_scalar"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_5_vxvy_mla" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mla_s,neon_fp_mla_s_scalar"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_5_vxvy_vxvy_mul" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mul_s_q,neon_fp_mul_s_scalar_q"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_5_vxvy_vxvy_mla" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mla_s_q,neon_fp_mla_s_scalar_q"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_6_vxvy_mul" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mul_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_6_vxvy_mla" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mla_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_6_vxvy_vxvy_mul" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mul_d_q,neon_fp_mul_d_scalar_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_6_vxvy_vxvy_mla" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_mla_d_q,neon_fp_mla_d_scalar_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_4_vxvy_vxvy_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_cvt_narrow_s_q,neon_fp_cvt_narrow_d_q"))
+  "falkor_vxvy+falkor_vxvy,falkor_vxvy")
+
+(define_insn_reservation "falkor_afp_6_vx_vy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_div_s"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_afp_11_vx_vy" 11
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_div_d"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_afp_6_vx_vy_vx_vy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_div_s_q"))
+  "(falkor_vx+falkor_vy),(falkor_vx+falkor_vy)")
+
+(define_insn_reservation "falkor_afp_11_vx_vy_vx_vy" 11
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_div_d_q"))
+  "(falkor_vx+falkor_vy),(falkor_vx+falkor_vy)")
+
+(define_insn_reservation "falkor_afp_12_vx_vy" 12
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_sqrt_s"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_afp_22_vx_vy" 22
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_sqrt_d"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_afp_12_vx_vy_vx_vy" 12
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_sqrt_s_q"))
+  "(falkor_vx+falkor_vy),(falkor_vx+falkor_vy)")
+
+(define_insn_reservation "falkor_afp_22_vx_vy_vx_vy" 22
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_sqrt_d_q"))
+  "(falkor_vx+falkor_vy),(falkor_vx+falkor_vy)")
+
+;; SIMD Integer Instructions
+
+(define_insn_reservation "falkor_ai_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_add,neon_reduc_add,neon_logic,neon_neg,neon_sub"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_1_vxvy_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_shift_imm_long,neon_add_q,neon_reduc_add_q,neon_logic_q,neon_neg_q,neon_sub_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_2_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_add_long,neon_sub_long,neon_add_halve,neon_sub_halve,neon_shift_imm,neon_shift_reg,neon_minmax,neon_abs,neon_compare,neon_compare_zero,neon_tst"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_2_vxvy_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_add_halve_q,neon_sub_halve_q,neon_shift_imm_q,neon_shift_reg_q,neon_minmax_q,neon_abs_q,neon_compare_q,neon_compare_zero_q,neon_tst_q,neon_reduc_add_long"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_3_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_shift_acc,neon_reduc_add_acc,neon_abd,neon_qadd,neon_qsub,neon_qabs,neon_qneg,neon_sat_shift_imm,neon_sat_shift_imm_narrow_q,neon_sat_shift_reg,neon_reduc_minmax"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_reduc_minmax_q"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_3_vxvy_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_shift_acc_q,neon_reduc_add_acc_q,neon_abd_q,neon_abd_long,neon_qadd_q,neon_qsub_q,neon_qabs_q,neon_qneg_q,neon_sat_shift_imm_q,neon_sat_shift_reg_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy_mul" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_mul_b,neon_mul_h,neon_mul_s,neon_mul_h_scalar,neon_mul_s_scalar,neon_sat_mul_b,neon_sat_mul_h,neon_sat_mul_s,neon_sat_mul_h_scalar,neon_sat_mul_s_scalar"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy_mla" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_mla_b,neon_mla_h,neon_mla_s,neon_mla_h_scalar,neon_mla_s_scalar"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy_vxvy_mul" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,neon_mul_h_scalar_q,neon_mul_s_scalar_q,neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,neon_mul_d_long,neon_mul_h_scalar_long,neon_mul_s_scalar_long,neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,neon_sat_mul_h_scalar_q,neon_sat_mul_s_scalar_q,neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy_vxvy_mla" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,neon_mla_h_scalar_q,neon_mla_s_scalar_q,neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,neon_mla_h_scalar_long,neon_mla_s_scalar_long,neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_ai_4_vxvy_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_add_halve_narrow_q,neon_sub_halve_narrow_q,neon_arith_acc"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_2_ai_vxvy_vxvy_vxvy_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_add_widen,neon_sub_widen"))
+  "(falkor_vxvy+falkor_vxvy),(falkor_vxvy+falkor_vxvy)")
+
+(define_insn_reservation "falkor_4_ai_vxvy_vxvy_vxvy_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_arith_acc_q"))
+  "(falkor_vxvy+falkor_vxvy),(falkor_vxvy+falkor_vxvy)")
+
+;; SIMD Load Instructions
+
+(define_insn_reservation "falkor_ald_4_ld" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q,neon_load1_all_lanes,neon_load2_one_lane"))
+  "falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_none" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_2reg,neon_load2_2reg,neon_load2_all_lanes"))
+  "falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_ld" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_2reg_q,neon_load2_2reg_q,neon_load2_all_lanes_q,neon_load3_one_lane,neon_load4_one_lane,neon_ldp,neon_ldp_q"))
+  "falkor_ld,falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_ld_none" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_3reg,neon_load3_3reg,neon_load3_all_lanes"))
+  "falkor_ld,falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_ld_ld" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_3reg_q,neon_load3_3reg_q,neon_load3_all_lanes_q"))
+  "falkor_ld,falkor_ld,falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_ld_none_none" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_4reg,neon_load4_4reg"))
+  "falkor_ld,falkor_ld")
+
+(define_insn_reservation "falkor_ald_4_ld_ld_ld_ld" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_load1_4reg_q,neon_load4_4reg_q,neon_load4_all_lanes,neon_load4_all_lanes_q"))
+  "falkor_ld,falkor_ld,falkor_ld,falkor_ld")
+
+;; Arithmetic and Logical Instructions
+
+(define_insn_reservation "falkor_alu_1_xyz" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "alus_sreg,alus_imm,alus_shift_imm,csel,adc_reg,alu_imm,alu_sreg,alu_shift_imm,alu_ext,alus_ext,logic_imm,logic_reg,logic_shift_imm,logics_imm,logics_reg,logics_shift_imm,mov_reg"))
+  "falkor_xyz")
+
+;; SIMD Miscellaneous Instructions
+
+;; No separate type for ins and dup.  But this is correct for both.
+
+(define_insn_reservation "falkor_am_3_gtov" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_from_gp"))
+  "falkor_gtov")
+
+;; No separate type for ins and dup.  Assuming dup is more common.  Ins is
+;; gtov+vxvy and latency of 4.
+
+(define_insn_reservation "falkor_am_3_gtov_gtov" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_from_gp_q"))
+  "falkor_gtov,falkor_gtov")
+
+;; neon_to_gp_q is used for 32-bit ARM instructions that move 64-bits of data
+;; so no use needed here.
+
+(define_insn_reservation "falkor_am_3_vtog" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_to_gp"))
+  "falkor_vtog")
+
+(define_insn_reservation "falkor_am_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_bsl,neon_dup,neon_ext,neon_ins,neon_ins_q,neon_move,neon_rev,neon_tbl1,neon_permute,neon_shift_imm_narrow_q"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_am_1_vxvy_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_bsl_q,neon_dup_q,neon_ext_q,neon_move_q,neon_rev_q,neon_tbl1_q,neon_permute_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_am_2_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_cls,neon_cnt,neon_rbit"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_am_4_vxvy_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_cls_q,neon_cnt_q,neon_rbit_q,neon_tbl2"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_am_3_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recpe_s,neon_fp_recpe_d,neon_fp_rsqrte_s,neon_fp_rsqrte_d,neon_fp_recpx_s,neon_fp_recpx_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_am_3_vxvy_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recpe_s_q,neon_fp_recpe_d_q,neon_fp_rsqrte_s_q,neon_fp_rsqrte_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_am_5_vxvy" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recps_s"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_am_5_vxvy_vxvy" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recps_s_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_am_6_vxvy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recps_d,neon_fp_rsqrts_d"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_am_6_vxvy_vxvy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_fp_recps_d_q,neon_fp_rsqrts_d_q"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_am_5_vxvy_vxvy_vxvy" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_tbl2_q,neon_tbl3"))
+  "(falkor_vxvy+falkor_vxvy),falkor_vxvy")
+
+(define_insn_reservation "falkor_am_6_vxvy_vxvy_vxvy_vxvy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_tbl3_q,neon_tbl4"))
+  "(falkor_vxvy+falkor_vxvy),(falkor_vxvy+falkor_vxvy)")
+
+(define_insn_reservation "falkor_am_7_vxvy_vxvy_vxvy_vxvy_vxvy" 7
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_tbl4_q"))
+  "(falkor_vxvy+falkor_vxvy),(falkor_vxvy+falkor_vxvy),falkor_vxvy")
+
+;; SIMD Store Instructions
+
+;; ??? stp is neon_store1_2reg in aarch64.md, but neon_stp in aarch64-simd.md.
+;; Similarly with ldp.
+
+(define_insn_reservation "falkor_ast_st_vsd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_store1_1reg,neon_store1_1reg_q,neon_store1_one_lane,neon_store1_one_lane_q,neon_store1_2reg,neon_store2_2reg,neon_store2_one_lane,neon_store2_one_lane_q,neon_stp"))
+  "falkor_st+falkor_vsd")
+
+(define_insn_reservation "falkor_as_0_st_vsd_st_vsd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_store1_2reg_q,neon_store1_3reg,neon_store1_4reg,neon_store2_2reg_q,neon_store3_3reg,neon_store4_4reg,neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_one_lane,neon_store4_one_lane_q,neon_stp_q"))
+  "(falkor_st+falkor_vsd),(falkor_st+falkor_vsd)")
+
+(define_insn_reservation "falkor_as_0_st_vsd_st_vsd_st_vsd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_store1_3reg_q,neon_store3_3reg_q"))
+  "(falkor_st+falkor_vsd),(falkor_st+falkor_vsd),(falkor_st+falkor_vsd)")
+
+(define_insn_reservation "falkor_as_0_st_vsd_st_vsd_st_vsd_st_vsd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "neon_store1_4reg_q,neon_store4_4reg_q"))
+  "(falkor_st+falkor_vsd),(falkor_st+falkor_vsd),(falkor_st+falkor_vsd),(falkor_st+falkor_vsd)")
+
+;; Branch Instructions
+
+(define_insn_reservation "falkor_branch_0_zb" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "branch"))
+  "falkor_zb")
+
+(define_insn_reservation "falkor_call_0_xyzb" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "call"))
+  "falkor_xyzb")
+
+;; Cryptography Extensions
+
+(define_insn_reservation "falkor_cry_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_sha1_fast"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_cry_2_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_aesmc"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_cry_2_vxvy_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_sha1_xor,crypto_sha256_fast,crypto_pmull"))
+  "falkor_vxvy+falkor_vxvy")
+
+(define_insn_reservation "falkor_cry_4_vy_vx" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_sha1_slow"))
+  "falkor_vy+falkor_vx")
+
+(define_insn_reservation "falkor_cry_6_vy_vx" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_sha256_slow"))
+  "falkor_vy+falkor_vx")
+
+(define_insn_reservation "falkor_cry_3_vxvy_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crypto_aese"))
+  "falkor_vxvy+falkor_vxvy")
+
+;; FP Load Instructions
+
+(define_insn_reservation "falkor_fld_4_ld" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_loads,f_loadd"))
+  "falkor_ld")
+
+;; No separate FP store section, these are found in the SIMD store section.
+
+(define_insn_reservation "falkor_fld_0_st_vsd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_stores,f_stored"))
+  "falkor_st+falkor_vsd")
+
+;; FP Data Processing Instructions
+
+(define_insn_reservation "falkor_fpdt_0_vxvy" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fcmps,fcmpd,fccmps,fccmpd"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_5_vtog" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_cvtf2i"))
+  "falkor_vtog")
+
+(define_insn_reservation "falkor_fpdt_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "ffariths,ffarithd,fcsel"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_2_vxvy" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_minmaxd,f_minmaxs,f_rintd,f_rints"))
+  "falkor_vxvy")
+
+;; Scalar FP ABD is handled same as vector FP ABD.
+
+(define_insn_reservation "falkor_fpdt_3_vxvy" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "faddd,fadds"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_4_vxvy" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_cvt"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_5_vxvy_mul" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fmuls"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_5_vxvy_mla" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fmacs,ffmas"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_6_vxvy_mul" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fmuld"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_6_vxvy_mla" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fmacd,ffmad"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_6_vx_vy" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fdivs"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_fpdt_11_vx_vy" 11
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fdivd"))
+  "falkor_vx+falkor_vy")
+
+(define_insn_reservation "falkor_fpdt_12_vx_vy" 12
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fsqrts"))
+  "falkor_vxvy")
+
+(define_insn_reservation "falkor_fpdt_22_vx_vy" 22
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fsqrtd"))
+  "falkor_vxvy")
+
+;; FP Miscellaneous Instructions
+
+(define_insn_reservation "falkor_fpmsc_3_vtog" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_mrc"))
+  "falkor_vtog")
+
+(define_insn_reservation "falkor_fpmsc_3_gtov" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_mcr"))
+  "falkor_gtov")
+
+(define_insn_reservation "falkor_fpmsc_1_vxvy" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "fmov,fconsts,fconstd"))
+  "falkor_vxvy")
+
+;; No separate type for float-to-fixed conversions.  Same type as
+;; float-to-int conversions.  They schedule the same though, so no problem.
+
+(define_insn_reservation "falkor_fpmsc_6_gtov" 6
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "f_cvti2f"))
+  "falkor_gtov")
+
+;; Load Instructions
+
+(define_insn_reservation "falkor_ld_3_ld" 3
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "load1,load2"))
+  "falkor_ld")
+
+;; Miscellaneous Data-Processing Instructions
+
+(define_insn_reservation "falkor_misc_1_xyz" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "bfx,bfm,extend,rotate_imm,shift_imm"))
+  "falkor_xyz")
+
+(define_insn_reservation "falkor_misc_2_x" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "crc"))
+  "falkor_x")
+
+(define_insn_reservation "falkor_misc_2_xyz" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "clz,rbit,rev"))
+  "falkor_xyz")
+
+;; Divide and Multiply Instructions
+
+(define_insn_reservation "falkor_muldiv_4_x_mul" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "mul"))
+  "falkor_x")
+
+(define_insn_reservation "falkor_muldiv_4_x_mla" 4
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "mla,smlal,umlal"))
+  "falkor_x")
+
+(define_insn_reservation "falkor_muldiv_5_x_mul" 5
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "smull,umull"))
+  "falkor_x")
+
+(define_insn_reservation "falkor_md_11_x_z" 11
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "sdiv,udiv"))
+  "falkor_x+falkor_z")
+
+;; Move and Shift Instructions
+
+(define_insn_reservation "falkor_mvs_1_xyz" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "mov_imm,shift_reg"))
+  "falkor_xyz")
+
+(define_insn_reservation "falkor_mvs_1_xyzb" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "adr"))
+  "falkor_xyzb")
+
+;; Other Instructions
+
+;; Block is for instruction scheduling blockage insns in RTL.  There are no
+;; hardware instructions emitted for them, so don't use any resources.
+
+(define_insn_reservation "falkor_other_0_nothing" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "no_insn,trap,block"))
+  "nothing")
+
+(define_insn_reservation "falkor_other_2_z" 2
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "mrs"))
+  "falkor_z")
+
+;; Assume multiple instructions use all pipes.
+
+(define_insn_reservation "falkor_extra" 1
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "multiple"))
+  "falkor_x+falkor_y+falkor_z+falkor_b+falkor_vx+falkor_vy+falkor_ld+falkor_st+falkor_sd+falkor_vsd+falkor_gtov+falkor_vtog")
+
+;; Store Instructions
+
+;; No use of store_rel, store3, or store4 in aarch64.
+
+(define_insn_reservation "falkor_st_0_st_sd" 0
+  (and (eq_attr "tune" "falkor")
+       (eq_attr "type" "store1,store2"))
+  "falkor_st+falkor_sd")
+
+;; Muliply bypasses.
+
+;; 1 cycle latency (0 bubble) for an integer mul or mac feeding into a mac.
+
+(define_bypass 1
+  "falkor_ai_4_vxvy_mul,falkor_ai_4_vxvy_mla,falkor_ai_4_vxvy_vxvy_mul,falkor_ai_4_vxvy_vxvy_mla,falkor_muldiv_4_x_mul,falkor_muldiv_4_x_mla,falkor_muldiv_5_x_mul"
+  "falkor_ai_4_vxvy_mla,falkor_ai_4_vxvy_vxvy_mla,falkor_muldiv_4_x_mla")
+
+;; 3 cycle latency (2 bubbles) for an FP mul or mac feeding into a mac.
+
+(define_bypass 3
+  "falkor_afp_5_vxvy_mul,falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mul,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mul,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mul,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mul,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mul,falkor_fpdt_6_vxvy_mla"
+  "falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mla")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 43be7fd3611..cceb57525c7 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -44,6 +44,9 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_F16 [(HF "AARCH64_ISA_F16") SF DF])
 
+;; Iterator for all scalar floating point modes (HF, SF, DF)
+(define_mode_iterator GPF_HF [HF SF DF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
@@ -520,6 +523,17 @@
 			(SI   "SI") (HI   "HI")
 			(QI   "QI")])
 
+;; Define element mode for each vector mode (lower case).
+(define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
+			(V4HI "hi") (V8HI "hi")
+			(V2SI "si") (V4SI "si")
+			(DI "di")   (V2DI "di")
+			(V4HF "hf") (V8HF "hf")
+			(V2SF "sf") (V4SF "sf")
+			(V2DF "df") (DF "df")
+			(SI   "si") (HI   "hi")
+			(QI   "qi")])
+
 ;; 64-bit container modes the inner or scalar source mode.
 (define_mode_attr VCOND [(HI "V4HI") (SI "V2SI")
 			 (V4HI "V4HI") (V8HI "V4HI")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index ad8a43c2b2c..11243c4ce00 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -114,6 +114,10 @@
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "aarch64_logical_immediate")))
 
+(define_predicate "aarch64_mov_imm_operand"
+  (and (match_code "const_int")
+       (match_test "aarch64_move_imm (INTVAL (op), mode)")))
+
 (define_predicate "aarch64_logical_and_immediate"
   (and (match_code "const_int")
        (match_test "aarch64_and_bitmask_imm (INTVAL (op), mode)")))
diff --git a/gcc/config/aarch64/rtems.h b/gcc/config/aarch64/rtems.h
index b48e28afda0..07c5679d5c1 100644
--- a/gcc/config/aarch64/rtems.h
+++ b/gcc/config/aarch64/rtems.h
@@ -1,20 +1,25 @@
 /* Definitions for RTEMS based AARCH64 system.
    Copyright (C) 2016-2017 Free Software Foundation, Inc.
- 
+
    This file is part of GCC.
- 
+
    GCC is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published
    by the Free Software Foundation; either version 3, or (at your
    option) any later version.
- 
+
    GCC is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
    License for more details.
- 
-   You should have received a copy of the GNU General Public License
-   along with GCC; see the file COPYING3.  If not see
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
 #define HAS_INIT_SECTION