author     no-author <no-author@gcc.gnu.org>  2004-02-12 20:15:57 +0000
committer  no-author <no-author@gcc.gnu.org>  2004-02-12 20:15:57 +0000
commit     5ce35163972a168c33c34b1ed753d16747f73ef3
tree       03bdff41b9946151953d965b51d3352fd8baf64d
parent     e8461705deb66941c2b0ae4ae0ed44637b44c345
This commit was manufactured by cvs2svn to create branch
'tree-ssa-20020619-branch'. git-svn-id: https://gcc.gnu.org/svn/gcc/branches/tree-ssa-20020619-branch@77725 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/arm')
-rw-r--r--  gcc/config/arm/arm-cores.def    87
-rw-r--r--  gcc/config/arm/arm-generic.md  152
-rw-r--r--  gcc/config/arm/arm1026ejs.md   241
-rw-r--r--  gcc/config/arm/arm1136jfs.md   377
-rw-r--r--  gcc/config/arm/arm926ejs.md    188
-rw-r--r--  gcc/config/arm/vfp.md          744
6 files changed, 1789 insertions(+), 0 deletions(-)
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
new file mode 100644
index 00000000000..774ba6f10f2
--- /dev/null
+++ b/gcc/config/arm/arm-cores.def
@@ -0,0 +1,87 @@
+/* ARM CPU Cores
+ Copyright (C) 2003 Free Software Foundation, Inc.
+ Written by CodeSourcery, LLC
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+ 02111-1307, USA. */
+
+/* Before using #include to read this file, define a macro:
+
+      ARM_CORE(CORE_NAME, FLAGS, COSTS)
+
+   The CORE_NAME is the name of the core, represented as an identifier
+   rather than a string constant.  The FLAGS are the bitwise-or of the
+   traits that apply to that core.  The COSTS identifier selects the
+   cost model used when tuning for the core (here one of slowmul,
+   fastmul, 9e or xscale).
+
+ If you update this table, you must update the "tune" attribute in
+ arm.md. */
+
+ARM_CORE(arm2, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm250, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm3, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm6, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm60, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm600, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm610, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm620, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+/* arm7m doesn't exist on its own; it only occurs in combination with D
+   (and I), but those variants don't alter the generated code, so the
+   arm7m name is sometimes used.  */
+ARM_CORE(arm7m, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm7d, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7dm, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm7di, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7dmi, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm70, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm700, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm700i, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm710, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm720, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm710c, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7100, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7500, FL_MODE26 | FL_MODE32, slowmul)
+/* Doesn't have an external co-proc, but does have an embedded FPA.  */
+ARM_CORE(arm7500fe, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+/* V4 Architecture Processors */
+ARM_CORE(arm7tdmi, FL_CO_PROC | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm710t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm720t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm740t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm8, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm810, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm9, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm920, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm920t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm940t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm9tdmi, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm9e, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, 9e)
+
+ARM_CORE(ep9312, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_CIRRUS, fastmul)
+ARM_CORE(strongarm, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm110, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm1100, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm1110, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+/* V5 Architecture Processors */
+ARM_CORE(arm10tdmi, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_ARCH5, fastmul)
+ARM_CORE(arm1020t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_ARCH5, fastmul)
+ARM_CORE(arm926ejs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E, 9e)
+ARM_CORE(arm1026ejs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E, 9e)
+ARM_CORE(xscale, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_STRONG | FL_ARCH5 | FL_ARCH5E | FL_XSCALE, xscale)
+ARM_CORE(iwmmxt, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_STRONG | FL_ARCH5 | FL_ARCH5E | FL_XSCALE | FL_IWMMXT, xscale)
+/* V6 Architecture Processors */
+ARM_CORE(arm1136js, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E | FL_ARCH6, 9e)
+ARM_CORE(arm1136jfs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E | FL_ARCH6 | FL_VFPV2, 9e)
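
The table above must stay in sync with the "tune" attribute in arm.md, which is not part of this diff.  A minimal sketch of the assumed shape of that attribute, abridged to a few of the cores above; the full value list and the default expression are assumptions, not taken from this commit:

;; Sketch only -- value list abridged, default expression assumed.
(define_attr "tune"
  "arm2,arm250,arm3,arm6,arm9e,xscale,arm1136js,arm1136jfs"
  (const (symbol_ref "((enum attr_tune) arm_tune)")))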
diff --git a/gcc/config/arm/arm-generic.md b/gcc/config/arm/arm-generic.md
new file mode 100644
index 00000000000..ec2df47b465
--- /dev/null
+++ b/gcc/config/arm/arm-generic.md
@@ -0,0 +1,152 @@
+;; Generic ARM Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+(define_automaton "arm")
+
+;; Write buffer
+;
+; Strictly, we should model a 4-deep write buffer for ARM7xx-based chips.
+;
+; The write buffer on some of the arm6 processors is hard to model
+; exactly.  There is room in the buffer for up to two addresses and up
+; to eight words of memory, but the two needn't be split evenly.  When
+; writing, the two addresses are fully pipelined.  However, a read from
+; memory that is not currently in the cache will block until the writes
+; have completed.
+; It is normally the case that FCLK and MCLK will be in the ratio 2:1,
+; so writes will take 2 FCLK cycles per word; if FCLK and MCLK were
+; asynchronous (they aren't allowed to be at present) there would be a
+; startup cost of 1 MCLK cycle to add as well.
+(define_cpu_unit "write_buf" "arm")
+
+;; Write blockage unit
+;
+; The write_blockage unit models (partially) the fact that reads will
+; stall until the write buffer empties.
+; The f_mem_r and r_mem_f could also block, but they are to the stack,
+; so we don't model them here.
+(define_cpu_unit "write_blockage" "arm")
+
+;; Core
+;
+(define_cpu_unit "core" "arm")
+
+(define_insn_reservation "r_mem_f_wbuf" 5
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "yes")
+ (eq_attr "type" "r_mem_f")))
+ "core+write_buf*3")
+
+(define_insn_reservation "store_wbuf" 5
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "yes")
+ (eq_attr "type" "store1")))
+ "core+write_buf*3+write_blockage*5")
+
+(define_insn_reservation "store2_wbuf" 7
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "yes")
+ (eq_attr "type" "store2")))
+ "core+write_buf*4+write_blockage*7")
+
+(define_insn_reservation "store3_wbuf" 9
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "yes")
+ (eq_attr "type" "store3")))
+ "core+write_buf*5+write_blockage*9")
+
+(define_insn_reservation "store4_wbuf" 11
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "yes")
+ (eq_attr "type" "store4")))
+ "core+write_buf*6+write_blockage*11")
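+
+; (Pattern in the wbuf store reservations above: an n-word store
+; occupies the write buffer for n+2 cycles and the blockage unit for
+; 2n+3 cycles, which is also the latency -- consistent with the two
+; FCLK cycles per word plus setup described in the write buffer
+; comment.)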
+
+(define_insn_reservation "store2" 3
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "no")
+ (eq_attr "type" "store2")))
+ "core*3")
+
+(define_insn_reservation "store3" 4
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "no")
+ (eq_attr "type" "store3")))
+ "core*4")
+
+(define_insn_reservation "store4" 5
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "model_wbuf" "no")
+ (eq_attr "type" "store4")))
+ "core*5")
+
+(define_insn_reservation "store_ldsched" 1
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "yes")
+ (eq_attr "type" "store1")))
+ "core")
+
+(define_insn_reservation "load_ldsched_xscale" 3
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "yes")
+ (and (eq_attr "type" "load_byte,load1")
+ (eq_attr "is_xscale" "yes"))))
+ "core")
+
+(define_insn_reservation "load_ldsched" 2
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "yes")
+ (and (eq_attr "type" "load_byte,load1")
+ (eq_attr "is_xscale" "no"))))
+ "core")
+
+(define_insn_reservation "load_or_store" 2
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "!yes")
+ (eq_attr "type" "load_byte,load1,load2,load3,load4,store1")))
+ "core*2")
+
+(define_insn_reservation "mult" 16
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "no") (eq_attr "type" "mult")))
+ "core*16")
+
+(define_insn_reservation "mult_ldsched_strongarm" 3
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "yes")
+ (and (eq_attr "is_strongarm" "yes")
+ (eq_attr "type" "mult"))))
+ "core*2")
+
+(define_insn_reservation "mult_ldsched" 4
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "ldsched" "yes")
+ (and (eq_attr "is_strongarm" "no")
+ (eq_attr "type" "mult"))))
+ "core*4")
+
+(define_insn_reservation "multi_cycle" 32
+ (and (eq_attr "generic_sched" "yes")
+ (and (eq_attr "core_cycles" "multi")
+ (eq_attr "type" "!mult,load_byte,load1,load2,load3,load4,store1,store2,store3,store4")))
+ "core*32")
+
+(define_insn_reservation "single_cycle" 1
+ (and (eq_attr "generic_sched" "yes")
+ (eq_attr "core_cycles" "single"))
+ "core")
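
The reservations above are gated by the "generic_sched", "model_wbuf", "ldsched", "is_strongarm" and "is_xscale" attributes, which arm.md is assumed to define in terms of the selected core.  A hedged sketch of one such definition, using the core names from arm-cores.def; this is an assumption about arm.md, not part of this diff:

;; Sketch only -- assumed definition in arm.md.
(define_attr "is_strongarm" "yes,no"
  (if_then_else
    (eq_attr "tune" "strongarm,strongarm110,strongarm1100,strongarm1110")
    (const_string "yes")
    (const_string "no")))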
diff --git a/gcc/config/arm/arm1026ejs.md b/gcc/config/arm/arm1026ejs.md
new file mode 100644
index 00000000000..77f8fde2ccf
--- /dev/null
+++ b/gcc/config/arm/arm1026ejs.md
@@ -0,0 +1,241 @@
+;; ARM 1026EJ-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;; These descriptions are based on the information contained in the
+;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 1026EJ-S core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm1026ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There are two pipelines:
+;;
+;; - An Arithmetic Logic Unit (ALU) pipeline.
+;;
+;; The ALU pipeline has fetch, issue, decode, execute, memory, and
+;; write stages. We only need to model the execute, memory and write
+;; stages.
+;;
+;; - A Load-Store Unit (LSU) pipeline.
+;;
+;; The LSU pipeline has decode, execute, memory, and write stages.
+;; We only model the execute, memory and write stages.
+
+(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")
+(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages. The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles. That case is not modeled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "alu_op" 1
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "alu"))
+ "a_e,a_m,a_w")
+
+;; ALU operations with a shift-by-constant operand
+(define_insn_reservation "alu_shift_op" 1
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "alu_shift"))
+ "a_e,a_m,a_w")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the execute stage.
+(define_insn_reservation "alu_shift_reg_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "alu_shift_reg"))
+ "a_e*2,a_m,a_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times.
+
+;; The result of the "smul" and "smulw" instructions is not available
+;; until after the memory stage.
+(define_insn_reservation "mult1" 2
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "smulxy,smulwy"))
+ "a_e,a_m,a_w")
+
+;; The "smlaxy" and "smlawx" instructions require two iterations through
+;; the execute stage; the result is available immediately following
+;; the execute stage.
+(define_insn_reservation "mult2" 2
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "smlaxy,smlalxy,smlawx"))
+ "a_e*2,a_m,a_w")
+
+;; The "smlalxy", "mul", and "mla" instructions require two iterations
+;; through the execute stage; the result is not available until after
+;; the memory stage.
+(define_insn_reservation "mult3" 3
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "smlalxy,mul,mla"))
+ "a_e*2,a_m,a_w")
+
+;; The "muls" and "mlas" instructions loop in the execute stage for
+;; four iterations in order to set the flags. The value result is
+;; available after three iterations.
+(define_insn_reservation "mult4" 3
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "muls,mlas"))
+ "a_e*4,a_m,a_w")
+
+;; Long multiply instructions that produce two registers of
+;; output (such as umull) make their results available in two cycles;
+;; the least significant word is available before the most significant
+;; word.  That fact is not modeled; instead, the instructions are
+;; described as if the entire result were available at the end of the
+;; cycle in which both words are available.
+
+;; The "umull", "umlal", "smull", and "smlal" instructions all take
+;; three iterations through the execute cycle, and make their results
+;; available after the memory cycle.
+(define_insn_reservation "mult5" 4
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "umull,umlal,smull,smlal"))
+ "a_e*3,a_m,a_w")
+
+;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
+;; the execute stage for five iterations in order to set the flags.
+;; The value result is available after four iterations.
+(define_insn_reservation "mult6" 4
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "insn" "umulls,umlals,smulls,smlals"))
+ "a_e*5,a_m,a_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without a base register
+;; writeback (such as "ldm!").  These models assume that all memory
+;; references hit in dcache.
+
+;; LSU instructions require six cycles to execute. They use the ALU
+;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
+;; three through six.
+;; Loads and stores which use a scaled register offset or scaled
+;; register pre-indexed addressing mode take three cycles EXCEPT for
+;; those that are base + offset with LSL of 0 or 2, or base - offset
+;; with LSL of zero.  The remainder take 1 cycle to execute.
+;; For 4-byte loads there is a bypass from the load stage.
+
+(define_insn_reservation "load1_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "load_byte,load1"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "store1_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "store1"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+;; A load's result can be stored by an immediately following store
+(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")
+
+;; On a LDM/STM operation, the LSU pipeline iterates until all of the
+;; registers have been processed.
+;;
+;; The time it takes to load the data depends on whether or not the
+;; base address is 64-bit aligned; if it is not, an additional cycle
+;; is required. This model assumes that the address is always 64-bit
+;; aligned. Because the processor can load two registers per cycle,
+;; that assumption means that we use the same instruction reservations
+;; for loading 2k and 2k - 1 registers.
+;;
+;; The ALU pipeline is stalled until the completion of the last memory
+;; stage in the LSU pipeline. That is modeled by keeping the ALU
+;; execute stage busy until that point.
+;;
+;; As with ALU operations, if one of the destination registers is the
+;; PC, there are additional stalls; that is not modeled.
+
+(define_insn_reservation "load2_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "load2"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "store2_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "store2"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "load34_op" 3
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "load3,load4"))
+ "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")
+
+(define_insn_reservation "store34_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "store3,store4"))
+ "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately. The ARM
+;; core can predict most branches. If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream. Some branches can
+;; therefore appear to require zero cycles to execute. We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "branch_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "branch"))
+ "nothing")
+
+;; The latency for a call is not predictable. Therefore, we use 32 as
+;; roughly equivalent to positive infinity.
+
+(define_insn_reservation "call_op" 32
+ (and (eq_attr "tune" "arm1026ejs")
+ (eq_attr "type" "call"))
+ "nothing")
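
The multiply reservations above match on an "insn" attribute that the multiply patterns in arm.md are assumed to set.  A sketch of the shape such an attribute would take, listing only the values used in this file; this is an assumption, not part of this diff:

;; Sketch only -- assumed definition in arm.md.
(define_attr "insn"
  "mul,muls,mla,mlas,smulxy,smulwy,smlaxy,smlawx,smlalxy,umull,umlal,smull,smlal,umulls,umlals,smulls,smlals,other"
  (const_string "other"))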
diff --git a/gcc/config/arm/arm1136jfs.md b/gcc/config/arm/arm1136jfs.md
new file mode 100644
index 00000000000..2c0638c0524
--- /dev/null
+++ b/gcc/config/arm/arm1136jfs.md
@@ -0,0 +1,377 @@
+;; ARM 1136J[F]-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;; These descriptions are based on the information contained in the
+;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 1136J-S and 1136JF-S cores.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm1136jfs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There are three distinct pipelines (page 1-26 and following):
+;;
+;; - A 4-stage decode pipeline, shared by all three. It has fetch (1),
+;; fetch (2), decode, and issue stages. Since this is always involved,
+;; we do not model it in the scheduler.
+;;
+;; - A 4-stage ALU pipeline. It has shifter, ALU (main integer operations),
+;; and saturation stages. The fourth stage is writeback; see below.
+;;
+;; - A 4-stage multiply-accumulate pipeline. It has three stages, called
+;; MAC1 through MAC3, and a fourth writeback stage.
+;;
+;; The 4th-stage writeback is shared between the ALU and MAC pipelines,
+;; which operate in lockstep. Results from either pipeline will be
+;; moved into the writeback stage. Because the two pipelines operate
+;; in lockstep, we schedule them as a single "execute" pipeline.
+;;
+;; - A 4-stage LSU pipeline. It has address generation, data cache (1),
+;; data cache (2), and writeback stages. (Note that this pipeline,
+;;   including the writeback stage, is independent of the ALU & MAC pipes.)
+
+(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs") ; ALU and MAC
+; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
+(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require eight cycles from fetch to writeback: four
+;; in the shared decode pipeline and four in the ALU pipeline modelled
+;; here.  The results are available after the ALU stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles. That case is not modelled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "11_alu_op" 2
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "alu"))
+ "e_1,e_2,e_3,e_wb")
+
+;; ALU operations with a shift-by-constant operand
+(define_insn_reservation "11_alu_shift_op" 2
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "alu_shift"))
+ "e_1,e_2,e_3,e_wb")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the shift stage.
+(define_insn_reservation "11_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "alu_shift_reg"))
+ "e_1*2,e_2,e_3,e_wb")
+
+;; alu_ops can start sooner, if there is no shifter dependency
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_alu_op")
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_alu_op")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
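+
+;; (Note: each bypass in this file grants a latency one cycle shorter
+;; than the producer's nominal latency, modelling a forward from the
+;; stage before writeback, subject to the guard predicate where one
+;; is given.)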
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the first two execute stages until
+;; the instruction has been passed through the multiplier array enough
+;; times.
+
+;; Multiply and multiply-accumulate results are available after four stages.
+(define_insn_reservation "11_mult1" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "mul,mla"))
+ "e_1*2,e_2,e_3,e_wb")
+
+;; The *S variants set the condition flags, which requires three more cycles.
+(define_insn_reservation "11_mult2" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "muls,mlas"))
+ "e_1*2,e_2,e_3,e_wb")
+
+(define_bypass 3 "11_mult1,11_mult2"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+ "11_alu_op")
+(define_bypass 3 "11_mult1,11_mult2"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+
+;; Signed and unsigned multiply long results are available across two cycles;
+;; the less significant word is available one cycle before the more significant
+;; word. Here we conservatively wait until both are available, which is
+;; after three iterations and the memory cycle. The same is also true of
+;; the two multiply-accumulate instructions.
+(define_insn_reservation "11_mult3" 5
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "smull,umull,smlal,umlal"))
+ "e_1*3,e_2,e_3,e_wb*2")
+
+;; The *S variants set the condition flags, which requires three more cycles.
+(define_insn_reservation "11_mult4" 5
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "smulls,umulls,smlals,umlals"))
+ "e_1*3,e_2,e_3,e_wb*2")
+
+(define_bypass 4 "11_mult3,11_mult4"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+ "11_alu_op")
+(define_bypass 4 "11_mult3,11_mult4"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+
+;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
+;; of high and low halves of the argument registers. They take a single
+;; pass through the pipeline and make the result available after three
+;; cycles.
+(define_insn_reservation "11_mult5" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
+ "e_1,e_2,e_3,e_wb")
+
+(define_bypass 2 "11_mult5"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 2 "11_mult5"
+ "11_alu_op")
+(define_bypass 2 "11_mult5"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_mult5"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 2 "11_mult5"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+
+;; The same idea, but the 32-bit product is then added to a 64-bit quantity.
+(define_insn_reservation "11_mult6" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "smlalxy"))
+ "e_1*2,e_2,e_3,e_wb*2")
+
+;; Signed 32x32 multiply, then the most significant 32 bits are extracted
+;; and are available after the memory stage.
+(define_insn_reservation "11_mult7" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "insn" "smmul,smmulr"))
+ "e_1*2,e_2,e_3,e_wb")
+
+(define_bypass 3 "11_mult6,11_mult7"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+ "11_alu_op")
+(define_bypass 3 "11_mult6,11_mult7"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; These vary greatly depending on their arguments and the results of
+;; branch prediction.  Cycle counts range from zero (unconditional branch,
+;; folded dynamic prediction) to seven (incorrect predictions, etc.).  We
+;; assume an optimal case for now, because the cost of a cache miss
+;; overwhelms the cost of everything else anyhow.
+
+(define_insn_reservation "11_branches" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "branch"))
+ "nothing")
+
+;; Call latencies are not predictable. A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "11_call" 32
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "call"))
+ "nothing")
+
+;; Branches are predicted.  A correctly predicted branch will cost
+;; nothing, but we're conservative here and use the timings that a
+;; late register operand would give us.
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_branches")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_branches")
+(define_bypass 2 "11_load1,11_load2"
+ "11_branches")
+(define_bypass 3 "11_load34"
+ "11_branches")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without a base register
+;; writeback.  These models assume that all memory references hit in
+;; dcache.  Also, if the PC is one of the registers involved, there are
+;; additional stalls not modelled here.  Addressing modes are also not
+;; modelled.
+
+(define_insn_reservation "11_load1" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "load1"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load byte results are not available until the writeback stage, where
+;; the correct byte is extracted.
+
+(define_insn_reservation "11_loadb" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "load_byte"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+(define_insn_reservation "11_store1" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "store1"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load/store double words into adjacent registers. The timing and
+;; latencies are different depending on whether the address is 64-bit
+;; aligned. This model assumes that it is.
+(define_insn_reservation "11_load2" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "load2"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+(define_insn_reservation "11_store2" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "store2"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load/store multiple registers. Two registers are stored per cycle.
+;; Actual timing depends on how many registers are affected, so we
+;; optimistically schedule a low latency.
+(define_insn_reservation "11_load34" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "load3,load4"))
+ "l_a+e_1,l_dc1*2,l_dc2,l_wb")
+
+(define_insn_reservation "11_store34" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+ (eq_attr "type" "store3,store4"))
+ "l_a+e_1,l_dc1*2,l_dc2,l_wb")
+
+;; A store can start immediately after an alu op, if that alu op does
+;; not provide part of the address to access.
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+
+;; An alu op can start sooner after a load, if that alu op does not
+;; have an early register dependency on the load
+(define_bypass 2 "11_load1"
+ "11_alu_op")
+(define_bypass 2 "11_load1"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_load1"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+
+(define_bypass 3 "11_loadb"
+ "11_alu_op")
+(define_bypass 3 "11_loadb"
+ "11_alu_shift_op"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_loadb"
+ "11_alu_shift_reg_op"
+ "arm_no_early_alu_shift_dep")
+
+;; A mul op can start sooner after a load, if that mul op does not
+;; have an early multiply dependency
+(define_bypass 2 "11_load1"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 3 "11_load34"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+(define_bypass 3 "11_loadb"
+ "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+ "arm_no_early_mul_dep")
+
+;; A store can start sooner after a load, if that load does not
+;; produce part of the address to access
+(define_bypass 2 "11_load1"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
+(define_bypass 3 "11_loadb"
+ "11_store1"
+ "arm_no_early_store_addr_dep")
diff --git a/gcc/config/arm/arm926ejs.md b/gcc/config/arm/arm926ejs.md
new file mode 100644
index 00000000000..258495b7f06
--- /dev/null
+++ b/gcc/config/arm/arm926ejs.md
@@ -0,0 +1,188 @@
+;; ARM 926EJ-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;; These descriptions are based on the information contained in the
+;; ARM926EJ-S Technical Reference Manual, Copyright (c) 2002 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 926EJ-S core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm926ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline.
+;;
+;; The ALU pipeline has fetch, decode, execute, memory, and
+;; write stages. We only need to model the execute, memory and write
+;; stages.
+
+(define_cpu_unit "e,m,w" "arm926ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages. The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles. That case is not modeled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "9_alu_op" 1
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "alu,alu_shift"))
+ "e,m,w")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle. Pretend we take two cycles in
+;; the execute stage.
+(define_insn_reservation "9_alu_shift_reg_op" 2
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "alu_shift_reg"))
+ "e*2,m,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times. Multiply operations occur in both the execute and memory
+;; stages of the pipeline.
+
+(define_insn_reservation "9_mult1" 3
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "smlalxy,mul,mla"))
+ "e*2,m,w")
+
+(define_insn_reservation "9_mult2" 4
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "muls,mlas"))
+ "e*3,m,w")
+
+(define_insn_reservation "9_mult3" 4
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "umull,umlal,smull,smlal"))
+ "e*3,m,w")
+
+(define_insn_reservation "9_mult4" 5
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "umulls,umlals,smulls,smlals"))
+ "e*4,m,w")
+
+(define_insn_reservation "9_mult5" 2
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "smulxy,smlaxy,smlawx"))
+ "e,m,w")
+
+(define_insn_reservation "9_mult6" 3
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "insn" "smlalxy"))
+ "e*2,m,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without a base register
+;; writeback (such as "ldm!").  These models assume that all memory
+;; references hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles; they are (a) probably the
+;; most common case, and (b) the pessimistic assumption leads to fewer stalls.
+(define_insn_reservation "9_load1_op" 3
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "load1,load_byte"))
+ "e*2,m,w")
+
+(define_insn_reservation "9_store1_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "store1"))
+ "e,m,w")
+
+;; Multiple-word loads and stores.
+(define_insn_reservation "9_load2_op" 3
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "load2"))
+ "e,m*2,w")
+
+(define_insn_reservation "9_load3_op" 4
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "load3"))
+ "e,m*3,w")
+
+(define_insn_reservation "9_load4_op" 5
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "load4"))
+ "e,m*4,w")
+
+(define_insn_reservation "9_store2_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "store2"))
+ "e,m*2,w")
+
+(define_insn_reservation "9_store3_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "store3"))
+ "e,m*3,w")
+
+(define_insn_reservation "9_store4_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "store4"))
+ "e,m*4,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately. The ARM
+;; core can predict most branches. If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream. Some branches can
+;; therefore appear to require zero cycles to execute. We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "9_branch_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "branch"))
+ "nothing")
+
+;; The latency for a call is not predictable. Therefore, we use 32 as
+;; roughly equivalent to positive infinity.
+
+(define_insn_reservation "9_call_op" 32
+ (and (eq_attr "tune" "arm926ejs")
+ (eq_attr "type" "call"))
+ "nothing")
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
new file mode 100644
index 00000000000..5ae6c41f75a
--- /dev/null
+++ b/gcc/config/arm/vfp.md
@@ -0,0 +1,744 @@
+;; ARM VFP coprocessor Machine Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;; Additional register numbers
+(define_constants
+ [(VFPCC_REGNUM 95)]
+)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipeline description
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_automaton "vfp11")
+
+;; There are 3 pipelines in the VFP11 unit.
+;;
+;; - An 8-stage FMAC pipeline (7 execute + writeback) with a forward
+;; from the fourth stage for simple operations.
+;;
+;; - A 5-stage DS pipeline (4 execute + writeback) for divide/sqrt insns.
+;; These insns also use the first execute stage of the FMAC pipeline.
+;;
+;; - A 4-stage LS pipeline (execute + 2 memory + writeback) with a
+;; forward from the second memory stage for loads.
+
+;; We do not model Write-After-Read hazards.
+;; We do not do write scheduling with the arm core, so it is only
+;; necessary to model the first stage of each pipeline.
+;; ??? Need to model LS pipeline properly for load/store multiple?
+;; We do not model fmstat properly.  This could be done by modeling the
+;; pipelines properly and defining an absence set between a dummy fmstat
+;; unit and all other vfp units (see the sketch below).
+
+(define_cpu_unit "fmac" "vfp11")
+
+(define_cpu_unit "ds" "vfp11")
+
+(define_cpu_unit "vfp_ls" "vfp11")
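+
+;; A hypothetical sketch (not implemented in this file) of the fmstat
+;; modeling suggested above: a dummy unit for fmstat plus an absence
+;; set forbidding it from being reserved alongside any other VFP unit.
+;;
+;;   (define_cpu_unit "fmstat" "vfp11")
+;;   (absence_set "fmstat" "fmac,ds,vfp_ls")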
+
+;; The VFP "type" attributes differ from those used in the FPA model.
+;; ffarith	Fast floating point insns, e.g. abs, neg, cpy, cmp.
+;; farith Most arithmetic insns.
+;; fmul Double precision multiply.
+;; fdivs Single precision sqrt or division.
+;; fdivd Double precision sqrt or division.
+;; f_load Floating point load from memory.
+;; f_store Floating point store to memory.
+;; f_2_r Transfer vfp to arm reg.
+;; r_2_f Transfer arm to vfp reg.
+
+(define_insn_reservation "vfp_ffarith" 4
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "ffarith"))
+ "fmac")
+
+(define_insn_reservation "vfp_farith" 8
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "farith"))
+ "fmac")
+
+(define_insn_reservation "vfp_fmul" 9
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "fmul"))
+ "fmac*2")
+
+(define_insn_reservation "vfp_fdivs" 19
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "fdivs"))
+ "ds*15")
+
+(define_insn_reservation "vfp_fdivd" 33
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "fdivd"))
+ "fmac+ds*29")
+
+;; Moves to/from arm regs also use the load/store pipeline.
+(define_insn_reservation "vfp_fload" 4
+ (and (eq_attr "fpu" "vfp")
+ (eq_attr "type" "f_load,r_2_f"))
+ "vfp_ls")
+
+(define_insn_reservation "vfp_fstore" 4
+ (and (eq_attr "fpu" "vfp")
+      (eq_attr "type" "f_store,f_2_r"))
+ "vfp_ls")
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Insn pattern
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; SImode moves
+;; ??? For now do not allow loading constants into vfp regs. This causes
+;; problems because small constants get converted into adds.
+(define_insn "*arm_movsi_vfp"
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r ,m,!w,r,!w,!w, U")
+ (match_operand:SI 1 "general_operand" "rI,K,mi,r,r,!w,!w,Ui,!w"))]
+ "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT
+ && ( s_register_operand (operands[0], SImode)
+ || s_register_operand (operands[1], SImode))"
+ "@
+ mov%?\\t%0, %1
+ mvn%?\\t%0, #%B1
+ ldr%?\\t%0, %1
+ str%?\\t%1, %0
+ fmsr%?\\t%0, %1\\t%@ int
+ fmrs%?\\t%0, %1\\t%@ int
+ fcpys%?\\t%0, %1\\t%@ int
+ flds%?\\t%0, %1\\t%@ int
+ fsts%?\\t%1, %0\\t%@ int"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "*,*,load1,store1,r_2_f,f_2_r,ffarith,f_load,f_store")
+ (set_attr "pool_range" "*,*,4096,*,*,*,*,1020,*")
+ (set_attr "neg_pool_range" "*,*,4084,*,*,*,*,1008,*")]
+)
+
+
+;; DImode moves
+
+(define_insn "*arm_movdi_vfp"
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,o<>,w,r,w,w ,U")
+ (match_operand:DI 1 "di_operand" "rIK,mi,r ,r,w,w,Ui,w"))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "*
+ switch (which_alternative)
+ {
+ case 0: case 1: case 2:
+ return (output_move_double (operands));
+ case 3:
+ return \"fmdrr%?\\t%P0, %1\\t%@ int\";
+ case 4:
+ return \"fmrrd%?\\t%0, %1\\t%@ int\";
+ case 5:
+ return \"fcpyd%?\\t%P0, %P1\\t%@ int\";
+ case 6:
+ return \"fldd%?\\t%P0, %1\\t%@ int\";
+ case 7:
+ return \"fstd%?\\t%P1, %0\\t%@ int\";
+ default:
+ abort ();
+ }
+ "
+ [(set_attr "type" "*,load2,store2,r_2_f,f_2_r,ffarith,f_load,f_store")
+ (set_attr "length" "8,8,8,4,4,4,4,4")
+ (set_attr "pool_range" "*,1020,*,*,*,*,1020,*")
+ (set_attr "neg_pool_range" "*,1008,*,*,*,*,1008,*")]
+)
+
+
+;; SFmode moves
+
+(define_insn "*movsf_vfp"
+ [(set (match_operand:SF 0 "nonimmediate_operand" "=w,r,w ,U,r ,m,w,r")
+ (match_operand:SF 1 "general_operand" " r,w,UE,w,mE,r,w,r"))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP
+ && ( s_register_operand (operands[0], SFmode)
+ || s_register_operand (operands[1], SFmode))"
+ "@
+ fmsr%?\\t%0, %1
+ fmrs%?\\t%0, %1
+ flds%?\\t%0, %1
+ fsts%?\\t%1, %0
+ ldr%?\\t%0, %1\\t%@ float
+ str%?\\t%1, %0\\t%@ float
+ fcpys%?\\t%0, %1
+ mov%?\\t%0, %1\\t%@ float"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "r_2_f,f_2_r,ffarith,*,f_load,f_store,load1,store1")
+ (set_attr "pool_range" "*,*,1020,*,4096,*,*,*")
+ (set_attr "neg_pool_range" "*,*,1008,*,4080,*,*,*")]
+)
+
+
+;; DFmode moves
+
+(define_insn "*movdf_vfp"
+ [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,r,r, m,w ,U,w,r")
+ (match_operand:DF 1 "soft_df_operand" " r,w,mF,r,UF,w,w,r"))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "*
+ {
+ switch (which_alternative)
+ {
+ case 0:
+ return \"fmdrr%?\\t%P0, %Q1, %R1\";
+ case 1:
+ return \"fmrrd%?\\t%Q0, %R0, %P1\";
+ case 2: case 3: case 7:
+ return output_move_double (operands);
+ case 4:
+ return \"fldd%?\\t%P0, %1\";
+ case 5:
+ return \"fstd%?\\t%P1, %0\";
+ case 6:
+ return \"fcpyd%?\\t%P0, %P1\";
+ default:
+ abort ();
+ }
+ }
+ "
+ [(set_attr "type" "r_2_f,f_2_r,ffarith,*,load2,store2,f_load,f_store")
+ (set_attr "length" "4,4,8,8,4,4,4,8")
+ (set_attr "pool_range" "*,*,1020,*,1020,*,*,*")
+ (set_attr "neg_pool_range" "*,*,1008,*,1008,*,*,*")]
+)
+
+
+;; Conditional move patterns
+
+(define_insn "*movsfcc_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r")
+ (if_then_else:SF
+ (match_operator 3 "arm_comparison_operator"
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:SF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w")
+ (match_operand:SF 2 "s_register_operand" "w,0,w,?r,0,?r,w,0,w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcpys%D3\\t%0, %2
+ fcpys%d3\\t%0, %1
+ fcpys%D3\\t%0, %2\;fcpys%d3\\t%0, %1
+ fmsr%D3\\t%0, %2
+ fmsr%d3\\t%0, %1
+ fmsr%D3\\t%0, %2\;fmsr%d3\\t%0, %1
+ fmrs%D3\\t%0, %2
+ fmrs%d3\\t%0, %1
+ fmrs%D3\\t%0, %2\;fmrs%d3\\t%0, %1"
+ [(set_attr "conds" "use")
+ (set_attr "length" "4,4,8,4,4,8,4,4,8")
+ (set_attr "type" "ffarith,ffarith,ffarith,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")]
+)
+
+(define_insn "*movdfcc_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r")
+ (if_then_else:DF
+ (match_operator 3 "arm_comparison_operator"
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:DF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w")
+ (match_operand:DF 2 "s_register_operand" "w,0,w,?r,0,?r,w,0,w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcpyd%D3\\t%P0, %P2
+ fcpyd%d3\\t%P0, %P1
+ fcpyd%D3\\t%P0, %P2\;fcpyd%d3\\t%P0, %P1
+ fmdrr%D3\\t%P0, %Q2, %R2
+ fmdrr%d3\\t%P0, %Q1, %R1
+ fmdrr%D3\\t%P0, %Q2, %R2\;fmdrr%d3\\t%P0, %Q1, %R1
+ fmrrd%D3\\t%Q0, %R0, %P2
+ fmrrd%d3\\t%Q0, %R0, %P1
+ fmrrd%D3\\t%Q0, %R0, %P2\;fmrrd%d3\\t%Q0, %R0, %P1"
+ [(set_attr "conds" "use")
+ (set_attr "length" "4,4,8,4,4,8,4,4,8")
+ (set_attr "type" "ffarith,ffarith,ffarith,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")]
+)
+
+
+;; Sign manipulation functions
+
+(define_insn "*abssf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (abs:SF (match_operand:SF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fabss%?\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*absdf2_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (abs:DF (match_operand:DF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fabsd%?\\t%P0, %P1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*negsf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "+w")
+ (neg:SF (match_operand:SF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnegs%?\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*negdf2_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "+w")
+ (neg:DF (match_operand:DF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnegd%?\\t%P0, %P1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+
+;; Arithmetic insns
+
+(define_insn "*addsf3_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (plus:SF (match_operand:SF 1 "s_register_operand" "w")
+ (match_operand:SF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fadds%?\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*adddf3_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (plus:DF (match_operand:DF 1 "s_register_operand" "w")
+ (match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "faddd%?\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+
+(define_insn "*subsf3_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (minus:SF (match_operand:SF 1 "s_register_operand" "w")
+ (match_operand:SF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsubs%?\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*subdf3_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (minus:DF (match_operand:DF 1 "s_register_operand" "w")
+ (match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsubd%?\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+
+;; Division insns
+
+(define_insn "*divsf3_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "+w")
+ (div:SF (match_operand:SF 1 "s_register_operand" "w")
+ (match_operand:SF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fdivs%?\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fdivs")]
+)
+
+(define_insn "*divdf3_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "+w")
+ (div:DF (match_operand:DF 1 "s_register_operand" "w")
+ (match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fdivd%?\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fdivd")]
+)
+
+
+;; Multiplication insns
+
+(define_insn "*mulsf3_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "+w")
+ (mult:SF (match_operand:SF 1 "s_register_operand" "w")
+ (match_operand:SF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmuls%?\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*muldf3_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "+w")
+ (mult:DF (match_operand:DF 1 "s_register_operand" "w")
+ (match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmuld%?\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
+
+
+(define_insn "*mulsf3negsf_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "+w")
+ (mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "w"))
+ (match_operand:SF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmuls%?\\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*muldf3negdf_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "+w")
+ (mult:DF (neg:DF (match_operand:DF 1 "s_register_operand" "w"))
+ (match_operand:DF 2 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmuld%?\\t%P0, %P1, %P2"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
+
+
+;; Multiply-accumulate insns
+
+;; 0 = 1 * 2 + 0
+(define_insn "*mulsf3addsf_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "w")
+ (match_operand:SF 3 "s_register_operand" "w"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmacs%?\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*muldf3adddf_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (plus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w")
+ (match_operand:DF 3 "s_register_operand" "w"))
+ (match_operand:DF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmacd%?\\t%P0, %P2, %P3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
+
+;; 0 = 1 * 2 - 0
+(define_insn "*mulsf3subsf_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "w")
+ (match_operand:SF 3 "s_register_operand" "w"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmscs%?\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*muldf3subdf_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (minus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w")
+ (match_operand:DF 3 "s_register_operand" "w"))
+ (match_operand:DF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmscd%?\\t%P0, %P2, %P3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
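+
+;; Sketch: the corresponding source form for fmscs/fmscd is a product
+;; minus the accumulator, e.g.
+;;
+;;   float msc_sf (float a, float b, float c) { return a * b - c; }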
+
+;; 0 = -(2 * 3) + 0
+(define_insn "*mulsf3negsfaddsf_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (minus:SF (match_operand:SF 1 "s_register_operand" "0")
+ (mult:SF (match_operand:SF 2 "s_register_operand" "w")
+ (match_operand:SF 3 "s_register_operand" "w"))))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmacs%?\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
+
+(define_insn "*fmuldf3negdfadddf_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (minus:DF (match_operand:DF 1 "s_register_operand" "0")
+ (mult:DF (match_operand:DF 2 "s_register_operand" "w")
+ (match_operand:DF 3 "s_register_operand" "w"))))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmacd%?\\t%P0, %P2, %P3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
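+
+;; Sketch: fnmacs/fnmacd subtract the product from the accumulator,
+;; matching source such as
+;;
+;;   float nmac_sf (float a, float b, float c) { return c - a * b; }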
+
+
+;; 0 = -(2 * 3) - 0
+(define_insn "*mulsf3negsfsubsf_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (minus:SF (mult:SF
+ (neg:SF (match_operand:SF 2 "s_register_operand" "w"))
+ (match_operand:SF 3 "s_register_operand" "w"))
+ (match_operand:SF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmscs%?\\t%0, %2, %3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
+
+(define_insn "*muldf3negdfsubdf_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (minus:DF (mult:DF
+ (neg:DF (match_operand:DF 2 "s_register_operand" "w"))
+ (match_operand:DF 3 "s_register_operand" "w"))
+ (match_operand:DF 1 "s_register_operand" "0")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fnmscd%?\\t%P0, %P2, %P3"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fmul")]
+)
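+
+;; Sketch: fnmscs/fnmscd negate the product and subtract the
+;; accumulator, matching source such as
+;;
+;;   float nmsc_sf (float a, float b, float c) { return -a * b - c; }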
+
+
+;; Conversion routines
+
+(define_insn "*extendsfdf2_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (float_extend:DF (match_operand:SF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fcvtds%?\\t%P0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*truncdfsf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (float_truncate:SF (match_operand:DF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fcvtsd%?\\t%0, %P1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*truncsisf2_vfp"
+ [(set (match_operand:SI 0 "s_register_operand" "=w")
+ (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "w"))))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "ftosizs%?\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*truncsidf2_vfp"
+ [(set (match_operand:SI 0 "s_register_operand" "=w")
+ (fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "w"))))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "ftosizd%?\\t%0, %P1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*floatsisf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (float:SF (match_operand:SI 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsitos%?\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
+
+(define_insn "*floatsidf2_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (float:DF (match_operand:SI 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsitod%?\\t%P0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "farith")]
+)
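+
+;; A sketch (editorial, not from the original sources) of the C-level
+;; operations the conversion patterns cover:
+;;
+;;   double extend_sf (float f)  { return f; }        /* fcvtds  */
+;;   float  trunc_df (double d)  { return d; }        /* fcvtsd  */
+;;   int    fix_sf (float f)     { return (int) f; }  /* ftosizs */
+;;   float  float_si (int i)     { return i; }        /* fsitos  */
+;;
+;; The ftosiz* forms round towards zero, matching C's truncating
+;; float-to-integer conversion.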
+
+
+;; Sqrt insns.
+
+(define_insn "*sqrtsf2_vfp"
+ [(set (match_operand:SF 0 "s_register_operand" "=w")
+ (sqrt:SF (match_operand:SF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsqrts%?\\t%0, %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fdivs")]
+)
+
+(define_insn "*sqrtdf2_vfp"
+ [(set (match_operand:DF 0 "s_register_operand" "=w")
+ (sqrt:DF (match_operand:DF 1 "s_register_operand" "w")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fsqrtd%?\\t%P0, %P1"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "fdivd")]
+)
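+
+;; Sketch: these are reached through the sqrtsf2/sqrtdf2 standard
+;; names, e.g. from __builtin_sqrtf when errno handling is not
+;; required (-fno-math-errno or equivalent):
+;;
+;;   float root_sf (float a) { return __builtin_sqrtf (a); }
+;;
+;; which should then emit a single fsqrts.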
+
+
+;; Patterns to split/copy vfp condition flags.
+
+(define_insn "*movcc_vfp"
+ [(set (reg CC_REGNUM)
+ (reg VFPCC_REGNUM))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "fmstat%?"
+ [(set_attr "conds" "set")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn_and_split "*cmpsf_split_vfp"
+ [(set (reg:CCFP CC_REGNUM)
+ (compare:CCFP (match_operand:SF 0 "s_register_operand" "w")
+ (match_operand:SF 1 "vfp_compare_operand" "wG")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "#"
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_dup 0)
+ (match_dup 1)))
+ (set (reg:CCFP CC_REGNUM)
+ (reg:CCFP VFPCC_REGNUM))]
+ ""
+)
+
+(define_insn_and_split "*cmpsf_trap_split_vfp"
+ [(set (reg:CCFPE CC_REGNUM)
+ (compare:CCFPE (match_operand:SF 0 "s_register_operand" "w")
+ (match_operand:SF 1 "vfp_compare_operand" "wG")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "#"
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_dup 0)
+ (match_dup 1)))
+ (set (reg:CCFPE CC_REGNUM)
+ (reg:CCFPE VFPCC_REGNUM))]
+ ""
+)
+
+(define_insn_and_split "*cmpdf_split_vfp"
+ [(set (reg:CCFP CC_REGNUM)
+ (compare:CCFP (match_operand:DF 0 "s_register_operand" "w")
+ (match_operand:DF 1 "vfp_compare_operand" "wG")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "#"
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_dup 0)
+ (match_dup 1)))
+ (set (reg:CCFP CC_REGNUM)
+ (reg:CCFP VFPCC_REGNUM))]
+ ""
+)
+
+(define_insn_and_split "*cmpdf_trap_split_vfp"
+ [(set (reg:CCFPE CC_REGNUM)
+ (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w")
+ (match_operand:DF 1 "vfp_compare_operand" "wG")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "#"
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_dup 0)
+ (match_dup 1)))
+ (set (reg:CCFPE CC_REGNUM)
+ (reg:CCFPE VFPCC_REGNUM))]
+ ""
+)
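+
+;; Sketch of the resulting sequence: a comparison such as
+;;
+;;   int gt_sf (float a, float b) { return a > b; }
+;;
+;; is first emitted as a single set of CC_REGNUM, then split into a
+;; VFP compare (setting VFPCC_REGNUM) followed by the fmstat above,
+;; which transfers the FPSCR flags into the ARM CPSR.  Keeping the two
+;; halves separate lets the scheduler place work between the compare
+;; and the flag transfer.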
+
+
+;; Comparison patterns
+
+(define_insn "*cmpsf_vfp"
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_operand:SF 0 "s_register_operand" "w,w")
+ (match_operand:SF 1 "vfp_compare_operand" "w,G")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcmps%?\\t%0, %1
+ fcmpzs%?\\t%0"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpsf_trap_vfp"
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_operand:SF 0 "s_register_operand" "w,w")
+ (match_operand:SF 1 "vfp_compare_operand" "w,G")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcmpes%?\\t%0, %1
+ fcmpezs%?\\t%0"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpdf_vfp"
+ [(set (reg:CCFP VFPCC_REGNUM)
+ (compare:CCFP (match_operand:DF 0 "s_register_operand" "w,w")
+ (match_operand:DF 1 "vfp_compare_operand" "w,G")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcmpd%?\\t%P0, %P1
+ fcmpzd%?\\t%P0"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpdf_trap_vfp"
+ [(set (reg:CCFPE VFPCC_REGNUM)
+ (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w,w")
+ (match_operand:DF 1 "vfp_compare_operand" "w,G")))]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "@
+ fcmped%?\\t%P0, %P1
+ fcmpezd%?\\t%P0"
+ [(set_attr "predicable" "yes")
+ (set_attr "type" "ffarith")]
+)
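+
+;; Sketch: the second alternative of each pattern, taken when the "G"
+;; constraint matches a floating-point zero, avoids loading 0.0 into a
+;; register first, so e.g.
+;;
+;;   int is_zero_sf (float a) { return a == 0.0f; }
+;;
+;; can compare with a single fcmpzs.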
+
+
+;; Store multiple insn used in function prologue.
+
+(define_insn "*push_multi_vfp"
+ [(match_parallel 2 "multi_register_push"
+ [(set (match_operand:BLK 0 "memory_operand" "=m")
+ (unspec:BLK [(match_operand:DF 1 "s_register_operand" "w")]
+ UNSPEC_PUSH_MULT))])]
+ "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+ "* return vfp_output_fstmx (operands);"
+ [(set_attr "type" "f_store")]
+)
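+
+;; Sketch: the prologue emits this when call-saved VFP registers
+;; (d8-d15 in the usual VFP ABI) are live across the function, and
+;; vfp_output_fstmx prints the FSTMX-format multiple store (an
+;; fstmfdx with writeback of sp) that saves them.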
+
+
+;; Unimplemented insns:
+;; fldm*
+;; fstm*
+;; fmdhr et al (VFPv1)
+;; Support for xD (single precision only) variants.
+;; fmrrs, fmsrr
+;; fuito*
+;; ftoui*