author     no-author <no-author@gcc.gnu.org>    2004-02-12 20:15:57 +0000
committer  no-author <no-author@gcc.gnu.org>    2004-02-12 20:15:57 +0000
commit     5ce35163972a168c33c34b1ed753d16747f73ef3 (patch)
tree       03bdff41b9946151953d965b51d3352fd8baf64d /gcc/config/arm
parent     e8461705deb66941c2b0ae4ae0ed44637b44c345 (diff)
This commit was manufactured by cvs2svn to create branch
'tree-ssa-20020619-branch'.
git-svn-id: https://gcc.gnu.org/svn/gcc/branches/tree-ssa-20020619-branch@77725 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/arm')
-rw-r--r--  gcc/config/arm/arm-cores.def    87
-rw-r--r--  gcc/config/arm/arm-generic.md  152
-rw-r--r--  gcc/config/arm/arm1026ejs.md   241
-rw-r--r--  gcc/config/arm/arm1136jfs.md   377
-rw-r--r--  gcc/config/arm/arm926ejs.md    188
-rw-r--r--  gcc/config/arm/vfp.md          744
6 files changed, 1789 insertions, 0 deletions
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
new file mode 100644
index 00000000000..774ba6f10f2
--- /dev/null
+++ b/gcc/config/arm/arm-cores.def
@@ -0,0 +1,87 @@
+/* ARM CPU Cores
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   Written by CodeSourcery, LLC
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING.  If not, write to the Free
+   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+   02111-1307, USA.  */
+
+/* Before using #include to read this file, define a macro:
+
+      ARM_CORE(CORE_NAME, FLAGS, COSTS)
+
+   The CORE_NAME is the name of the core, represented as an identifier
+   rather than a string constant.  The FLAGS are the bitwise-or of the
+   traits that apply to that core.  The COSTS names the tuning variant
+   used for the core (e.g. slowmul, fastmul, 9e, xscale).
+
+   If you update this table, you must update the "tune" attribute in
+   arm.md.  */
+
+ARM_CORE(arm2, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm250, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm3, FL_CO_PROC | FL_MODE26, slowmul)
+ARM_CORE(arm6, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm60, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm600, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm610, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm620, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+/* arm7m doesn't exist on its own, but only with D, (and I), but
+   those don't alter the code, so arm7m is sometimes used.  */
+ARM_CORE(arm7m, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm7d, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7dm, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm7di, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7dmi, FL_CO_PROC | FL_MODE26 | FL_MODE32 | FL_ARCH3M, fastmul)
+ARM_CORE(arm70, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm700, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm700i, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm710, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm720, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm710c, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7100, FL_MODE26 | FL_MODE32, slowmul)
+ARM_CORE(arm7500, FL_MODE26 | FL_MODE32, slowmul)
+/* Doesn't have an external co-proc, but does have embedded fpa.  */
+ARM_CORE(arm7500fe, FL_CO_PROC | FL_MODE26 | FL_MODE32, slowmul)
+/* V4 Architecture Processors */
+ARM_CORE(arm7tdmi, FL_CO_PROC | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm710t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm720t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm740t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB, fastmul)
+ARM_CORE(arm8, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm810, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm9, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm920, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, fastmul)
+ARM_CORE(arm920t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm940t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm9tdmi, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED, fastmul)
+ARM_CORE(arm9e, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED, 9e)
+
+ARM_CORE(ep9312, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_CIRRUS, fastmul)
+ARM_CORE(strongarm, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm110, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm1100, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+ARM_CORE(strongarm1110, FL_MODE26 | FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_LDSCHED | FL_STRONG, fastmul)
+/* V5 Architecture Processors */
+ARM_CORE(arm10tdmi, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_ARCH5, fastmul)
+ARM_CORE(arm1020t, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_ARCH5, fastmul)
+ARM_CORE(arm926ejs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E, 9e)
+ARM_CORE(arm1026ejs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E, 9e)
+ARM_CORE(xscale, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_STRONG | FL_ARCH5 | FL_ARCH5E | FL_XSCALE, xscale)
+ARM_CORE(iwmmxt, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_LDSCHED | FL_STRONG | FL_ARCH5 | FL_ARCH5E | FL_XSCALE | FL_IWMMXT, xscale)
+/* V6 Architecture Processors */
+ARM_CORE(arm1136js, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E | FL_ARCH6, 9e)
+ARM_CORE(arm1136jfs, FL_MODE32 | FL_ARCH3M | FL_ARCH4 | FL_THUMB | FL_ARCH5 | FL_ARCH5E | FL_ARCH6 | FL_VFPV2, 9e)
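[Editor's note: as a reading aid, here is a minimal C sketch of how a table like arm-cores.def is typically consumed via the ARM_CORE macro. The enum and identifier names here are illustrative only; GCC's actual consumers live in arm.c/arm.h and differ in detail.]

    /* Hypothetical consumer: build an enum with one entry per core.  */
    #define ARM_CORE(NAME, FLAGS, COSTS) illustrative_cpu_ ## NAME,
    enum illustrative_processor_type
    {
    #include "arm-cores.def"
      illustrative_num_cores
    };
    #undef ARM_CORE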
diff --git a/gcc/config/arm/arm-generic.md b/gcc/config/arm/arm-generic.md
new file mode 100644
index 00000000000..ec2df47b465
--- /dev/null
+++ b/gcc/config/arm/arm-generic.md
@@ -0,0 +1,152 @@
+;; Generic ARM Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.  */
+
+(define_automaton "arm")
+
+;; Write buffer
+;
+; Strictly, we should model a 4-deep write buffer for ARM7xx based chips.
+;
+; The write buffer on some of the arm6 processors is hard to model exactly.
+; There is room in the buffer for up to two addresses and up to eight words
+; of memory, but the two needn't be split evenly.  When writing, the two
+; addresses are fully pipelined.  However, a read from memory that is not
+; currently in the cache will block until the writes have completed.
+; It is normally the case that FCLK and MCLK will be in the ratio 2:1, so
+; writes will take 2 FCLK cycles per word.  If FCLK and MCLK are asynchronous
+; (they aren't allowed to be at present) then there is a startup cost of
+; 1 MCLK cycle to add as well.
+(define_cpu_unit "write_buf" "arm")
+
+;; Write blockage unit
+;
+; The write_blockage unit models (partially) the fact that reads will stall
+; until the write buffer empties.
+; The f_mem_r and r_mem_f could also block, but they are to the stack,
+; so we don't model them here.
+(define_cpu_unit "write_blockage" "arm")
+
+;; Core
+;
+(define_cpu_unit "core" "arm")
+
+(define_insn_reservation "r_mem_f_wbuf" 5
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "yes")
+           (eq_attr "type" "r_mem_f")))
+ "core+write_buf*3")
+
+(define_insn_reservation "store_wbuf" 5
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "yes")
+           (eq_attr "type" "store1")))
+ "core+write_buf*3+write_blockage*5")
+
+(define_insn_reservation "store2_wbuf" 7
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "yes")
+           (eq_attr "type" "store2")))
+ "core+write_buf*4+write_blockage*7")
+
+(define_insn_reservation "store3_wbuf" 9
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "yes")
+           (eq_attr "type" "store3")))
+ "core+write_buf*5+write_blockage*9")
+
+(define_insn_reservation "store4_wbuf" 11
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "yes")
+           (eq_attr "type" "store4")))
+ "core+write_buf*6+write_blockage*11")
+
+(define_insn_reservation "store2" 3
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "no")
+           (eq_attr "type" "store2")))
+ "core*3")
+
+(define_insn_reservation "store3" 4
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "no")
+           (eq_attr "type" "store3")))
+ "core*4")
+
+(define_insn_reservation "store4" 5
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "model_wbuf" "no")
+           (eq_attr "type" "store4")))
+ "core*5")
+
+(define_insn_reservation "store_ldsched" 1
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "yes")
+           (eq_attr "type" "store1")))
+ "core")
+
+(define_insn_reservation "load_ldsched_xscale" 3
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "yes")
+           (and (eq_attr "type" "load_byte,load1")
+                (eq_attr "is_xscale" "yes"))))
+ "core")
+
+(define_insn_reservation "load_ldsched" 2
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "yes")
+           (and (eq_attr "type" "load_byte,load1")
+                (eq_attr "is_xscale" "no"))))
+ "core")
+
+(define_insn_reservation "load_or_store" 2
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "!yes")
+           (eq_attr "type" "load_byte,load1,load2,load3,load4,store1")))
+ "core*2")
+
+(define_insn_reservation "mult" 16
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "no") (eq_attr "type" "mult")))
+ "core*16")
+
+(define_insn_reservation "mult_ldsched_strongarm" 3
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "yes")
+           (and (eq_attr "is_strongarm" "yes")
+                (eq_attr "type" "mult"))))
+ "core*2")
+
+(define_insn_reservation "mult_ldsched" 4
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "ldsched" "yes")
+           (and (eq_attr "is_strongarm" "no")
+                (eq_attr "type" "mult"))))
+ "core*4")
+
+(define_insn_reservation "multi_cycle" 32
+ (and (eq_attr "generic_sched" "yes")
+      (and (eq_attr "core_cycles" "multi")
+           (eq_attr "type" "!mult,load_byte,load1,load2,load3,load4,store1,store2,store3,store4")))
+ "core*32")
+
+(define_insn_reservation "single_cycle" 1
+ (and (eq_attr "generic_sched" "yes")
+      (eq_attr "core_cycles" "single"))
+ "core")
diff --git a/gcc/config/arm/arm1026ejs.md b/gcc/config/arm/arm1026ejs.md
new file mode 100644
index 00000000000..77f8fde2ccf
--- /dev/null
+++ b/gcc/config/arm/arm1026ejs.md
@@ -0,0 +1,241 @@
+;; ARM 1026EJ-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.  */
+
+;; These descriptions are based on the information contained in the
+;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 1026EJ-S core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm1026ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There are two pipelines:
+;;
+;; - An Arithmetic Logic Unit (ALU) pipeline.
+;;
+;;   The ALU pipeline has fetch, issue, decode, execute, memory, and
+;;   write stages.  We only need to model the execute, memory and write
+;;   stages.
+;;
+;; - A Load-Store Unit (LSU) pipeline.
+;;
+;;   The LSU pipeline has decode, execute, memory, and write stages.
+;;   We only model the execute, memory and write stages.
+
+(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")
+(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "alu_op" 1
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "alu"))
+ "a_e,a_m,a_w")
+
+;; ALU operations with a shift-by-constant operand
+(define_insn_reservation "alu_shift_op" 1
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "alu_shift"))
+ "a_e,a_m,a_w")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle.  Pretend we take two cycles in
+;; the execute stage.
+(define_insn_reservation "alu_shift_reg_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "alu_shift_reg"))
+ "a_e*2,a_m,a_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times.
+
+;; The result of the "smul" and "smulw" instructions is not available
+;; until after the memory stage.
+(define_insn_reservation "mult1" 2
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "smulxy,smulwy"))
+ "a_e,a_m,a_w")
+
+;; The "smlaxy" and "smlawx" instructions require two iterations through
+;; the execute stage; the result is available immediately following
+;; the execute stage.
+(define_insn_reservation "mult2" 2
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "smlaxy,smlalxy,smlawx"))
+ "a_e*2,a_m,a_w")
+
+;; The "smlalxy", "mul", and "mla" instructions require two iterations
+;; through the execute stage; the result is not available until after
+;; the memory stage.
+(define_insn_reservation "mult3" 3
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "smlalxy,mul,mla"))
+ "a_e*2,a_m,a_w")
+
+;; The "muls" and "mlas" instructions loop in the execute stage for
+;; four iterations in order to set the flags.  The value result is
+;; available after three iterations.
+(define_insn_reservation "mult4" 3
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "muls,mlas"))
+ "a_e*4,a_m,a_w")
+
+;; Long multiply instructions that produce two registers of
+;; output (such as umull) make their results available in two cycles;
+;; the least significant word is available before the most significant
+;; word.  That fact is not modeled; instead, the instructions are
+;; described as if the entire result was available at the end of the
+;; cycle in which both words are available.
+
+;; The "umull", "umlal", "smull", and "smlal" instructions all take
+;; three iterations through the execute cycle, and make their results
+;; available after the memory cycle.
+(define_insn_reservation "mult5" 4
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "umull,umlal,smull,smlal"))
+ "a_e*3,a_m,a_w")
+
+;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
+;; the execute stage for five iterations in order to set the flags.
+;; The value result is available after four iterations.
+(define_insn_reservation "mult6" 4
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "insn" "umulls,umlals,smulls,smlals"))
+ "a_e*5,a_m,a_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without base register
+;; writeback (such as "ldm!").  These models assume that all memory
+;; references hit in dcache.
+
+;; LSU instructions require six cycles to execute.  They use the ALU
+;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
+;; three through six.
+;; Loads and stores which use a scaled register offset or scaled
+;; register pre-indexed addressing mode take three cycles EXCEPT for
+;; those that are base + offset with LSL of 0 or 2, or base - offset
+;; with LSL of zero.  The remainder take 1 cycle to execute.
+;; For 4-byte loads there is a bypass from the load stage.
+
+(define_insn_reservation "load1_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "load_byte,load1"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "store1_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "store1"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+;; A load's result can be stored by an immediately following store
+(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")
+
+;; On a LDM/STM operation, the LSU pipeline iterates until all of the
+;; registers have been processed.
+;;
+;; The time it takes to load the data depends on whether or not the
+;; base address is 64-bit aligned; if it is not, an additional cycle
+;; is required.  This model assumes that the address is always 64-bit
+;; aligned.  Because the processor can load two registers per cycle,
+;; that assumption means that we use the same instruction reservations
+;; for loading 2k and 2k - 1 registers.
+;;
+;; The ALU pipeline is stalled until the completion of the last memory
+;; stage in the LSU pipeline.  That is modeled by keeping the ALU
+;; execute stage busy until that point.
+;;
+;; As with ALU operations, if one of the destination registers is the
+;; PC, there are additional stalls; that is not modeled.
+
+(define_insn_reservation "load2_op" 2
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "load2"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "store2_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "store2"))
+ "a_e+l_e,l_m,a_w+l_w")
+
+(define_insn_reservation "load34_op" 3
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "load3,load4"))
+ "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")
+
+(define_insn_reservation "store34_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "store3,store4"))
+ "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "branch_op" 0
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "branch"))
+ "nothing")
+
+;; The latency for a call is not predictable.  Therefore, we use 32 as
+;; roughly equivalent to positive infinity.
+
+(define_insn_reservation "call_op" 32
+ (and (eq_attr "tune" "arm1026ejs")
+      (eq_attr "type" "call"))
+ "nothing")
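[Editor's note: the file above uses define_bypass once, and the next file uses it heavily, so a stand-alone sketch of its semantics may help. All names below are made up for illustration.]

    ;; A bypass overrides the producer's default latency for particular
    ;; consumers.  Here "toy_prod" normally has latency 4, but a consumer
    ;; matched by "toy_cons" sees a latency of 2 instead, provided the
    ;; (hypothetical) guard function returns nonzero for the insn pair.
    (define_automaton "toy2")
    (define_cpu_unit "toy2_unit" "toy2")
    (define_insn_reservation "toy_prod" 4
     (eq_attr "type" "xalu")
     "toy2_unit")
    (define_insn_reservation "toy_cons" 1
     (eq_attr "type" "xshift")
     "toy2_unit")
    (define_bypass 2 "toy_prod" "toy_cons" "hypothetical_guard_p")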
diff --git a/gcc/config/arm/arm1136jfs.md b/gcc/config/arm/arm1136jfs.md
new file mode 100644
index 00000000000..2c0638c0524
--- /dev/null
+++ b/gcc/config/arm/arm1136jfs.md
@@ -0,0 +1,377 @@
+;; ARM 1136J[F]-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.  */
+
+;; These descriptions are based on the information contained in the
+;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 1136J-S and 1136JF-S cores.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm1136jfs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There are three distinct pipelines (page 1-26 and following):
+;;
+;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
+;;   fetch (2), decode, and issue stages.  Since this is always involved,
+;;   we do not model it in the scheduler.
+;;
+;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
+;;   and saturation stages.  The fourth stage is writeback; see below.
+;;
+;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
+;;   MAC1 through MAC3, and a fourth writeback stage.
+;;
+;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
+;;   which operate in lockstep.  Results from either pipeline will be
+;;   moved into the writeback stage.  Because the two pipelines operate
+;;   in lockstep, we schedule them as a single "execute" pipeline.
+;;
+;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
+;;   data cache (2), and writeback stages.  (Note that this pipeline,
+;;   including the writeback stage, is independent from the ALU & MAC
+;;   pipelines.)
+
+(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs") ; ALU and MAC
+; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
+(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require eight cycles to execute, and use the ALU
+;; pipeline in each of the eight stages.  The results are available
+;; after the alu stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modelled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "11_alu_op" 2
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "alu"))
+ "e_1,e_2,e_3,e_wb")
+
+;; ALU operations with a shift-by-constant operand
+(define_insn_reservation "11_alu_shift_op" 2
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "alu_shift"))
+ "e_1,e_2,e_3,e_wb")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle.  Pretend we take two cycles in
+;; the shift stage.
+(define_insn_reservation "11_alu_shift_reg_op" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "alu_shift_reg"))
+ "e_1*2,e_2,e_3,e_wb")
+
+;; alu_ops can start sooner, if there is no shifter dependency
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_alu_op")
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_alu_op")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the first two execute stages until
+;; the instruction has been passed through the multiplier array enough
+;; times.
+
+;; Multiply and multiply-accumulate results are available after four stages.
+(define_insn_reservation "11_mult1" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "mul,mla"))
+ "e_1*2,e_2,e_3,e_wb")
+
+;; The *S variants set the condition flags, which requires three more cycles.
+(define_insn_reservation "11_mult2" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "muls,mlas"))
+ "e_1*2,e_2,e_3,e_wb")
+
+(define_bypass 3 "11_mult1,11_mult2"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+               "11_alu_op")
+(define_bypass 3 "11_mult1,11_mult2"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 3 "11_mult1,11_mult2"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+
+;; Signed and unsigned multiply long results are available across two cycles;
+;; the less significant word is available one cycle before the more significant
+;; word.  Here we conservatively wait until both are available, which is
+;; after three iterations and the memory cycle.  The same is also true of
+;; the two multiply-accumulate instructions.
+(define_insn_reservation "11_mult3" 5
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "smull,umull,smlal,umlal"))
+ "e_1*3,e_2,e_3,e_wb*2")
+
+;; The *S variants set the condition flags, which requires three more cycles.
+(define_insn_reservation "11_mult4" 5
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "smulls,umulls,smlals,umlals"))
+ "e_1*3,e_2,e_3,e_wb*2")
+
+(define_bypass 4 "11_mult3,11_mult4"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+               "11_alu_op")
+(define_bypass 4 "11_mult3,11_mult4"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 4 "11_mult3,11_mult4"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+
+;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
+;; of high and low halves of the argument registers.  They take a single
+;; pass through the pipeline and make the result available after three
+;; cycles.
+(define_insn_reservation "11_mult5" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
+ "e_1,e_2,e_3,e_wb")
+
+(define_bypass 2 "11_mult5"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 2 "11_mult5"
+               "11_alu_op")
+(define_bypass 2 "11_mult5"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_mult5"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 2 "11_mult5"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+
+;; The same idea, but the 32-bit result is added to a 64-bit quantity.
+(define_insn_reservation "11_mult6" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "smlalxy"))
+ "e_1*2,e_2,e_3,e_wb*2")
+
+;; Signed 32x32 multiply, then the most significant 32 bits are extracted
+;; and are available after the memory stage.
+(define_insn_reservation "11_mult7" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "insn" "smmul,smmulr"))
+ "e_1*2,e_2,e_3,e_wb")
+
+(define_bypass 3 "11_mult6,11_mult7"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+               "11_alu_op")
+(define_bypass 3 "11_mult6,11_mult7"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+(define_bypass 3 "11_mult6,11_mult7"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; These vary greatly depending on their arguments and the results of
+;; branch prediction.  Cycle count ranges from zero (unconditional branch,
+;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
+;; assume an optimal case for now, because the cost of a cache miss
+;; overwhelms the cost of everything else anyhow.
+
+(define_insn_reservation "11_branches" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "branch"))
+ "nothing")
+
+;; Call latencies are not predictable.  A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "11_call" 32
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "call"))
+ "nothing")
+
+;; Branches are predicted.  A correctly predicted branch will have no
+;; cost, but we're conservative here, and use the timings a
+;; late-register would give us.
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_branches")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_branches")
+(define_bypass 2 "11_load1,11_load2"
+               "11_branches")
+(define_bypass 3 "11_load34"
+               "11_branches")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without base register
+;; writeback.  These models assume that all memory references hit in
+;; dcache.  Also, if the PC is one of the registers involved, there are
+;; additional stalls not modelled here.  Addressing modes are also not
+;; modelled.
+
+(define_insn_reservation "11_load1" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "load1"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load byte results are not available until the writeback stage, where
+;; the correct byte is extracted.
+
+(define_insn_reservation "11_loadb" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "load_byte"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+(define_insn_reservation "11_store1" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "store1"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load/store double words into adjacent registers.  The timing and
+;; latencies are different depending on whether the address is 64-bit
+;; aligned.  This model assumes that it is.
+(define_insn_reservation "11_load2" 3
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "load2"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+(define_insn_reservation "11_store2" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "store2"))
+ "l_a+e_1,l_dc1,l_dc2,l_wb")
+
+;; Load/store multiple registers.  Two registers are stored per cycle.
+;; Actual timing depends on how many registers are affected, so we
+;; optimistically schedule a low latency.
+(define_insn_reservation "11_load34" 4
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "load3,load4"))
+ "l_a+e_1,l_dc1*2,l_dc2,l_wb")
+
+(define_insn_reservation "11_store34" 0
+ (and (eq_attr "tune" "arm1136js,arm1136jfs")
+      (eq_attr "type" "store3,store4"))
+ "l_a+e_1,l_dc1*2,l_dc2,l_wb")
+
+;; A store can start immediately after an alu op, if that alu op does
+;; not provide part of the address to access.
+(define_bypass 1 "11_alu_op,11_alu_shift_op"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+(define_bypass 2 "11_alu_shift_reg_op"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+
+;; An alu op can start sooner after a load, if that alu op does not
+;; have an early register dependency on the load
+(define_bypass 2 "11_load1"
+               "11_alu_op")
+(define_bypass 2 "11_load1"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 2 "11_load1"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+
+(define_bypass 3 "11_loadb"
+               "11_alu_op")
+(define_bypass 3 "11_loadb"
+               "11_alu_shift_op"
+               "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "11_loadb"
+               "11_alu_shift_reg_op"
+               "arm_no_early_alu_shift_dep")
+
+;; A mul op can start sooner after a load, if that mul op does not
+;; have an early multiply dependency
+(define_bypass 2 "11_load1"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 3 "11_load34"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+(define_bypass 3 "11_loadb"
+               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
+               "arm_no_early_mul_dep")
+
+;; A store can start sooner after a load, if that load does not
+;; produce part of the address to access
+(define_bypass 2 "11_load1"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
+(define_bypass 3 "11_loadb"
+               "11_store1"
+               "arm_no_early_store_addr_dep")
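[Editor's note: the reservations above are matched against insn patterns through their attributes. The fragment below is illustrative only, not taken from arm.md: a pattern like this one, tuned for the 1136, would be scheduled under "11_load1" because of its "type" attribute.]

    ;; Illustrative pattern: the (set_attr "type" "load1") line is what
    ;; connects this insn to the "11_load1" reservation when the "tune"
    ;; attribute evaluates to arm1136js or arm1136jfs.
    (define_insn "*toy_load"
      [(set (match_operand:SI 0 "s_register_operand" "=r")
            (match_operand:SI 1 "memory_operand" "m"))]
      ""
      "ldr%?\\t%0, %1"
      [(set_attr "type" "load1")])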
diff --git a/gcc/config/arm/arm926ejs.md b/gcc/config/arm/arm926ejs.md
new file mode 100644
index 00000000000..258495b7f06
--- /dev/null
+++ b/gcc/config/arm/arm926ejs.md
@@ -0,0 +1,188 @@
+;; ARM 926EJ-S Pipeline Description
+;; Copyright (C) 2003 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to the Free
+;; Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.  */
+
+;; These descriptions are based on the information contained in the
+;; ARM926EJ-S Technical Reference Manual, Copyright (c) 2002 ARM
+;; Limited.
+;;
+
+;; This automaton provides a pipeline description for the ARM
+;; 926EJ-S core.
+;;
+;; The model given here assumes that the condition for all conditional
+;; instructions is "true", i.e., that all of the instructions are
+;; actually executed.
+
+(define_automaton "arm926ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Pipelines
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; There is a single pipeline.
+;;
+;; The ALU pipeline has fetch, decode, execute, memory, and
+;; write stages.  We only need to model the execute, memory and write
+;; stages.
+
+(define_cpu_unit "e,m,w" "arm926ejs")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; ALU Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; ALU instructions require three cycles to execute, and use the ALU
+;; pipeline in each of the three stages.  The results are available
+;; after the execute stage has finished.
+;;
+;; If the destination register is the PC, the pipelines are stalled
+;; for several cycles.  That case is not modeled here.
+
+;; ALU operations with no shifted operand
+(define_insn_reservation "9_alu_op" 1
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "alu,alu_shift"))
+ "e,m,w")
+
+;; ALU operations with a shift-by-register operand
+;; These really stall in the decoder, in order to read
+;; the shift value in a second cycle.  Pretend we take two cycles in
+;; the execute stage.
+(define_insn_reservation "9_alu_shift_reg_op" 2
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "alu_shift_reg"))
+ "e*2,m,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Multiplication Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Multiplication instructions loop in the execute stage until the
+;; instruction has been passed through the multiplier array enough
+;; times.  Multiply operations occur in both the execute and memory
+;; stages of the pipeline.
+
+(define_insn_reservation "9_mult1" 3
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "smlalxy,mul,mla"))
+ "e*2,m,w")
+
+(define_insn_reservation "9_mult2" 4
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "muls,mlas"))
+ "e*3,m,w")
+
+(define_insn_reservation "9_mult3" 4
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "umull,umlal,smull,smlal"))
+ "e*3,m,w")
+
+(define_insn_reservation "9_mult4" 5
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "umulls,umlals,smulls,smlals"))
+ "e*4,m,w")
+
+(define_insn_reservation "9_mult5" 2
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "smulxy,smlaxy,smlawx"))
+ "e,m,w")
+
+(define_insn_reservation "9_mult6" 3
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "insn" "smlalxy"))
+ "e*2,m,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Load/Store Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The models for load/store instructions do not accurately describe
+;; the difference between operations with and without base register
+;; writeback (such as "ldm!").  These models assume that all memory
+;; references hit in dcache.
+
+;; Loads with a shifted offset take 3 cycles, and are (a) probably the
+;; most common and (b) the pessimistic assumption will lead to fewer stalls.
+(define_insn_reservation "9_load1_op" 3
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "load1,load_byte"))
+ "e*2,m,w")
+
+(define_insn_reservation "9_store1_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "store1"))
+ "e,m,w")
+
+;; multiple word loads and stores
+(define_insn_reservation "9_load2_op" 3
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "load2"))
+ "e,m*2,w")
+
+(define_insn_reservation "9_load3_op" 4
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "load3"))
+ "e,m*3,w")
+
+(define_insn_reservation "9_load4_op" 5
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "load4"))
+ "e,m*4,w")
+
+(define_insn_reservation "9_store2_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "store2"))
+ "e,m*2,w")
+
+(define_insn_reservation "9_store3_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "store3"))
+ "e,m*3,w")
+
+(define_insn_reservation "9_store4_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "store4"))
+ "e,m*4,w")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Branch and Call Instructions
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Branch instructions are difficult to model accurately.  The ARM
+;; core can predict most branches.  If the branch is predicted
+;; correctly, and predicted early enough, the branch can be completely
+;; eliminated from the instruction stream.  Some branches can
+;; therefore appear to require zero cycles to execute.  We assume that
+;; all branches are predicted correctly, and that the latency is
+;; therefore the minimum value.
+
+(define_insn_reservation "9_branch_op" 0
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "branch"))
+ "nothing")
+
+;; The latency for a call is not predictable.  Therefore, we use 32 as
+;; roughly equivalent to positive infinity.
+
+(define_insn_reservation "9_call_op" 32
+ (and (eq_attr "tune" "arm926ejs")
+      (eq_attr "type" "call"))
+ "nothing")
+;; +;; - A 5-stage DS pipeline (4 execute + writeback) for divide/sqrt insns. +;; These insns also uses first execute stage of FMAC pipeline. +;; +;; - A 4-stage LS pipeline (execute + 2 memory + writeback) with forward from +;; second memory stage for loads. + +;; We do not model Write-After-Read hazards. +;; We do not do write scheduling with the arm core, so it is only necessary +;; to model the first stage of each pipeline +;; ??? Need to model LS pipeline properly for load/store multiple? +;; We do not model fmstat properly. This could be done by modeling pipelines +;; properly and defining an absence set between a dummy fmstat unit and all +;; other vfp units. + +(define_cpu_unit "fmac" "vfp11") + +(define_cpu_unit "ds" "vfp11") + +(define_cpu_unit "vfp_ls" "vfp11") + +;; The VFP "type" attributes differ from those used in the FPA model. +;; ffarith Fast floating point insns, eg. abs, neg, cpy, cmp. +;; farith Most arithmetic insns. +;; fmul Double precision multiply. +;; fdivs Single precision sqrt or division. +;; fdivd Double precision sqrt or division. +;; f_load Floating point load from memory. +;; f_store Floating point store to memory. +;; f_2_r Transfer vfp to arm reg. +;; r_2_f Transfer arm to vfp reg. + +(define_insn_reservation "vfp_ffarith" 4 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "ffarith")) + "fmac") + +(define_insn_reservation "vfp_farith" 8 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "farith")) + "fmac") + +(define_insn_reservation "vfp_fmul" 9 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "fmul")) + "fmac*2") + +(define_insn_reservation "vfp_fdivs" 19 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "fdivs")) + "ds*15") + +(define_insn_reservation "vfp_fdivd" 33 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "fdivd")) + "fmac+ds*29") + +;; Moves to/from arm regs also use the load/store pipeline. +(define_insn_reservation "vfp_fload" 4 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "f_load,r_2_f")) + "vfp_ls") + +(define_insn_reservation "vfp_fstore" 4 + (and (eq_attr "fpu" "vfp") + (eq_attr "type" "f_load,f_2_r")) + "vfp_ls") + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Insn pattern +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; SImode moves +;; ??? For now do not allow loading constants into vfp regs. This causes +;; problems because small constants get converted into adds. 
+(define_insn "*arm_movsi_vfp" + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r,r ,m,!w,r,!w,!w, U") + (match_operand:SI 1 "general_operand" "rI,K,mi,r,r,!w,!w,Ui,!w"))] + "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT + && ( s_register_operand (operands[0], SImode) + || s_register_operand (operands[1], SImode))" + "@ + mov%?\\t%0, %1 + mvn%?\\t%0, #%B1 + ldr%?\\t%0, %1 + str%?\\t%1, %0 + fmsr%?\\t%0, %1\\t%@ int + fmrs%?\\t%0, %1\\t%@ int + fcpys%?\\t%0, %1\\t%@ int + flds%?\\t%0, %1\\t%@ int + fsts%?\\t%1, %0\\t%@ int" + [(set_attr "predicable" "yes") + (set_attr "type" "*,*,load1,store1,r_2_f,f_2_r,ffarith,f_load,f_store") + (set_attr "pool_range" "*,*,4096,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,*,4084,*,*,*,*,1008,*")] +) + + +;; DImode moves + +(define_insn "*arm_movdi_vfp" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,o<>,w,r,w,w ,U") + (match_operand:DI 1 "di_operand" "rIK,mi,r ,r,w,w,Ui,w"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "* + switch (which_alternative) + { + case 0: case 1: case 2: + return (output_move_double (operands)); + case 3: + return \"fmdrr%?\\t%P0, %1\\t%@ int\"; + case 4: + return \"fmrrd%?\\t%0, %1\\t%@ int\"; + case 5: + return \"fcpyd%?\\t%P0, %P1\\t%@ int\"; + case 6: + return \"fldd%?\\t%P0, %1\\t%@ int\"; + case 7: + return \"fstd%?\\t%P1, %0\\t%@ int\"; + default: + abort (); + } + " + [(set_attr "type" "*,load2,store2,r_2_f,f_2_r,ffarith,f_load,f_store") + (set_attr "length" "8,8,8,4,4,4,4,4") + (set_attr "pool_range" "*,1020,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,1008,*,*,*,*,1008,*")] +) + + +;; SFmode moves + +(define_insn "*movsf_vfp" + [(set (match_operand:SF 0 "nonimmediate_operand" "=w,r,w ,U,r ,m,w,r") + (match_operand:SF 1 "general_operand" " r,w,UE,w,mE,r,w,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + && ( s_register_operand (operands[0], SFmode) + || s_register_operand (operands[1], SFmode))" + "@ + fmsr%?\\t%0, %1 + fmrs%?\\t%0, %1 + flds%?\\t%0, %1 + fsts%?\\t%1, %0 + ldr%?\\t%0, %1\\t%@ float + str%?\\t%1, %0\\t%@ float + fcpys%?\\t%0, %1 + mov%?\\t%0, %1\\t%@ float" + [(set_attr "predicable" "yes") + (set_attr "type" "r_2_f,f_2_r,ffarith,*,f_load,f_store,load1,store1") + (set_attr "pool_range" "*,*,1020,*,4096,*,*,*") + (set_attr "neg_pool_range" "*,*,1008,*,4080,*,*,*")] +) + + +;; DFmode moves + +(define_insn "*movdf_vfp" + [(set (match_operand:DF 0 "nonimmediate_soft_df_operand" "=w,r,r, m,w ,U,w,r") + (match_operand:DF 1 "soft_df_operand" " r,w,mF,r,UF,w,w,r"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "* + { + switch (which_alternative) + { + case 0: + return \"fmdrr%?\\t%P0, %Q1, %R1\"; + case 1: + return \"fmrrd%?\\t%Q0, %R0, %P1\"; + case 2: case 3: case 7: + return output_move_double (operands); + case 4: + return \"fldd%?\\t%P0, %1\"; + case 5: + return \"fstd%?\\t%P1, %0\"; + case 6: + return \"fcpyd%?\\t%P0, %P1\"; + default: + abort (); + } + } + " + [(set_attr "type" "r_2_f,f_2_r,ffarith,*,load2,store2,f_load,f_store") + (set_attr "length" "4,4,8,8,4,4,4,8") + (set_attr "pool_range" "*,*,1020,*,1020,*,*,*") + (set_attr "neg_pool_range" "*,*,1008,*,1008,*,*,*")] +) + + +;; Conditional move patterns + +(define_insn "*movsfcc_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r") + (if_then_else:SF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:SF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w") + (match_operand:SF 2 "s_register_operand" 
"w,0,w,?r,0,?r,w,0,w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fcpys%D3\\t%0, %2 + fcpys%d3\\t%0, %1 + fcpys%D3\\t%0, %2\;fcpys%d3\\t%0, %1 + fmsr%D3\\t%0, %2 + fmsr%d3\\t%0, %1 + fmsr%D3\\t%0, %2\;fmsr%d3\\t%0, %1 + fmrs%D3\\t%0, %2 + fmrs%d3\\t%0, %1 + fmrs%D3\\t%0, %2\;fmrs%d3\\t%0, %1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,4,4,8,4,4,8") + (set_attr "type" "ffarith,ffarith,ffarith,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + +(define_insn "*movdfcc_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r") + (if_then_else:DF + (match_operator 3 "arm_comparison_operator" + [(match_operand 4 "cc_register" "") (const_int 0)]) + (match_operand:DF 1 "s_register_operand" "0,w,w,0,?r,?r,0,w,w") + (match_operand:DF 2 "s_register_operand" "w,0,w,?r,0,?r,w,0,w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "@ + fcpyd%D3\\t%P0, %P2 + fcpyd%d3\\t%P0, %P1 + fcpyd%D3\\t%P0, %P2\;fcpyd%d3\\t%P0, %P1 + fmdrr%D3\\t%P0, %Q2, %R2 + fmdrr%d3\\t%P0, %Q1, %R1 + fmdrr%D3\\t%P0, %Q2, %R2\;fmdrr%d3\\t%P0, %Q1, %R1 + fmrrd%D3\\t%Q0, %R0, %P2 + fmrrd%d3\\t%Q0, %R0, %P1 + fmrrd%D3\\t%Q0, %R0, %P2\;fmrrd%d3\\t%Q0, %R0, %P1" + [(set_attr "conds" "use") + (set_attr "length" "4,4,8,4,4,8,4,4,8") + (set_attr "type" "ffarith,ffarith,ffarith,r_2_f,r_2_f,r_2_f,f_2_r,f_2_r,f_2_r")] +) + + +;; Sign manipulation functions + +(define_insn "*abssf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (abs:SF (match_operand:SF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fabss%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffarith")] +) + +(define_insn "*absdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (abs:DF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fabsd%?\\t%P0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffarith")] +) + +(define_insn "*negsf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+w") + (neg:SF (match_operand:SF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnegs%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffarith")] +) + +(define_insn "*negdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (neg:DF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnegd%?\\t%P0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "ffarith")] +) + + +;; Arithmetic insns + +(define_insn "*addsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (plus:SF (match_operand:SF 1 "s_register_operand" "w") + (match_operand:SF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fadds%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*adddf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (plus:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "faddd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + + +(define_insn "*subsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (minus:SF (match_operand:SF 1 "s_register_operand" "w") + (match_operand:SF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fsubs%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + 
+(define_insn "*subdf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fsubd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + + +;; Division insns + +(define_insn "*divsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+w") + (div:SF (match_operand:SF 1 "s_register_operand" "w") + (match_operand:SF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fdivs%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivs")] +) + +(define_insn "*divdf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (div:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fdivd%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fdivd")] +) + + +;; Multiplication insns + +(define_insn "*mulsf3_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+w") + (mult:SF (match_operand:SF 1 "s_register_operand" "w") + (match_operand:SF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmuls%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*muldf3_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (mult:DF (match_operand:DF 1 "s_register_operand" "w") + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmuld%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + + +(define_insn "*mulsf3negsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "+w") + (mult:SF (neg:SF (match_operand:SF 1 "s_register_operand" "w")) + (match_operand:SF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmuls%?\\t%0, %1, %2" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*muldf3negdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "+w") + (mult:DF (neg:DF (match_operand:DF 1 "s_register_operand" "w")) + (match_operand:DF 2 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmuld%?\\t%P0, %P1, %P2" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + + +;; Multiply-accumulate insns + +;; 0 = 1 * 2 + 0 +(define_insn "*mulsf3addsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "w") + (match_operand:SF 3 "s_register_operand" "w")) + (match_operand:SF 1 "s_register_operand" "0")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmacs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*muldf3adddf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (plus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmacd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + +;; 0 = 1 * 2 - 0 +(define_insn "*mulsf3subsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (minus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "w") + (match_operand:SF 3 "s_register_operand" "w")) + (match_operand:SF 1 "s_register_operand" "0")))] + 
"TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmscs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*muldf3subdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fmscd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + +;; 0 = -(1 * 2) + 0 +(define_insn "*mulsf3negsfaddsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (minus:SF (match_operand:SF 1 "s_register_operand" "0") + (mult:SF (match_operand:SF 2 "s_register_operand" "w") + (match_operand:SF 3 "s_register_operand" "w"))))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmacs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*fmuldf3negdfadddf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (match_operand:DF 1 "s_register_operand" "0") + (mult:DF (match_operand:DF 2 "s_register_operand" "w") + (match_operand:DF 3 "s_register_operand" "w"))))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmacd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + + +;; 0 = -(1 * 2) - 0 +(define_insn "*mulsf3negsfsubsf_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (minus:SF (mult:SF + (neg:SF (match_operand:SF 2 "s_register_operand" "w")) + (match_operand:SF 3 "s_register_operand" "w")) + (match_operand:SF 1 "s_register_operand" "0")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmscs%?\\t%0, %2, %3" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*muldf3negdfsubdf_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (minus:DF (mult:DF + (neg:DF (match_operand:DF 2 "s_register_operand" "w")) + (match_operand:DF 3 "s_register_operand" "w")) + (match_operand:DF 1 "s_register_operand" "0")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fnmscd%?\\t%P0, %P2, %P3" + [(set_attr "predicable" "yes") + (set_attr "type" "fmul")] +) + + +;; Conversion routines + +(define_insn "*extendsfdf2_vfp" + [(set (match_operand:DF 0 "s_register_operand" "=w") + (float_extend:DF (match_operand:SF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fcvtds%?\\t%P0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*truncdfsf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (float_truncate:SF (match_operand:DF 1 "s_register_operand" "w")))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "fcvtsd%?\\t%0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*truncsisf2_vfp" + [(set (match_operand:SI 0 "s_register_operand" "=w") + (fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "w"))))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "ftosizs%?\\t%0, %1" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*truncsidf2_vfp" + [(set (match_operand:SI 0 "s_register_operand" "=w") + (fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "w"))))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP" + "ftosizd%?\\t%0, %P1" + [(set_attr "predicable" "yes") + (set_attr "type" "farith")] +) + +(define_insn "*floatsisf2_vfp" + [(set (match_operand:SF 0 "s_register_operand" "=w") + (float:SF 
+(define_insn "*floatsisf2_vfp"
+  [(set (match_operand:SF 0 "s_register_operand" "=w")
+        (float:SF (match_operand:SI 1 "s_register_operand" "w")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "fsitos%?\\t%0, %1"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "farith")]
+)
+
+(define_insn "*floatsidf2_vfp"
+  [(set (match_operand:DF 0 "s_register_operand" "=w")
+        (float:DF (match_operand:SI 1 "s_register_operand" "w")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "fsitod%?\\t%P0, %1"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "farith")]
+)
+
+
+;; Sqrt insns.
+
+(define_insn "*sqrtsf2_vfp"
+  [(set (match_operand:SF 0 "s_register_operand" "=w")
+        (sqrt:SF (match_operand:SF 1 "s_register_operand" "w")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "fsqrts%?\\t%0, %1"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "fdivs")]
+)
+
+(define_insn "*sqrtdf2_vfp"
+  [(set (match_operand:DF 0 "s_register_operand" "=w")
+        (sqrt:DF (match_operand:DF 1 "s_register_operand" "w")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "fsqrtd%?\\t%P0, %P1"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "fdivd")]
+)
+
+
+;; Patterns to split/copy vfp condition flags.
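+;; VFP comparisons set the condition flags in the FPSCR, not in the
+;; ARM CPSR, so the result must be copied across with fmstat before it
+;; can drive conditional execution.  Each floating-point compare is
+;; therefore first emitted against the fake VFPCC_REGNUM and then
+;; split into that compare plus a VFPCC->CC copy.  A double compare
+;; typically ends up as a sequence like (illustrative only):
+;;   fcmpd   d6, d7     @ compare, setting FPSCR flags
+;;   fmstat             @ transfer FPSCR flags to the CPSR
+;;   b<cond> .L1        @ ordinary conditional branch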
+(define_insn "*movcc_vfp"
+  [(set (reg CC_REGNUM)
+        (reg VFPCC_REGNUM))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "fmstat%?"
+  [(set_attr "conds" "set")
+   (set_attr "type" "ffarith")]
+)
+
+(define_insn_and_split "*cmpsf_split_vfp"
+  [(set (reg:CCFP CC_REGNUM)
+        (compare:CCFP (match_operand:SF 0 "s_register_operand" "w")
+                      (match_operand:SF 1 "vfp_compare_operand" "wG")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "#"
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  [(set (reg:CCFP VFPCC_REGNUM)
+        (compare:CCFP (match_dup 0)
+                      (match_dup 1)))
+   (set (reg:CCFP CC_REGNUM)
+        (reg:CCFP VFPCC_REGNUM))]
+  ""
+)
+
+(define_insn_and_split "*cmpsf_trap_split_vfp"
+  [(set (reg:CCFPE CC_REGNUM)
+        (compare:CCFPE (match_operand:SF 0 "s_register_operand" "w")
+                       (match_operand:SF 1 "vfp_compare_operand" "wG")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "#"
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  [(set (reg:CCFPE VFPCC_REGNUM)
+        (compare:CCFPE (match_dup 0)
+                       (match_dup 1)))
+   (set (reg:CCFPE CC_REGNUM)
+        (reg:CCFPE VFPCC_REGNUM))]
+  ""
+)
+
+(define_insn_and_split "*cmpdf_split_vfp"
+  [(set (reg:CCFP CC_REGNUM)
+        (compare:CCFP (match_operand:DF 0 "s_register_operand" "w")
+                      (match_operand:DF 1 "vfp_compare_operand" "wG")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "#"
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  [(set (reg:CCFP VFPCC_REGNUM)
+        (compare:CCFP (match_dup 0)
+                      (match_dup 1)))
+   (set (reg:CCFP CC_REGNUM)
+        (reg:CCFP VFPCC_REGNUM))]
+  ""
+)
+
+(define_insn_and_split "*cmpdf_trap_split_vfp"
+  [(set (reg:CCFPE CC_REGNUM)
+        (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w")
+                       (match_operand:DF 1 "vfp_compare_operand" "wG")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "#"
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  [(set (reg:CCFPE VFPCC_REGNUM)
+        (compare:CCFPE (match_dup 0)
+                       (match_dup 1)))
+   (set (reg:CCFPE CC_REGNUM)
+        (reg:CCFPE VFPCC_REGNUM))]
+  ""
+)
+
+
+;; Comparison patterns
+
+(define_insn "*cmpsf_vfp"
+  [(set (reg:CCFP VFPCC_REGNUM)
+        (compare:CCFP (match_operand:SF 0 "s_register_operand" "w,w")
+                      (match_operand:SF 1 "vfp_compare_operand" "w,G")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "@
+   fcmps%?\\t%0, %1
+   fcmpzs%?\\t%0"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpsf_trap_vfp"
+  [(set (reg:CCFPE VFPCC_REGNUM)
+        (compare:CCFPE (match_operand:SF 0 "s_register_operand" "w,w")
+                       (match_operand:SF 1 "vfp_compare_operand" "w,G")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "@
+   fcmpes%?\\t%0, %1
+   fcmpezs%?\\t%0"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpdf_vfp"
+  [(set (reg:CCFP VFPCC_REGNUM)
+        (compare:CCFP (match_operand:DF 0 "s_register_operand" "w,w")
+                      (match_operand:DF 1 "vfp_compare_operand" "w,G")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "@
+   fcmpd%?\\t%P0, %P1
+   fcmpzd%?\\t%P0"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "ffarith")]
+)
+
+(define_insn "*cmpdf_trap_vfp"
+  [(set (reg:CCFPE VFPCC_REGNUM)
+        (compare:CCFPE (match_operand:DF 0 "s_register_operand" "w,w")
+                       (match_operand:DF 1 "vfp_compare_operand" "w,G")))]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "@
+   fcmped%?\\t%P0, %P1
+   fcmpezd%?\\t%P0"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "ffarith")]
+)
+
+
+;; Store multiple insn used in function prologue.
+
+(define_insn "*push_multi_vfp"
+  [(match_parallel 2 "multi_register_push"
+    [(set (match_operand:BLK 0 "memory_operand" "=m")
+          (unspec:BLK [(match_operand:DF 1 "s_register_operand" "w")]
+                      UNSPEC_PUSH_MULT))])]
+  "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP"
+  "* return vfp_output_fstmx (operands);"
+  [(set_attr "type" "f_store")]
+)
+
+
+;; Unimplemented insns:
+;; fldm*
+;; fstm*
+;; fmdhr et al (VFPv1)
+;; Support for xD (single precision only) variants.
+;; fmrrs, fmsrr
+;; fuito*
+;; ftoui*
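+;; (For reference: fldm*/fstm* are the general VFP load/store-multiple
+;; forms beyond the prologue use above, fmrrs/fmsrr transfer a pair of
+;; ARM registers to/from two single-precision registers, and
+;; fuito*/ftoui* are the unsigned-integer <-> float conversions.)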