aboutsummaryrefslogtreecommitdiff
path: root/gcc/doc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/doc')
-rw-r--r--gcc/doc/extend.texi282
-rw-r--r--gcc/doc/invoke.texi198
-rw-r--r--gcc/doc/mxp.texi106
-rw-r--r--gcc/doc/tm.texi13
4 files changed, 575 insertions, 24 deletions
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 43e91afe8b4..128370a0ccf 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6984,6 +6984,7 @@ instructions, but allow the compiler to schedule those calls.
@menu
* Alpha Built-in Functions::
+* ARC Built-in Functions::
* ARM iWMMXt Built-in Functions::
* ARM NEON Intrinsics::
* Blackfin Built-in Functions::
@@ -7081,6 +7082,287 @@ void *__builtin_thread_pointer (void)
void __builtin_set_thread_pointer (void *)
@end smallexample
+@node ARC Built-in Functions
+@subsection ARC Built-in Functions
+
+SIMD instructions can be generated for ARC using the built-in functions provided
+for the ARC cores when the @option{-msimd} switch is used.
+
+The set of builtins defined for ARC can be categorized according to their
+signatures into the following types:
+
+@smallexample
+I) Return type : v8hi
+ First argument : v8hi
+ Second argument: v8hi
+
+v8hi __builtin_arc_vaddaw (v8hi, v8hi)
+v8hi __builtin_arc_vaddw (v8hi, v8hi)
+v8hi __builtin_arc_vavb (v8hi, v8hi)
+v8hi __builtin_arc_vavrb (v8hi, v8hi)
+v8hi __builtin_arc_vdifaw (v8hi, v8hi)
+v8hi __builtin_arc_vdifw (v8hi, v8hi)
+v8hi __builtin_arc_vmaxaw (v8hi, v8hi)
+v8hi __builtin_arc_vmaxw (v8hi, v8hi)
+v8hi __builtin_arc_vminaw (v8hi, v8hi)
+v8hi __builtin_arc_vminw (v8hi, v8hi)
+v8hi __builtin_arc_vmulaw (v8hi, v8hi)
+v8hi __builtin_arc_vmulfaw (v8hi, v8hi)
+v8hi __builtin_arc_vmulfw (v8hi, v8hi)
+v8hi __builtin_arc_vmulw (v8hi, v8hi)
+v8hi __builtin_arc_vsubaw (v8hi, v8hi)
+v8hi __builtin_arc_vsubw (v8hi, v8hi)
+v8hi __builtin_arc_vsummw (v8hi, v8hi)
+v8hi __builtin_arc_vand (v8hi, v8hi)
+v8hi __builtin_arc_vandaw (v8hi, v8hi)
+v8hi __builtin_arc_vbic (v8hi, v8hi)
+v8hi __builtin_arc_vbicaw (v8hi, v8hi)
+v8hi __builtin_arc_vor (v8hi, v8hi)
+v8hi __builtin_arc_vxor (v8hi, v8hi)
+v8hi __builtin_arc_vxoraw (v8hi, v8hi)
+v8hi __builtin_arc_veqw (v8hi, v8hi)
+v8hi __builtin_arc_vlew (v8hi, v8hi)
+v8hi __builtin_arc_vltw (v8hi, v8hi)
+v8hi __builtin_arc_vnew (v8hi, v8hi)
+v8hi __builtin_arc_vmr1aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr1w (v8hi, v8hi)
+v8hi __builtin_arc_vmr2aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr2w (v8hi, v8hi)
+v8hi __builtin_arc_vmr3aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr3w (v8hi, v8hi)
+v8hi __builtin_arc_vmr4aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr4w (v8hi, v8hi)
+v8hi __builtin_arc_vmr5aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr5w (v8hi, v8hi)
+v8hi __builtin_arc_vmr6aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr6w (v8hi, v8hi)
+v8hi __builtin_arc_vmr7aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr7w (v8hi, v8hi)
+v8hi __builtin_arc_vmrb (v8hi, v8hi)
+v8hi __builtin_arc_vh264f (v8hi, v8hi)
+v8hi __builtin_arc_vh264ft (v8hi, v8hi)
+v8hi __builtin_arc_vh264fw (v8hi, v8hi)
+v8hi __builtin_arc_vvc1f (v8hi, v8hi)
+v8hi __builtin_arc_vvc1ft (v8hi, v8hi)
+@end smallexample
+
+@smallexample
+II) Return type : v8hi
+ First argument : v8hi
+ Second argument: int
+
+v8hi __builtin_arc_vbaddw (v8hi, int)
+v8hi __builtin_arc_vbmaxw (v8hi, int)
+v8hi __builtin_arc_vbminw (v8hi, int)
+v8hi __builtin_arc_vbmulaw (v8hi, int)
+v8hi __builtin_arc_vbmulfw (v8hi, int)
+v8hi __builtin_arc_vbmulw (v8hi, int)
+v8hi __builtin_arc_vbrsubw (v8hi, int)
+v8hi __builtin_arc_vbsubw (v8hi, int)
+@end smallexample
+
+@smallexample
+III) Return type : v8hi
+ First argument : v8hi
+ Second argument: const int
+
+ The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates the registers I0-I7:
+
+v8hi __builtin_arc_vasrw (v8hi, const int)
+v8hi __builtin_arc_vsr8 (v8hi, const int)
+v8hi __builtin_arc_vsr8aw (v8hi, const int)
+@end smallexample
+
+@smallexample
+IV) Return type : v8hi
+ First argument : v8hi
+ Second argument: const int
+
+ The second argument in these builtins has to be an unsigned 6-bit
+integer constant:
+
+v8hi __builtin_arc_vasrrwi (v8hi, const int)
+v8hi __builtin_arc_vasrsrwi (v8hi, const int)
+v8hi __builtin_arc_vasrwi (v8hi, const int)
+v8hi __builtin_arc_vasrpwbi (v8hi, const int)
+v8hi __builtin_arc_vasrrpwbi (v8hi, const int)
+v8hi __builtin_arc_vsr8awi (v8hi, const int)
+v8hi __builtin_arc_vsr8i (v8hi, const int)
+@end smallexample
+
+@smallexample
+V) Return type : v8hi
+ First argument : v8hi
+ Second argument: const int
+
+ The second argument in these builtins has to be an unsigned 8-bit
+integer constant:
+
+v8hi __builtin_arc_vmvaw (v8hi, const int)
+v8hi __builtin_arc_vmvw (v8hi, const int)
+v8hi __builtin_arc_vmvzw (v8hi, const int)
+v8hi __builtin_arc_vd6tapf (v8hi, const int)
+@end smallexample
+
+@smallexample
+VI) Return type : v8hi
+ First argument : int
+ Second argument: const int
+
+ The second argument in these builtins has to be an unsigned 8-bit
+integer constant:
+
+v8hi __builtin_arc_vmovaw (int, const int)
+v8hi __builtin_arc_vmovw (int, const int)
+v8hi __builtin_arc_vmovzw (int, const int)
+@end smallexample
+
+@smallexample
+VII) Return type : v8hi
+ First argument : v8hi
+
+v8hi __builtin_arc_vabsaw (v8hi)
+v8hi __builtin_arc_vabsw (v8hi)
+v8hi __builtin_arc_vaddsuw (v8hi)
+v8hi __builtin_arc_vsignw (v8hi)
+v8hi __builtin_arc_vexch1 (v8hi)
+v8hi __builtin_arc_vexch2 (v8hi)
+v8hi __builtin_arc_vexch4 (v8hi)
+v8hi __builtin_arc_vupbaw (v8hi)
+v8hi __builtin_arc_vupbw (v8hi)
+v8hi __builtin_arc_vupsbaw (v8hi)
+v8hi __builtin_arc_vupsbw (v8hi)
+@end smallexample
+
+@smallexample
+VIII) Return type : void
+ First argument : int
+ Second argument : int
+
+void __builtin_arc_vdirun (int, int)
+void __builtin_arc_vdorun (int, int)
+@end smallexample
+
+@smallexample
+IX) Return type : void
+ First argument : const int
+ Second argument : int
+
+ The first argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates DR0-DR7 DMA channel setup registers. The file
+arc-simd.h also provides defines which can be used in place of the DMA register
+numbers to facilitate better code readability:
+
+void __builtin_arc_vdiwr (const int, int)
+void __builtin_arc_vdowr (const int, int)
+@end smallexample
+
+@smallexample
+X) Return type : void
+ First argument : int
+
+void __builtin_arc_vrec (int)
+void __builtin_arc_vrun (int)
+void __builtin_arc_vrecrun (int)
+void __builtin_arc_vendrec (int)
+@end smallexample
+
+@smallexample
+XI) Return type : v8hi
+ First argument : v8hi
+ Second argument : const int
+ Third argument : const int
+
+ The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The third argument has to be
+an unsigned 8-bit quantity. The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+v8hi __builtin_arc_vld32wh (v8hi, const int, const int)
+v8hi __builtin_arc_vld32wl (v8hi, const int, const int)
+v8hi __builtin_arc_vld64 (v8hi, const int, const int)
+v8hi __builtin_arc_vld32 (v8hi, const int, const int)
+
+NOTE: Although the equivalent hardware instructions do not take a simd register
+ as an operand, these builtins overwrite the relevant bits of the v8hi
+ quantity provided as the first argument with the value loaded from
+ [Ib, u8] location in the SDM.
+
+@end smallexample
+
+@smallexample
+XII) Return type : v8hi
+ First argument : const int
+ Second argument : const int
+
+ The first argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The second argument has to be
+an unsigned 8-bit quantity. The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+v8hi __builtin_arc_vld64w (const int, const int)
+v8hi __builtin_arc_vld128 (const int, const int)
+@end smallexample
+
+@smallexample
+XIII) Return type : void
+ First argument : v8hi
+ Second argument : const int
+ Third argument : const int
+
+ The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The third argument has to be
+an unsigned 8-bit quantity. The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+void __builtin_arc_vst128 (v8hi, const int, const int)
+void __builtin_arc_vst64 (v8hi, const int, const int)
+@end smallexample
+
+
+@smallexample
+XIV) Return type : void
+ First argument : v8hi
+ Second argument : const int
+ Third argument : const int
+ Fourth argument : const int
+
+ The second argument has to be an unsigned 3-bit quantity to identify the
+16-bit subregister to be stored. The third argument in these builtins has to be
+an unsigned 3-bit integer constant, as it indicates I0-I7 registers. The fourth
+argument has to be an unsigned 8-bit quantity. The file arc-simd.h also provides
+defines which can be used in place of the I0-I7 register numbers to facilitate
+better code readability:
+
+void __builtin_arc_vst16_n (v8hi, const int, const int, const int)
+void __builtin_arc_vst32_n (v8hi, const int, const int, const int)
+@end smallexample
+
+
+@smallexample
+XV) Return type : void
+ First argument : const int
+
+ The argument has to be an unsigned 6-bit quantity.
+
+void __builtin_arc_vinti (const int)
+@end smallexample
+
+@smallexample
+NOTE: For all builtins __builtin_arc_<someinsn>, the header file arc-simd.h also
+ provides macros called _<someinsn> which can be used for programming ease
+ and improved readability.
+
+ Besides these, the following extra defines and typedefs are also provided
+in the header file:
+
+#define _setup_dma_in_channel_reg _vdiwr
+#define _setup_dma_out_channel_reg _vdowr
+
+typedef int __v4si __attribute__((vector_size(16)));
+typedef short __v8hi __attribute__((vector_size(16)));
+@end smallexample
+
@node ARM iWMMXt Built-in Functions
@subsection ARM iWMMXt Built-in Functions
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7e6da15515d..4efc8c2871c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -427,8 +427,33 @@ Objective-C and Objective-C++ Dialects}.
@emph{ARC Options}
@gccoptlist{-EB -EL @gol
--mmangle-cpu -mcpu=@var{cpu} -mtext=@var{text-section} @gol
--mdata=@var{data-section} -mrodata=@var{readonly-data-section}}
+-mbig-endian -mlittle-endian @gol
+-mA4 -mA5 -mA6 -mARC600 -mA7 -mARC700 -mmixed-code @gol
+-mtext=@var{text-section} -mdata=@var{data-section} @gol
+-mrodata=@var{readonly-data-section} @gol
+-malign-loops -mno-align-loops @gol
+-mvolatile-cache -mno-volatile-cache @gol
+-mno-cond-exec @gol
+-mnorm @gol
+-mswap @gol
+-mbarrel_shifter @gol
+-mmul64 @gol
+-mmin_max @gol
+-mEA @gol
+-msoft-float @gol
+-mno-mpy @gol
+-mno-brcc @gol
+-mlong-calls @gol
+-mno-sdata @gol
+-mno-millicode @gol
+-mspfp @gol
+-mspfp_compact @gol
+-mspfp_fast @gol
+-mdpfp @gol
+-mdpfp_compact @gol
+-mdpfp_fast @gol
+-msimd @gol
+}
@emph{ARM Options}
@gccoptlist{-mapcs-frame -mno-apcs-frame @gol
@@ -8684,44 +8709,162 @@ These options are defined for ARC implementations:
@table @gcctabopt
@item -EL
@opindex EL
+@itemx -mlittle-endian
+@opindex mlittle-endian
Compile code for little endian mode. This is the default.
@item -EB
@opindex EB
+@itemx -mbig-endian
+@opindex mbig-endian
Compile code for big endian mode.
-@item -mmangle-cpu
-@opindex mmangle-cpu
-Prepend the name of the cpu to all public symbol names.
-In multiple-processor systems, there are many ARC variants with different
-instruction and register set characteristics. This flag prevents code
-compiled for one cpu to be linked with code compiled for another.
-No facility exists for handling variants that are ``almost identical''.
-This is an all or nothing option.
+@item -mA4
+@opindex mA4
+Generates code for ARCtangent-A4 processor. This is the default.
-@item -mcpu=@var{cpu}
-@opindex mcpu
-Compile code for ARC variant @var{cpu}.
-Which variants are supported depend on the configuration.
-All variants support @option{-mcpu=base}, this is the default.
+@item -mA5
+@opindex mA5
+Generates ARCompact 32-bit code for ARCtangent-A5 processor.
+
+@item -mA6
+@opindex mA6
+@itemx -mARC600
+@opindex mARC600
+Generates ARCompact 32-bit code for ARCtangent-ARC600 processor.
+
+@item -mA7
+@opindex mA7
+@itemx -mARC700
+@opindex mARC700
+Generates ARCompact 32-bit code for ARCtangent-ARC700 processor.
+
+@item -mmixed-code
+@opindex mmixed-code
+Generates ARCompact 16-bit instructions intermixed with 32-bit instructions
+for ARCtangent-A5 and higher processors.
@item -mtext=@var{text-section}
@itemx -mdata=@var{data-section}
@itemx -mrodata=@var{readonly-data-section}
-@opindex mtext
-@opindex mdata
-@opindex mrodata
+@opindex mtext=@var{text-section}
+@opindex mdata=@var{data-section}
+@opindex mrodata=@var{readonly-data-section}
Put functions, data, and readonly data in @var{text-section},
@var{data-section}, and @var{readonly-data-section} respectively
by default. This can be overridden with the @code{section} attribute.
@xref{Variable Attributes}.
-@item -mfix-cortex-m3-ldrd
-@opindex mfix-cortex-m3-ldrd
-Some Cortex-M3 cores can cause data corruption when @code{ldrd} instructions
-with overlapping destination and base registers are used. This option avoids
-generating these instructions. This option is enabled by default when
-@option{-mcpu=cortex-m3} is specified.
+@item -malign-loops
+@opindex malign-loops
+Align loop starts to 32-byte boundaries (cache line size).
+
+@item -mno-align-loops
+@opindex mno-align-loops
+Do not align loop starts to 32-byte boundaries (cache line size).
+
+@item -mvolatile-cache
+@opindex mvolatile-cache
+Allow caching of volatile references. This is the default.
+
+@item -mno-volatile-cache
+@opindex mno-volatile-cache
+Do not cache volatile references.
+
+@item -mno-cond-exec
+@opindex mno-cond-exec
+Do not generate predicated instructions for conditional execution.
+
+@item -mnorm
+@opindex mnorm
+Allow generation of norm instruction through the use of builtins. For
+ARC700, the -mnorm option is turned on by default.
+
+@item -mswap
+@opindex mswap
+Allow generation of swap instruction through the use of builtins. For
+ARC700, the -mswap option is turned on by default.
+
+@item -mbarrel_shifter
+@opindex mbarrel_shifter
+Allow generation of multiple shift instruction supported by barrel
+shifter unit. For post A4 cores, such as A5, ARC600, ARC700, the
+-mbarrel_shifter option is turned on by default.
+
+@item -mmul64
+@opindex mmul64
+Allow generation of mul64 and mulu64 instructions, by using
+builtins. This option is not allowed for ARC700.
+
+@item -mmin_max
+@opindex mmin_max
+Allow generation of min and max instructions for A4. For post A4
+cores, these are generated by default.
+
+@item -mno-mpy
+@opindex mno-mpy
+Disallow generation of mpy, mpyh, mpyhu, and mpyu instructions for ARC700. This
+option is allowed only for ARC700 processor.
+
+@item -mEA
+@opindex mEA
+Allow generation of extended arithmetic instructions.
+
+@item -msoft-float
+@opindex msoft-float
+Dummy flag. Many applications use this flag generically, and soft-floats
+are the only option on ARC.
+
+@item -mno-brcc
+@opindex mno-brcc
+Disable generation of BRcc instructions.
+
+@item -mlong-calls
+@opindex mlong-calls
+Make all function calls as register-indirect. This flag can be overridden
+by using the @samp{short_call} function attribute.
+
+@item -mno-sdata
+@opindex mno-sdata
+Do not generate sdata references.
+
+@item -mno-millicode
+@opindex mno-millicode
+Do not generate millicode thunk code for saving and restoring registers in
+functions' prologue/epilogue. This flag is needed only with @option{-Os}, since
+millicode thunks are used only when optimizing for size.
+
+@end table
+
+@subsection FPX Options
+@cindex ARC FPX Options
+These options can be used to generate code for the FPX (Floating Point
+eXtension) extension unit.
+
+@table @gcctabopt
+@item -mspfp
+@opindex mspfp
+@itemx -mspfp_compact
+@opindex mspfp_compact
+Generate Single Precision FPX (compact) instructions.
+
+@item -mspfp_fast
+@opindex mspfp_fast
+Generate Single Precision FPX (fast) instructions.
+
+@item -mdpfp
+@opindex mdpfp
+@itemx -mdpfp_compact
+@opindex mdpfp_compact
+Generate Double Precision FPX (compact) instructions.
+
+@item -mdpfp_fast
+@opindex mdpfp_fast
+Generate Double Precision FPX (fast) instructions.
+
+@item -msimd
+@opindex msimd
+Enable generation of ARC SIMD instructions via target-specific builtins.
@end table
@@ -8733,6 +8876,13 @@ These @samp{-m} options are defined for Advanced RISC Machines (ARM)
architectures:
@table @gcctabopt
+@item -mfix-cortex-m3-ldrd
+@opindex mfix-cortex-m3-ldrd
+Some Cortex-M3 cores can cause data corruption when @code{ldrd} instructions
+with overlapping destination and base registers are used. This option avoids
+generating these instructions. This option is enabled by default when
+@option{-mcpu=cortex-m3} is specified.
+
@item -mabi=@var{name}
@opindex mabi
Generate code for the specified ABI@. Permissible values are: @samp{apcs-gnu},
diff --git a/gcc/doc/mxp.texi b/gcc/doc/mxp.texi
new file mode 100644
index 00000000000..69cef2657da
--- /dev/null
+++ b/gcc/doc/mxp.texi
@@ -0,0 +1,106 @@
+data/bss layout: uses different sections ordered by minimum addressing scale.
+no separate .rodata section(s).
+.data16: scaling factor 16
+.data8, .data4, .data2, .data1: likewise for smaller scaling factors
+.bss1, .bss2, .bss4, .bss8, .bss16: bss sections for increasing scaling
+factors
+The data base pointer register i9 typically points at the place where .bss1
+ends and .data1 starts. It might be moved up or down if allocation
+would otherwise overflow on one side, and on the other side is slack.
+
+Tasks to be done:
+- Convert this document into a proper texinfo file, incorporate it into
+ gcc documentation, and test 'make info'
+- binutils support for using undefined labels in mxp data/bss sections
+ as offsets in memory addresses.
+- binutils support for mxp code labels. For a start, we are looking to
+ have a special text section where to put all the mxp code. At link time,
+ this special text section is considered to be loaded at the start of the
+ SCM for purposes of resolving SCM absolute relocations. However, the
+ code gets actually a different load address for the ARC700 core, and gets
+ a j_s [blink] instruction appended (extra points if you make this a j_s.d
+ [blink] before the last insn without the potential to break stuff...)
+ Later we will likely want to move to multiple of such special text sections
+ to handle overlays, and possibly also have different load addresses to
+ accommodate multiple overlays. If we want to be able to handle SCM PIE,
+ i.e. code that can be loaded to varying SCM locations, the arc will need
+ to load a core register with the SCM load address before calling the
+ SCQ loading code, and the latter will have to use add instructions to
+ calculate SCM locations on the fly.
+ No matter if we use such add instructions, or long immediates, instructions
+ that reference SCM memory locations work out as 64 bit of code on the
+ arc side, while the other SIMD instructions are injected with a single
+ 32 bit code from the arc side. Thus we have a discrepancy between the
+ space taken up by the instructions in the object file and the size we
+ have to consider for purposes of calculating SCM addresses.
+ Luckily, these differences are constant from the first time the SIMD
+ assembly is emitted. Thus, the total number of instructions
+ with SCM references that precede an SCM label gives us the number of
+ 32 bit words to subtract from the total number of preceding 32 bits words
+ to arrive at the offset from the SCM load address.
+ To account for preceding SCM references in the same module, we can make
+ the SCM label appear to be accordingly earlier in the module.
+ (This will have to be compensated for if we want to do any linktime
+ relaxation at some later point in time.)
+ We also need to keep a tally of the total number of SCM references in each
+ module.
+ When linking multiple modules together, the total of these tallies for all
+ preceding modules needs to be added up, and subtracted from the value of
+ each label.
+ Like SCM references, (other) long immediates bulk up the code on the arc
+ side while leaving the SIMD instruction count the same, so they have to
+ be tallied up together with the SCM references.
+- library functions:
+ - divsi3: use sh64 code as starting point. Note that there is no
+ point in loading the table base address before the function call, because
+ all SCM memory addressing has an offset.
+ divv8hi3, divv4si3: use older sh64 code w/out lookup table as starting
+ point
+ - divhi3
+- Investigate register class preferencing issues. Naming lane sets with
+ lane 0 first actually results in the wrong reg_class_subunions. In theory
+ the ordering should be something like 00, 10, 01, 30, 03, ff, to get the
+ sets with lane zero preferred for subunions. Preferred classes can be
+ seen in the *lreg dump file after compiling with -da. Another avenue to
+ saner subunions is to add proper union lane sets 11, 33.
+ The paradoxical thing I am seeing here is that the instruction count for
+ muldi increases when I introduce these measures.
+ Another - or complementary - approach is to shift the cost balance.
+ In theory REGISTER_MOVE_COST should have an influence, but in practice
+ I haven't seen any. What works is adding extra cost to insn alternatives
+ which allow non-lane0 registers. A problem here - and in general - is that
+ we want a viable alternate register class. Jacking up the cost for
+ non-lane0 alternatives can disparage these to the point that we lose the
+ altclass. We also have often altclasses that don't actually contain any
+ extra valid registers. In theory increasing MEMORY_MOVE_COST can
+ compensate, however I see paradoxical outcomes when I try to make this
+ dependent on !(reload_in_progress || reload_completed). I have a diff
+ for some of the changes I've tried in
+ /home/joernr/prefclass-experiments-20080428.
+ Maybe we need to jack up REGISTER_MOVE_COST, MEMORY_MOVE_COST and RTX_COST
+ consistently to get a more fine-grained resolution of costs.
+- Obtain code samples of code that we think is suitable and relevant for
+ autovectorization. E.g. some codec.
+ Dependent tasks:
+ - Identify the actual section of this code that we think we should be
+ able to autovectorize.
+ - Make sure autovectorization takes place.
+- Partitioning work. Check with IBM Haifa and other Milepost partners
+ what they already have.
+ Inasmuch as not already done:
+ - Identify individual functions and subgraphs of the callgraph we can move
+ to the SIMD engine.
+ - Add code to tree loop analysis to break out loops that we can move to
+ the SIMD engine.
+ - Handle data sets that don't fit into SDM. The simplest to implement
+ approach is probably to do loop tiling at the interface between arc core
+ and simd engine. OTOH we can get much better parallelism if we hand
+ over the entire work to the simd engine and let it DMA out the previous
+ block, and DMA in the next block, while it is performing calculations.
+ For this we need to represent main memory pointers.
+ Need not necessarily be exposed as pointers to the mxp-gcc, we could
+ express the loop tiling with intrinsics.
+- Add doloop pattern
+- Convert multi-insn define_insn patterns into define_insn_and_split patterns.
+- Add scheduler description
+- Where missing, add comments to the code according to GNU coding standards.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 7dfb46b3a0d..5e9a2792337 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -2758,6 +2758,12 @@ Do not define this macro if you do not define
is @code{BITS_PER_WORD} bits wide is correct for your machine.
@end defmac
+@deftypefn {Target Hook} bool TARGET_PRESERVE_RELOAD_P (rtx @var{in})
+Called when doing an input reload using the value @var{in}. Return true
+if the reload register should be available for inheritance later. This
+might increase the spill pressure, but enhances reload inheritance.
+@end deftypefn
+
@defmac SMALL_REGISTER_CLASSES
On some machines, it is risky to let hard registers live across arbitrary
insns. Typically, these machines have instructions that require values
@@ -5962,6 +5968,13 @@ will be used. Defaults to 1 if @code{move_by_pieces_ninsns} returns less
than @code{MOVE_RATIO}.
@end defmac
+@defmac CAN_MOVE_BY_PIECES (@var{size}, @var{alignment})
+A C expression used to determine whether a chunk of memory is to be copied
+in pieces either by @code{move_by_pieces}, or by a movmem expander. This
+is used by other optimizers that want to anticipate how a block copy is
+going to be done. If not defined, @code{MOVE_BY_PIECES_P} is used instead.
+@end defmac
+
@defmac MOVE_MAX_PIECES
A C expression used by @code{move_by_pieces} to determine the largest unit
a load or store used to copy memory is. Defaults to @code{MOVE_MAX}.