4 files changed, 575 insertions, 24 deletions
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 43e91afe8b4..128370a0ccf 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6984,6 +6984,7 @@ instructions, but allow the compiler to schedule those calls.
 
 @menu
 * Alpha Built-in Functions::
+* ARC Built-in Functions::
 * ARM iWMMXt Built-in Functions::
 * ARM NEON Intrinsics::
 * Blackfin Built-in Functions::
@@ -7081,6 +7082,287 @@ void *__builtin_thread_pointer (void)
 void __builtin_set_thread_pointer (void *)
 @end smallexample
 
+@node ARC Built-in Functions
+@subsection ARC Built-in Functions
+
+SIMD instructions can be generated for ARC using the built-in functions provided
+for the ARC cores when the @option{-msimd} switch is used.
+
+The set of builtins defined for ARC can be categorized according to their
+signatures into the following types:
+
+@smallexample
+I) Return type    :  v8hi
+   First argument :  v8hi
+   Second argument:  v8hi
+
+v8hi __builtin_arc_vaddaw (v8hi, v8hi)
+v8hi __builtin_arc_vaddw (v8hi, v8hi)
+v8hi __builtin_arc_vavb (v8hi, v8hi)
+v8hi __builtin_arc_vavrb (v8hi, v8hi)
+v8hi __builtin_arc_vdifaw (v8hi, v8hi)
+v8hi __builtin_arc_vdifw (v8hi, v8hi)
+v8hi __builtin_arc_vmaxaw (v8hi, v8hi)
+v8hi __builtin_arc_vmaxw (v8hi, v8hi)
+v8hi __builtin_arc_vminaw (v8hi, v8hi)
+v8hi __builtin_arc_vminw (v8hi, v8hi)
+v8hi __builtin_arc_vmulaw (v8hi, v8hi)
+v8hi __builtin_arc_vmulfaw (v8hi, v8hi)
+v8hi __builtin_arc_vmulfw (v8hi, v8hi)
+v8hi __builtin_arc_vmulw (v8hi, v8hi)
+v8hi __builtin_arc_vsubaw (v8hi, v8hi)
+v8hi __builtin_arc_vsubw (v8hi, v8hi)
+v8hi __builtin_arc_vsummw (v8hi, v8hi)
+v8hi __builtin_arc_vand (v8hi, v8hi)
+v8hi __builtin_arc_vandaw (v8hi, v8hi)
+v8hi __builtin_arc_vbic (v8hi, v8hi)
+v8hi __builtin_arc_vbicaw (v8hi, v8hi)
+v8hi __builtin_arc_vor (v8hi, v8hi)
+v8hi __builtin_arc_vxor (v8hi, v8hi)
+v8hi __builtin_arc_vxoraw (v8hi, v8hi)
+v8hi __builtin_arc_veqw (v8hi, v8hi)
+v8hi __builtin_arc_vlew (v8hi, v8hi)
+v8hi __builtin_arc_vltw (v8hi, v8hi)
+v8hi __builtin_arc_vnew (v8hi, v8hi)
+v8hi __builtin_arc_vmr1aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr1w (v8hi, v8hi)
+v8hi __builtin_arc_vmr2aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr2w (v8hi, v8hi)
+v8hi __builtin_arc_vmr3aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr3w (v8hi, v8hi)
+v8hi __builtin_arc_vmr4aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr4w (v8hi, v8hi)
+v8hi __builtin_arc_vmr5aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr5w (v8hi, v8hi)
+v8hi __builtin_arc_vmr6aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr6w (v8hi, v8hi)
+v8hi __builtin_arc_vmr7aw (v8hi, v8hi)
+v8hi __builtin_arc_vmr7w (v8hi, v8hi)
+v8hi __builtin_arc_vmrb (v8hi, v8hi)
+v8hi __builtin_arc_vh264f (v8hi, v8hi)
+v8hi __builtin_arc_vh264ft (v8hi, v8hi)
+v8hi __builtin_arc_vh264fw (v8hi, v8hi)
+v8hi __builtin_arc_vvc1f (v8hi, v8hi)
+v8hi __builtin_arc_vvc1ft (v8hi, v8hi)
+@end smallexample
+
+@smallexample
+II)  Return type    :  v8hi
+     First argument :  v8hi
+     Second argument:  int
+
+v8hi __builtin_arc_vbaddw (v8hi, int)
+v8hi __builtin_arc_vbmaxw (v8hi, int)
+v8hi __builtin_arc_vbminw (v8hi, int)
+v8hi __builtin_arc_vbmulaw (v8hi, int)
+v8hi __builtin_arc_vbmulfw (v8hi, int)
+v8hi __builtin_arc_vbmulw (v8hi, int)
+v8hi __builtin_arc_vbrsubw (v8hi, int)
+v8hi __builtin_arc_vbsubw (v8hi, int)
+@end smallexample
+
+@smallexample
+III)  Return type    :  v8hi
+      First argument :  v8hi
+      Second argument:  const int
+
+	The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates the registers I0-I7:
+
+v8hi __builtin_arc_vasrw (v8hi, const int)
+v8hi __builtin_arc_vsr8 (v8hi, const int)
+v8hi __builtin_arc_vsr8aw (v8hi, const int)
+@end smallexample
+
+@smallexample
+IV)  Return type    :  v8hi
+     First argument :  v8hi
+     Second argument:  const int
+
+	The second argument in these builtins has to be an unsigned 6-bit
+integer constant:
+
+v8hi __builtin_arc_vasrrwi (v8hi, const int)
+v8hi __builtin_arc_vasrsrwi (v8hi, const int)
+v8hi __builtin_arc_vasrwi (v8hi, const int)
+v8hi __builtin_arc_vasrpwbi (v8hi, const int)
+v8hi __builtin_arc_vasrrpwbi (v8hi, const int)
+v8hi __builtin_arc_vsr8awi (v8hi, const int)
+v8hi __builtin_arc_vsr8i (v8hi, const int)
+@end smallexample
+
+@smallexample
+V)  Return type    :  v8hi
+    First argument :  v8hi
+    Second argument:  const int
+
+	The second argument in these builtins has to be an unsigned 8-bit
+integer constant:
+
+v8hi __builtin_arc_vmvaw (v8hi, const int)
+v8hi __builtin_arc_vmvw (v8hi, const int)
+v8hi __builtin_arc_vmvzw (v8hi, const int)
+v8hi __builtin_arc_vd6tapf (v8hi, const int)
+@end smallexample
+
+@smallexample
+VI)  Return type    :  v8hi
+     First argument :  int
+     Second argument:  const int
+
+	The second argument in these builtins has to be an unsigned 8-bit
+integer constant:
+
+v8hi __builtin_arc_vmovaw (int, const int)
+v8hi __builtin_arc_vmovw (int, const int)
+v8hi __builtin_arc_vmovzw (int, const int)
+@end smallexample
+
+@smallexample
+VII)  Return type    :  v8hi
+      First argument :  v8hi
+
+v8hi __builtin_arc_vabsaw (v8hi)
+v8hi __builtin_arc_vabsw (v8hi)
+v8hi __builtin_arc_vaddsuw (v8hi)
+v8hi __builtin_arc_vsignw (v8hi)
+v8hi __builtin_arc_vexch1 (v8hi)
+v8hi __builtin_arc_vexch2 (v8hi)
+v8hi __builtin_arc_vexch4 (v8hi)
+v8hi __builtin_arc_vupbaw (v8hi)
+v8hi __builtin_arc_vupbw (v8hi)
+v8hi __builtin_arc_vupsbaw (v8hi)
+v8hi __builtin_arc_vupsbw (v8hi)
+@end smallexample
+
+@smallexample
+VIII)  Return type     :  void
+       First argument  :  int
+       Second argument :  int
+
+void __builtin_arc_vdirun (int, int)
+void __builtin_arc_vdorun (int, int)
+@end smallexample
+
+@smallexample
+IX)  Return type     :  void
+     First argument  :  const int
+     Second argument :  int
+
+	The first argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates DR0-DR7 DMA channel setup registers. The file
+arc-simd.h also provides defines which can be used in place of the DMA register
+numbers to facilitate better code readability:
+
+void __builtin_arc_vdiwr (const int, int)
+void __builtin_arc_vdowr (const int, int)
+@end smallexample
+
+@smallexample
+X)  Return type     :  void
+    First argument  :  int
+
+void __builtin_arc_vrec (int)
+void __builtin_arc_vrun (int)
+void __builtin_arc_vrecrun (int)
+void __builtin_arc_vendrec (int)
+@end smallexample
+
+@smallexample
+XI)  Return type      :  v8hi
+     First argument   :  v8hi
+     Second argument  :  const int
+     Third argument   :  const int
+
+	The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The third argument has to be
+an unsigned 8-bit quantity.  The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+v8hi __builtin_arc_vld32wh (v8hi, const int, const int)
+v8hi __builtin_arc_vld32wl (v8hi, const int, const int)
+v8hi __builtin_arc_vld64 (v8hi, const int, const int)
+v8hi __builtin_arc_vld32 (v8hi, const int, const int)
+
+NOTE: Although the equivalent hardware instructions do not take a simd register
+      as an operand, these builtins overwrite the relevant bits of the v8hi
+      quantity provided as the first argument with the value loaded from 
+      [Ib, u8] location in the SDM.
+
+@end smallexample
+
+@smallexample
+XII)  Return type      :  v8hi
+      First argument   :  const int
+      Second argument  :  const int
+
+	The first argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The second argument has to be
+an unsigned 8-bit quantity.  The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+v8hi __builtin_arc_vld64w (const int, const int)
+v8hi __builtin_arc_vld128 (const int, const int)
+@end smallexample
+
+@smallexample
+XIII)  Return type      :  void
+       First argument   :  v8hi
+       Second argument  :  const int
+       Third argument   :  const int
+
+	The second argument in these builtins has to be an unsigned 3-bit
+integer constant, as it indicates I0-I7 registers. The third argument has to be
+an unsigned 8-bit quantity.  The file arc-simd.h also provides defines which can
+be used in place of the I0-I7 register numbers to facilitate better code readability:
+
+void __builtin_arc_vst128 (v8hi, const int, const int)
+void __builtin_arc_vst64 (v8hi, const int, const int)
+@end smallexample
+
+@smallexample
+XIV)  Return type      :  void
+      First argument   :  v8hi
+      Second argument  :  const int
+      Third argument   :  const int
+      Fourth argument  :  const int
+
+	The second argument has to be an unsigned 3-bit quantity to identify the
+16-bit subregister to be stored. The third argument in these builtins has to be
+an unsigned 3-bit integer constant, as it indicates I0-I7 registers. The fourth
+argument has to be an unsigned 8-bit quantity.  The file arc-simd.h also provides
+defines which can be used in place of the I0-I7 register numbers to facilitate
+better code readability:
+
+void __builtin_arc_vst16_n (v8hi, const int, const int, const int)
+void __builtin_arc_vst32_n (v8hi, const int, const int, const int)
+@end smallexample
+
+
+@smallexample
+XV)   Return type      :  void
+      First argument   :  const int
+
+	The argument has to be an unsigned 6-bit quantity.
+
+void __builtin_arc_vinti (const int)
+@end smallexample
+
+@smallexample
+NOTE: For all builtins __builtin_arc_<someinsn>, the header file arc-simd.h also
+      provides macros called _<someinsn> which can be used for programming ease
+      and improved readability. 
+
+     Besides these, the following extra defines and typedefs are also provided
+in the header file
+ 
+#define _setup_dma_in_channel_reg  _vdiwr
+#define _setup_dma_out_channel_reg _vdowr
+
+typedef int   __v4si  __attribute__((vector_size(16)));
+typedef short __v8hi  __attribute__((vector_size(16)));
+@end smallexample
+
 @node ARM iWMMXt Built-in Functions
 @subsection ARM iWMMXt Built-in Functions
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7e6da15515d..4efc8c2871c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -427,8 +427,33 @@ Objective-C and Objective-C++ Dialects}.
 
 @emph{ARC Options}
 @gccoptlist{-EB  -EL @gol
--mmangle-cpu  -mcpu=@var{cpu}  -mtext=@var{text-section} @gol
--mdata=@var{data-section}  -mrodata=@var{readonly-data-section}}
+-mbig-endian -mlittle-endian @gol
+-mA4  -mA5  -mA6 -mARC600 -mA7 -mARC700 -mmixed-code @gol
+-mtext=@var{text-section} -mdata=@var{data-section}   @gol
+-mrodata=@var{readonly-data-section} @gol
+-malign-loops -mno-align-loops @gol
+-mvolatile-cache -mno-volatile-cache @gol
+-mno-cond-exec @gol
+-mnorm @gol
+-mswap @gol
+-mbarrel_shifter @gol
+-mmul64 @gol
+-mmin_max @gol
+-mEA @gol
+-msoft-float @gol
+-mno-mpy @gol
+-mno-brcc @gol
+-mlong-calls @gol
+-mno-sdata @gol
+-mno-millicode @gol
+-mspfp @gol
+-mspfp_compact @gol
+-mspfp_fast @gol
+-mdpfp @gol
+-mdpfp_compact @gol
+-mdpfp_fast @gol
+-msimd @gol
+}
 
 @emph{ARM Options}
 @gccoptlist{-mapcs-frame  -mno-apcs-frame @gol
@@ -8684,44 +8709,162 @@ These options are defined for ARC implementations:
 @table @gcctabopt
 @item -EL
 @opindex EL
+@itemx -mlittle-endian
+@opindex mlittle-endian
 Compile code for little endian mode.  This is the default.
 
 @item -EB
 @opindex EB
+@itemx -mbig-endian
+@opindex mbig-endian
 Compile code for big endian mode.
 
-@item -mmangle-cpu
-@opindex mmangle-cpu
-Prepend the name of the cpu to all public symbol names.
-In multiple-processor systems, there are many ARC variants with different
-instruction and register set characteristics.  This flag prevents code
-compiled for one cpu to be linked with code compiled for another.
-No facility exists for handling variants that are ``almost identical''.
-This is an all or nothing option.
+@item -mA4
+@opindex mA4
+Generates code for ARCtangent-A4 processor. This is the default.
 
-@item -mcpu=@var{cpu}
-@opindex mcpu
-Compile code for ARC variant @var{cpu}.
-Which variants are supported depend on the configuration.
-All variants support @option{-mcpu=base}, this is the default.
+@item -mA5
+@opindex mA5
+Generates ARCompact 32-bit code for ARCtangent-A5 processor.
+
+@item -mA6
+@opindex mA6
+@itemx -mARC600
+@opindex mARC600
+Generates ARCompact 32-bit code for ARCtangent-ARC600 processor.
+
+@item -mA7
+@opindex mA7
+@itemx -mARC700
+@opindex mARC700
+Generates ARCompact 32-bit code for ARCtangent-ARC700 processor.
+
+@item -mmixed-code
+@opindex mmixed-code
+Generates ARCompact 16-bit instructions intermixed with 32-bit instructions
+for ARCtangent-A5 and higher processors.
 
 @item -mtext=@var{text-section}
 @itemx -mdata=@var{data-section}
 @itemx -mrodata=@var{readonly-data-section}
-@opindex mtext
-@opindex mdata
-@opindex mrodata
+@opindex mtext=@var{text-section}
+@opindex mdata=@var{data-section}
+@opindex mrodata=@var{readonly-data-section}
 Put functions, data, and readonly data in @var{text-section},
 @var{data-section}, and @var{readonly-data-section} respectively
 by default.  This can be overridden with the @code{section} attribute.
 @xref{Variable Attributes}.
 
-@item -mfix-cortex-m3-ldrd
-@opindex mfix-cortex-m3-ldrd
-Some Cortex-M3 cores can cause data corruption when @code{ldrd} instructions
-with overlapping destination and base registers are used.  This option avoids
-generating these instructions.  This option is enabled by default when
-@option{-mcpu=cortex-m3} is specified.
+@item -malign-loops
+@opindex malign-loops
+Align loop starts to 32-byte boundaries (cache line size).
+
+@item -mno-align-loops
+@opindex mno-align-loops
+Do not align loop starts to 32-byte boundaries (cache line size).
+
+@item -mvolatile-cache
+@opindex mvolatile-cache
+Allow caching of volatile references. This is the default.
+
+@item -mno-volatile-cache
+@opindex mno-volatile-cache
+Do not cache volatile references. 
+
+@item -mno-cond-exec
+@opindex mno-cond-exec
+Do not generate predicated instructions for conditional execution.
+
+@item -mnorm
+@opindex mnorm
+Allow generation of norm instruction through the use of builtins. For
+ARC700, the -mnorm option is turned on by default.
+
+@item -mswap
+@opindex mswap
+Allow generation of swap instruction through the use of builtins. For
+ARC700, the -mswap option is turned on by default.
+
+@item -mbarrel_shifter
+@opindex mbarrel_shifter
+Allow generation of multiple shift instruction supported by barrel
+shifter unit. For post A4 cores, such as A5, ARC600, ARC700, the
+-mbarrel_shifter option is turned on by default.
+
+@item -mmul64
+@opindex mmul64
+Allow generation of mul64 and mulu64 instructions, by using
+builtins. This option is not allowed for ARC700.
+
+@item -mmin_max
+@opindex mmin_max
+Allow generation of min and max instructions for A4. For post A4
+cores, these are generated by default.
+
+@item -mno-mpy
+@opindex mno-mpy
+Disallow generation of mpy, mpyh, mpyhu, mpyu instructions for ARC700. This
+option is allowed only for ARC700 processor.
+
+@item -mEA
+@opindex mEA
+Allow generation of extended arithmetic instructions.
+
+@item -msoft-float
+@opindex msoft-float
+Dummy flag. Many applications use this flag generically, and soft-floats 
+are the only option on ARC.
+
+@item -mno-brcc
+@opindex mno-brcc
+Disable generation of BRcc instructions.
+
+@item -mlong-calls
+@opindex mlong-calls
+Make all function calls as register-indirect. This flag can be overridden 
+by using the @samp{short_call} function attribute.
+
+@item -mno-sdata
+@opindex mno-sdata
+Do not generate sdata references.
+
+@item -mno-millicode
+@opindex mno-millicode
+Do not generate millicode thunk code for saving and restoring registers in
+functions' prologue/epilogue. This flag is needed only with -Os, since millicode
+thunks are used only when optimizing for size.
+
+@end table
+
+@subsection FPX Options
+@cindex ARC FPX Options
+These options can be used to generate code for the FPX (Floating Point
+eXtension) extension unit.
+
+@table @gcctabopt
+@item -mspfp
+@opindex mspfp
+@itemx -mspfp_compact
+@opindex mspfp_compact
+Generate Single Precision FPX (compact) instructions.
+
+@item -mspfp_fast
+@opindex mspfp_fast
+Generate Single Precision FPX (fast) instructions.
+
+@item -mdpfp
+@opindex mdpfp
+@itemx -mdpfp_compact
+@opindex mdpfp_compact
+Generate Double Precision FPX (compact) instructions.
+
+@item -mdpfp_fast
+@opindex mdpfp_fast
+Generate Double Precision FPX (fast) instructions.
+
+@item -msimd
+@opindex msimd
+Enable generation of ARC SIMD instructions via target-specific builtins.
 
 @end table
 
@@ -8733,6 +8876,13 @@ These @samp{-m} options are defined for Advanced RISC Machines (ARM)
 architectures:
 
 @table @gcctabopt
+@item -mfix-cortex-m3-ldrd
+@opindex mfix-cortex-m3-ldrd
+Some Cortex-M3 cores can cause data corruption when @code{ldrd} instructions
+with overlapping destination and base registers are used.  This option avoids
+generating these instructions.  This option is enabled by default when
+@option{-mcpu=cortex-m3} is specified.
+
 @item -mabi=@var{name}
 @opindex mabi
 Generate code for the specified ABI@.  Permissible values are: @samp{apcs-gnu},
diff --git a/gcc/doc/mxp.texi b/gcc/doc/mxp.texi
new file mode 100644
index 00000000000..69cef2657da
--- /dev/null
+++ b/gcc/doc/mxp.texi
@@ -0,0 +1,106 @@
+data/bss layout: uses different sections ordered by minimum addressing scale.
+no separate .rodata section(s).
+.data16: scaling factor 16
+.data8, .data4, .data2, .data1: likewise for smaller scaling factors
+.bss1, .bss2, .bss4, .bss8, .bss16: bss sections for increasing scaling
+factors
+The data base pointer register i9 typically points at the place where .bss1
+ends and .data1 starts.  It might be moved up or down if allocation
+would otherwise overflow on one side, and on the other side is slack.
+
+Tasks to be done:
+- Convert this document into a proper texinfo file, incorporate it into
+  gcc documentation, and test 'make info'
+- binutils support for using undefined labels in mxp data/bss sections
+  as offsets in memory addresses.
+- binutils support for mxp code labels.  For a start, we are looking to
+  have a special text section where to put all the mxp code.  At link time,
+  this special text section is considered to be loaded at the start of the
+  SCM for purposes of resolving SCM absolute relocations.  However, the
+  code gets actually a different load address for the ARC700 core, and gets
+  a j_s [blink] instruction appended (extra points if you make this a j_s.d
+  [blink] before the last insn without the potential to break stuff...)
+  Later we will likely want to move to multiple of such special text sections
+  to handle overlays, and possibly also have different load addresses to
+  accommodate multiple overlays.  If we want to be able to handle SCM PIE,
+  i.e. code that can be loaded to varying SCM locations, the arc will need
+  to load a core register with the SCM load address before calling the
+  SCQ loading code, and the latter will have to use add instructions to
+  calculate SCM locations on the fly.
+  No matter if we use such add instructions, or long immediates, instructions
+  that reference SCM memory locations work out as 64 bit of code on the
+  arc side, while the other SIMD instructions are injected with a single
+  32 bit code from the arc side.  Thus we have a discrepancy between the
+  space taken up by the instructions in the object file and the size we
+  have to consider for purposes of calculating SCM addresses.
+  Luckily, these differences are constant from the first time the SIMD
+  assembly is emitted.  Thus, the total number of instructions
+  with SCM references that precede an SCM label gives us the number of
+  32 bit words to subtract from the total number of preceding 32 bits words
+  to arrive at the offset from the SCM load address.
+  To account for preceding SCM references in the same module, we can make
+  the SCM label appear to be accordingly earlier in the module.
+  (This will have to be compensated for if we want to do any linktime
+   relaxation at some later point in time.)
+  We also need to keep a tally of the total number of SCM references in each
+  module.
+  When linking multiple modules together, the total of these tallies for all
+  preceding modules needs to be added up, and subtracted from the value of
+  each label.
+  Like SCM references, (other) long immediates bulk up the code on the arc
+  side while leaving the SIMD instruction count the same, so they have to
+  be tallied up together with the SCM references.
+- library functions:
+  - divsi3: use sh64 code as starting point.  Note that there is no
+    point in loading the table base address before the function call, because
+    all SCM memory addressing has an offset.
+    divv8hi3, divv4si3: use older sh64 code w/out lookup table as starting
+    point
+  - divhi3
+- Investigate register class preferencing issues.  Naming lane sets with
+  lane 0 first actually results in the wrong reg_class_subunions.  In theory
+  the ordering should be something like 00, 10, 01, 30, 03, ff, to get the
+  sets with lane zero preferred for subunions.  Preferred classes can be
+  seen in the *lreg dump file after compiling with -da.  Another avenue to
+  saner subunions is to add proper union lane sets 11, 33.
+  The paradoxical thing I am seeing here is that the instruction count for
+  muldi increases when I introduce these measures.
+  Another - or complementary - approach is to shift the cost balance.
+  In theory REGISTER_MOVE_COST should have an influence, but in practice
+  I haven't seen any.  What works is adding extra cost to insn alternatives
+  which allow non-lane0 registers.  A problem here - and in general - is that
+  we want a viable alternate register class.  Jacking up the cost for
+  non-lane0 alternatives can disparage these to the point that we lose the
+  altclass.  We also have often altclasses that don't actually contain any
+  extra valid registers.  In theory increasing MEMORY_MOVE_COST can
+  compensate, however I see paradoxical outcomes when I try to make this
+  dependent on !(reload_in_progress || reload_completed).  I have a diff
+  for some of the changes I've tried in
+  /home/joernr/prefclass-experiments-20080428.
+  Maybe we need to jack up REGISTER_MOVE_COST, MEMORY_MOVE_COST and RTX_COST
+  consistently to get a more fine-grained resolution of costs.
+- Obtain code samples of code that we think is suitable and relevant for
+  autovectorization.  E.g. some codec.
+  Dependent tasks:
+  - Identify the actual section of this code that we think we should be
+    able to autovectorize.
+  - Make sure autovectorization takes place.
+- Partitioning work.  Check with IBM Haifa and other Milepost partners
+  what they already have.
+  Inasmuch as not already done:
+  - Identify individual functions and subgraphs of the callgraph we can move
+    to the SIMD engine.
+  - Add code to tree loop analysis to break out loops that we can move to
+    the SIMD engine.
+  - Handle data sets that don't fit into SDM.  The simplest to implement
+    approach is probably to do loop tiling at the interface between arc core
+    and simd engine.  OTOH we can get much better parallelism if we hand
+    over the entire work to the simd engine and let it DMA out the previous
+    block, and DMA in the next block, while it is performing calculations.
+    For this we need to represent main memory pointers.
+    Need not necessarily be exposed as pointers to the mxp-gcc, we could
+    express the loop tiling with intrinsics.
+- Add doloop pattern
+- Convert multi-insn define_insn patterns into define_insn_and_split patterns.
+- Add scheduler description
+- Where missing, add comments to the code according to GNU coding standards.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 7dfb46b3a0d..5e9a2792337 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -2758,6 +2758,12 @@ Do not define this macro if you do not define
 is @code{BITS_PER_WORD} bits wide is correct for your machine.
 @end defmac
 
+@deftypefn {Target Hook} bool TARGET_PRESERVE_RELOAD_P (rtx @var{in})
+Called when doing an input reload using the value @var{in}.  Return true
+if the reload register should be available for inheritance later.  This
+might increase the spill pressure, but enhances reload inheritance.
+@end deftypefn
+
 @defmac SMALL_REGISTER_CLASSES
 On some machines, it is risky to let hard registers live across arbitrary
 insns.  Typically, these machines have instructions that require values
@@ -5962,6 +5968,13 @@ will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{MOVE_RATIO}.
 @end defmac
 
+@defmac CAN_MOVE_BY_PIECES (@var{size}, @var{alignment})
+A C expression used to determine whether a chunk of memory is to be copied
+in pieces either by @code{move_by_pieces}, or by a movmem expander.  This
+is used by other optimizers that want to anticipate how a block copy is
+going to be done.  If not defined, @code{MOVE_BY_PIECES_P} is used instead.
+@end defmac
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.