From 08cacc646cc30ce1b3b4b752bfd3920e443624e6 Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@redhat.com>
Date: Fri, 31 Dec 2004 20:11:17 +0000
Subject:         * src/types.c (FFI_TYPE_POINTER): Define with sizeof.        
 (FFI_TYPE_LONGDOUBLE): Fix for ia64.         * src/ia64/ffitarget.h (struct
 ffi_ia64_trampoline_struct): Move         into ffi_prep_closure.         *
 src/ia64/ia64_flags.h, src/ia64/ffi.c, src/ia64/unix.S: Rewrite         from
 scratch.

git-svn-id: https://gcc.gnu.org/svn/gcc/trunk@92774 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libffi/ChangeLog             |   9 +
 libffi/src/ia64/ffi.c        | 947 +++++++++++++++++++------------------------
 libffi/src/ia64/ffitarget.h  |   9 -
 libffi/src/ia64/ia64_flags.h |  47 +--
 libffi/src/ia64/unix.S       | 781 +++++++++++++++++++++++------------
 libffi/src/types.c           |  21 +-
 6 files changed, 957 insertions(+), 857 deletions(-)

(limited to 'libffi')

diff --git a/libffi/ChangeLog b/libffi/ChangeLog
index 20c39cb62a7..a55735efcda 100644
--- a/libffi/ChangeLog
+++ b/libffi/ChangeLog
@@ -1,3 +1,12 @@
+2004-12-31  Richard Henderson  <rth@redhat.com>
+
+	* src/types.c (FFI_TYPE_POINTER): Define with sizeof.
+	(FFI_TYPE_LONGDOUBLE): Fix for ia64.
+	* src/ia64/ffitarget.h (struct ffi_ia64_trampoline_struct): Move 
+	into ffi_prep_closure.
+	* src/ia64/ia64_flags.h, src/ia64/ffi.c, src/ia64/unix.S: Rewrite
+	from scratch.
+
 2004-12-27  Richard Henderson  <rth@redhat.com>
 
 	* src/x86/unix64.S: Fix typo in unwind info.
diff --git a/libffi/src/ia64/ffi.c b/libffi/src/ia64/ffi.c
index 1dc27dbce4f..e810827a81d 100644
--- a/libffi/src/ia64/ffi.c
+++ b/libffi/src/ia64/ffi.c
@@ -29,622 +29,365 @@
 
 #include <stdlib.h>
 #include <stdbool.h>
+#include <float.h>
 
 #include "ia64_flags.h"
 
-/* Memory image of fp register contents.  Should eventually be an fp 	*/
-/* type long enough to hold an entire register.  For now we use double.	*/
-typedef double float80;
-
-/* The stack layout at call to ffi_prep_args.  Other_args will remain	*/
-/* on the stack for the actual call.  Everything else we be transferred	*/
-/* to registers and popped by the assembly code.			*/
-
-struct ia64_args {
-    long scratch[2];	/* Two scratch words at top of stack.		*/
-			/* Allows sp to be passed as arg pointer.	*/
-    void * r8_contents;	/* Value to be passed in r8			*/
-    long spare;		/* Not used.					*/
-    float80 fp_regs[8]; /* Contents of 8 floating point argument 	*/
-			/* registers.					*/
-    long out_regs[8];	/* Contents of the 8 out registers used 	*/
-			/* for integer parameters.			*/
-    long other_args[0]; /* Arguments passed on stack, variable size	*/
-			/* Treated as continuation of out_regs.		*/
+/* A 64-bit pointer value.  In LP64 mode, this is effectively a plain
+   pointer.  In ILP32 mode, it's a pointer that's been extended to 
+   64 bits by "addp4".  */
+typedef void *PTR64 __attribute__((mode(DI)));
+
+/* Memory image of fp register contents.  This is the implementation
+   specific format used by ldf.fill/stf.spill.  All we care about is
+   that it wants a 16 byte aligned slot.  */
+typedef struct
+{
+  UINT64 x[2] __attribute__((aligned(16)));
+} fpreg;
+
+
+/* The stack layout given to ffi_call_unix and ffi_closure_unix_inner.  */
+
+struct ia64_args
+{
+  fpreg fp_regs[8];	/* Contents of 8 fp arg registers.  */
+  UINT64 gp_regs[8];	/* Contents of 8 gp arg registers.  */
+  UINT64 other_args[];	/* Arguments passed on stack, variable size.  */
 };
 
-static size_t float_type_size(unsigned short tp)
+
+/* Adjust ADDR, a pointer to an 8 byte slot, to point to the low LEN bytes.  */
+
+static inline void *
+endian_adjust (void *addr, size_t len)
 {
-  switch(tp) {
-    case FFI_TYPE_FLOAT:
-      return sizeof(float);
-    case FFI_TYPE_DOUBLE:
-      return sizeof(double);
-#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof(long double);
+#ifdef __BIG_ENDIAN__
+  return addr + (8 - len);
+#else
+  return addr;
 #endif
-    default:
-      FFI_ASSERT(0);
-  }
 }
 
-/*
- * Is type a struct containing at most n floats, doubles, or extended
- * doubles, all of the same fp type?
- * If so, set *element_type to the fp type.
- */
-static bool is_homogeneous_fp_aggregate(ffi_type * type, int n,
-				        unsigned short * element_type)
+/* Store VALUE to ADDR in the current cpu implementation's fp spill format.  */
+
+static inline void
+stf_spill(fpreg *addr, __float80 value)
 {
-  ffi_type **ptr; 
-  unsigned short element, struct_element;
+  asm ("stf.spill %0 = %1%P0" : "=m" (*addr) : "f"(value));
+}
+
+/* Load a value from ADDR, which is in the current cpu implementation's
+   fp spill format.  */
 
-  int type_set = 0;
+static inline __float80
+ldf_fill(fpreg *addr)
+{
+  __float80 ret;
+  asm ("ldf.fill %0 = %1%P1" : "=f"(ret) : "m"(*addr));
+  return ret;
+}
 
-  FFI_ASSERT(type != NULL);
+/* Return the size of the C type associated with TYPE.  Which will
+   be one of the FFI_IA64_TYPE_HFA_* values.  */
 
-  FFI_ASSERT(type->elements != NULL);
+static size_t
+hfa_type_size (int type)
+{
+  switch (type)
+    {
+    case FFI_IA64_TYPE_HFA_FLOAT:
+      return sizeof(float);
+    case FFI_IA64_TYPE_HFA_DOUBLE:
+      return sizeof(double);
+    case FFI_IA64_TYPE_HFA_LDOUBLE:
+      return sizeof(__float80);
+    default:
+      abort ();
+    }
+}
 
-  ptr = &(type->elements[0]);
+/* Load from ADDR a value indicated by TYPE.  Which will be one of
+   the FFI_IA64_TYPE_HFA_* values.  */
 
-  while ((*ptr) != NULL)
+static __float80
+hfa_type_load (int type, void *addr)
+{
+  switch (type)
     {
-      switch((*ptr) -> type) {
-	case FFI_TYPE_FLOAT:
-	  if (type_set && element != FFI_TYPE_FLOAT) return 0;
-	  if (--n < 0) return false;
-	  type_set = 1;
-	  element = FFI_TYPE_FLOAT;
-	  break;
-	case FFI_TYPE_DOUBLE:
-	  if (type_set && element != FFI_TYPE_DOUBLE) return 0;
-	  if (--n < 0) return false;
-	  type_set = 1;
-	  element = FFI_TYPE_DOUBLE;
-	  break;
-	case FFI_TYPE_STRUCT:
-	  if (!is_homogeneous_fp_aggregate(type, n, &struct_element))
-	      return false;
-	  if (type_set && struct_element != element) return false;
-	  n -= (type -> size)/float_type_size(element);
-	  element = struct_element;
-	  if (n < 0) return false;
-	  break;
-	/* case FFI_TYPE_LONGDOUBLE:
-	  Not yet implemented.	*/
-	default:
-	  return false;
-      }
-      ptr++;
+    case FFI_IA64_TYPE_HFA_FLOAT:
+      return *(float *) addr;
+    case FFI_IA64_TYPE_HFA_DOUBLE:
+      return *(double *) addr;
+    case FFI_IA64_TYPE_HFA_LDOUBLE:
+      return *(__float80 *) addr;
+    default:
+      abort ();
     }
-  *element_type = element;
-  return true;
-   
-} 
+}
 
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments.  It fills in
-   the arguments in the structure referenced by stack. Returns nonzero
-   if fp registers are used for arguments. */
+/* Store VALUE into ADDR as indicated by TYPE.  Which will be one of
+   the FFI_IA64_TYPE_HFA_* values.  */
 
-static bool
-ffi_prep_args(struct ia64_args *stack, extended_cif *ecif, int bytes)
+static void
+hfa_type_store (int type, void *addr, __float80 value)
 {
-  register long i, avn;
-  register void **p_argv;
-  register long *argp = stack -> out_regs;
-  register float80 *fp_argp = stack -> fp_regs;
-  register ffi_type **p_arg;
-
-  /* For big return structs, r8 needs to contain the target address.	*/
-  /* Since r8 is otherwise dead, we set it unconditionally.		*/
-  stack -> r8_contents = ecif -> rvalue;
-  i = 0;
-  avn = ecif->cif->nargs;
-  p_arg = ecif->cif->arg_types;
-  p_argv = ecif->avalue;
-  while (i < avn)
+  switch (type)
     {
-      size_t z; /* z is in units of arg slots or words, not bytes.	*/
+    case FFI_IA64_TYPE_HFA_FLOAT:
+      *(float *) addr = value;
+      break;
+    case FFI_IA64_TYPE_HFA_DOUBLE:
+      *(double *) addr = value;
+      break;
+    case FFI_IA64_TYPE_HFA_LDOUBLE:
+      *(__float80 *) addr = value;
+      break;
+    default:
+      abort ();
+    }
+}
 
-      switch ((*p_arg)->type)
-	{
-	case FFI_TYPE_SINT8:
-	  z = 1;
-	  *(SINT64 *) argp = *(SINT8 *)(* p_argv);
-	  break;
-		  
-	case FFI_TYPE_UINT8:
-	  z = 1;
-	  *(UINT64 *) argp = *(UINT8 *)(* p_argv);
-	  break;
-		  
-	case FFI_TYPE_SINT16:
-	  z = 1;
-	  *(SINT64 *) argp = *(SINT16 *)(* p_argv);
-	  break;
-		  
-	case FFI_TYPE_UINT16:
-	  z = 1;
-	  *(UINT64 *) argp = *(UINT16 *)(* p_argv);
-	  break;
-		  
-	case FFI_TYPE_SINT32:
-	  z = 1;
-	  *(SINT64 *) argp = *(SINT32 *)(* p_argv);
-	  break;
-		  
-	case FFI_TYPE_UINT32:
-	  z = 1;
-	  *(UINT64 *) argp = *(UINT32 *)(* p_argv);
-	  break;
+/* Is TYPE a struct containing floats, doubles, or extended doubles,
+   all of the same fp type?  If so, return the element type.  Return
+   FFI_TYPE_VOID if not.  */
 
-	case FFI_TYPE_SINT64:
-	case FFI_TYPE_UINT64:
-	case FFI_TYPE_POINTER:
-	  z = 1;
-	  *(UINT64 *) argp = *(UINT64 *)(* p_argv);
-	  break;
+static int
+hfa_element_type (ffi_type *type, int nested)
+{
+  int element = FFI_TYPE_VOID;
 
-	case FFI_TYPE_FLOAT:
-	  z = 1;
-	  if (fp_argp - stack->fp_regs < 8)
-	    {
-	      /* Note the conversion -- all the fp regs are loaded as
-		 doubles.  */
-	      *fp_argp++ = *(float *)(* p_argv);
-	    }
-	  /* Also put it into the integer registers or memory: */
-	  *(UINT64 *) argp = *(UINT32 *)(* p_argv);
-	  break;
+  switch (type->type)
+    {
+    case FFI_TYPE_FLOAT:
+      /* We want to return VOID for raw floating-point types, but the
+	 synthetic HFA type if we're nested within an aggregate.  */
+      if (nested)
+	element = FFI_IA64_TYPE_HFA_FLOAT;
+      break;
 
-	case FFI_TYPE_DOUBLE:
-	  z = 1;
-	  if (fp_argp - stack->fp_regs < 8)
-	    *fp_argp++ = *(double *)(* p_argv);
-	  /* Also put it into the integer registers or memory: */
-	  *(double *) argp = *(double *)(* p_argv);
-	  break;
+    case FFI_TYPE_DOUBLE:
+      /* Similarly.  */
+      if (nested)
+	element = FFI_IA64_TYPE_HFA_DOUBLE;
+      break;
 
-	case FFI_TYPE_STRUCT:
+    case FFI_TYPE_LONGDOUBLE:
+      /* Similarly, except that HFA is true for double extended,
+	 but not quad precision.  Both have sizeof == 16, so tell the
+	 difference based on the precision.  */
+      if (LDBL_MANT_DIG == 64 && nested)
+	element = FFI_IA64_TYPE_HFA_LDOUBLE;
+      break;
+
+    case FFI_TYPE_STRUCT:
+      {
+	ffi_type **ptr = &type->elements[0];
+
+	for (ptr = &type->elements[0]; *ptr ; ptr++)
 	  {
-	      size_t sz = (*p_arg)->size;
-	      unsigned short element_type;
-              z = ((*p_arg)->size + FFI_SIZEOF_ARG - 1)/FFI_SIZEOF_ARG;
-	      if (is_homogeneous_fp_aggregate(*p_arg, 8, &element_type)) {
-		int i;
-		int nelements = sz/float_type_size(element_type);
-		for (i = 0; i < nelements; ++i) {
-		  switch (element_type) {
-		    case FFI_TYPE_FLOAT:
-		      if (fp_argp - stack->fp_regs < 8)
-			*fp_argp++ = ((float *)(* p_argv))[i];
-		      break;
-		    case FFI_TYPE_DOUBLE:
-		      if (fp_argp - stack->fp_regs < 8)
-			*fp_argp++ = ((double *)(* p_argv))[i];
-		      break;
-		    default:
-			/* Extended precision not yet implemented. */
-			abort();
-		  }
-		}
-	      }
-	      /* And pass it in integer registers as a struct, with	*/
-	      /* its actual field sizes packed into registers.		*/
-	      memcpy(argp, *p_argv, (*p_arg)->size);
+	    int sub_element = hfa_element_type (*ptr, 1);
+	    if (sub_element == FFI_TYPE_VOID)
+	      return FFI_TYPE_VOID;
+
+	    if (element == FFI_TYPE_VOID)
+	      element = sub_element;
+	    else if (element != sub_element)
+	      return FFI_TYPE_VOID;
 	  }
-	  break;
-
-	default:
-	  FFI_ASSERT(0);
-	}
+      }
+      break;
 
-      argp += z;
-      i++, p_arg++, p_argv++;
+    default:
+      return FFI_TYPE_VOID;
     }
-  return (fp_argp != stack -> fp_regs);
+
+  return element;
 }
 
-/* Perform machine dependent cif processing */
+
+/* Perform machine dependent cif processing. */
+
 ffi_status
 ffi_prep_cif_machdep(ffi_cif *cif)
 {
-  long i, avn;
-  bool is_simple = true;
-  long simple_flag = FFI_SIMPLE_V;
-  /* Adjust cif->bytes to include space for the 2 scratch words,
-     r8 register contents, spare word,
-     the 8 fp register contents, and all 8 integer register contents.
-     This will be removed before the call, though 2 scratch words must
-     remain.  */
-
-  cif->bytes += 4*sizeof(long) + 8 *sizeof(float80);
+  int flags;
+
+  /* Adjust cif->bytes to include space for the bits of the ia64_args frame
+     that precedes the integer register portion.  The estimate that the 
+     generic bits did for the argument space required is good enough for the
+     integer component.  */
+  cif->bytes += offsetof(struct ia64_args, gp_regs[0]);
   if (cif->bytes < sizeof(struct ia64_args))
     cif->bytes = sizeof(struct ia64_args);
 
-  /* The stack must be double word aligned, so round bytes up
-     appropriately. */
-
-  cif->bytes = ALIGN(cif->bytes, 2*sizeof(void*));
-
-  avn = cif->nargs;
-  if (avn <= 2) {
-    for (i = 0; i < avn; ++i) {
-      switch(cif -> arg_types[i] -> type) {
-	case FFI_TYPE_SINT32:
-	  simple_flag = FFI_ADD_INT_ARG(simple_flag);
-	  break;
-	case FFI_TYPE_SINT64:
-	case FFI_TYPE_UINT64:
-	case FFI_TYPE_POINTER:
-	  simple_flag = FFI_ADD_LONG_ARG(simple_flag);
-	  break;
-	default:
-	  is_simple = false;
-      }
-    }
-  } else {
-    is_simple = false;
-  }
-
-  /* Set the return type flag */
+  /* Set the return type flag. */
+  flags = cif->rtype->type;
   switch (cif->rtype->type)
     {
-    case FFI_TYPE_VOID:
-      cif->flags = FFI_TYPE_VOID;
+    case FFI_TYPE_LONGDOUBLE:
+      /* Leave FFI_TYPE_LONGDOUBLE as meaning double extended precision,
+	 and encode quad precision as a two-word integer structure.  */
+      if (LDBL_MANT_DIG != 64)
+	flags = FFI_IA64_TYPE_SMALL_STRUCT | (16 << 8);
       break;
 
     case FFI_TYPE_STRUCT:
       {
-        size_t sz = cif -> rtype -> size;
-  	unsigned short element_type;
-
-	is_simple = false;
-  	if (is_homogeneous_fp_aggregate(cif -> rtype, 8, &element_type)) {
-	  int nelements = sz/float_type_size(element_type);
-	  if (nelements <= 1) {
-	    if (0 == nelements) {
-	      cif -> flags = FFI_TYPE_VOID;
-	    } else {
-	      cif -> flags = element_type;
-	    }
-	  } else {
-	    switch(element_type) {
-	      case FFI_TYPE_FLOAT:
-	        cif -> flags = FFI_IS_FLOAT_FP_AGGREGATE | nelements;
-		break;
-	      case FFI_TYPE_DOUBLE:
-	        cif -> flags = FFI_IS_DOUBLE_FP_AGGREGATE | nelements;
-		break;
-	      default:
-		/* long double NYI */
-		abort();
-	    }
+        size_t size = cif->rtype->size;
+  	int hfa_type = hfa_element_type (cif->rtype, 0);
+
+	if (hfa_type != FFI_TYPE_VOID)
+	  {
+	    size_t nelts = size / hfa_type_size (hfa_type);
+	    if (nelts <= 8)
+	      flags = hfa_type | (size << 8);
 	  }
-	  break;
-        }
-        if (sz <= 32) {
-	  if (sz <= 8) {
-              cif->flags = FFI_TYPE_INT;
-  	  } else if (sz <= 16) {
-              cif->flags = FFI_IS_SMALL_STRUCT2;
-  	  } else if (sz <= 24) {
-              cif->flags = FFI_IS_SMALL_STRUCT3;
-	  } else {
-              cif->flags = FFI_IS_SMALL_STRUCT4;
+	else
+	  {
+	    if (size <= 32)
+	      flags = FFI_IA64_TYPE_SMALL_STRUCT | (size << 8);
 	  }
-        } else {
-          cif->flags = FFI_TYPE_STRUCT;
-	}
       }
       break;
 
-    case FFI_TYPE_FLOAT:
-      is_simple = false;
-      cif->flags = FFI_TYPE_FLOAT;
-      break;
-
-    case FFI_TYPE_DOUBLE:
-      is_simple = false;
-      cif->flags = FFI_TYPE_DOUBLE;
-      break;
-
     default:
-      cif->flags = FFI_TYPE_INT;
-      /* This seems to depend on little endian mode, and the fact that	*/
-      /* the return pointer always points to at least 8 bytes.  But 	*/
-      /* that also seems to be true for other platforms.		*/
       break;
     }
-  
-  if (is_simple) cif -> flags |= simple_flag;
+  cif->flags = flags;
+
   return FFI_OK;
 }
 
-extern int ffi_call_unix(bool (*)(struct ia64_args *, extended_cif *, int), 
-			 extended_cif *, unsigned, 
-			 unsigned, unsigned *, void (*)());
+extern int ffi_call_unix (struct ia64_args *, PTR64, void (*)(), UINT64);
 
 void
 ffi_call(ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
 {
-  extended_cif ecif;
-  long simple = cif -> flags & FFI_SIMPLE;
-
-  /* Should this also check for Unix ABI? */
-  /* This is almost, but not quite, machine independent.  Note that	*/
-  /* we can get away with not caring about length of the result because	*/
-  /* we assume we are little endian, and the result buffer is large 	*/
-  /* enough.								*/
-  /* This needs work for HP/UX.						*/
-  if (simple) {
-    long (*lfn)() = (long (*)())fn;
-    long result;
-    switch(simple) {
-      case FFI_SIMPLE_V:
-	result = lfn();
-	break;
-      case FFI_SIMPLE_I:
-	result = lfn(*(int *)avalue[0]);
-	break;
-      case FFI_SIMPLE_L:
-	result = lfn(*(long *)avalue[0]);
-	break;
-      case FFI_SIMPLE_II:
-	result = lfn(*(int *)avalue[0], *(int *)avalue[1]);
-	break;
-      case FFI_SIMPLE_IL:
-	result = lfn(*(int *)avalue[0], *(long *)avalue[1]);
-	break;
-      case FFI_SIMPLE_LI:
-	result = lfn(*(long *)avalue[0], *(int *)avalue[1]);
-	break;
-      case FFI_SIMPLE_LL:
-	result = lfn(*(long *)avalue[0], *(long *)avalue[1]);
-	break;
-    }
-    if ((cif->flags & ~FFI_SIMPLE) != FFI_TYPE_VOID && 0 != rvalue) {
-      * (long *)rvalue = result;
-    }
-    return;
-  }
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return
-     value address then we need to make one.  */
-  
-  if (rvalue == NULL && cif->rtype->type == FFI_TYPE_STRUCT)
-    ecif.rvalue = alloca(cif->rtype->size);
-  else
-    ecif.rvalue = rvalue;
-    
-  switch (cif->abi) 
-    {
-    case FFI_UNIX:
-      ffi_call_unix(ffi_prep_args, &ecif, cif->bytes,
-		    cif->flags, rvalue, fn);
-      break;
+  struct ia64_args *stack;
+  long i, avn, gpcount, fpcount;
+  ffi_type **p_arg;
 
-    default:
-      FFI_ASSERT(0);
-      break;
-    }
-}
-
-/*
- * Closures represent a pair consisting of a function pointer, and
- * some user data.  A closure is invoked by reinterpreting the closure
- * as a function pointer, and branching to it.  Thus we can make an
- * interpreted function callable as a C function:  We turn the interpreter
- * itself, together with a pointer specifying the interpreted procedure,
- * into a closure.
- * On X86, the first few words of the closure structure actually contain code,
- * which will do the right thing.  On most other architectures, this
- * would raise some Icache/Dcache coherence issues (which can be solved, but
- * often not cheaply).
- * For IA64, function pointer are already pairs consisting of a code
- * pointer, and a gp pointer.  The latter is needed to access global variables.
- * Here we set up such a pair as the first two words of the closure (in
- * the "trampoline" area), but we replace the gp pointer with a pointer
- * to the closure itself.  We also add the real gp pointer to the
- * closure.  This allows the function entry code to both retrieve the
- * user data, and to restire the correct gp pointer.
- */
-
-static void 
-ffi_prep_incoming_args_UNIX(struct ia64_args *args, void **rvalue,
-			    void **avalue, ffi_cif *cif);
-
-/* This function is entered with the doctored gp (r1) value.
- * This code is extremely gcc specific.  There is some argument that
- * it should really be written in assembly code, since it depends on
- * gcc properties that might change over time.
- */
-
-/* ffi_closure_UNIX is an assembly routine, which copies the register 	*/
-/* state into a struct ia64_args, and then invokes			*/
-/* ffi_closure_UNIX_inner.  It also recovers the closure pointer	*/
-/* from its fake gp pointer.						*/
-void ffi_closure_UNIX();
-
-#ifndef __GNUC__
-#   error This requires gcc
-#endif
-void
-ffi_closure_UNIX_inner (ffi_closure *closure, struct ia64_args * args)
-/* Hopefully declaring this as a varargs function will force all args	*/
-/* to memory.								*/
-{
-  // this is our return value storage
-  long double    res;
-
-  // our various things...
-  ffi_cif       *cif;
-  unsigned short rtype;
-  void          *resp;
-  void		**arg_area;
-
-  resp = (void*)&res;
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will re-set RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_UNIX(args, (void**)&resp, arg_area, cif);
-  
-  (closure->fun) (cif, resp, arg_area, closure->user_data);
-
-  rtype = cif->flags;
-
-  /* now, do a generic return based on the value of rtype */
-  if (rtype == FFI_TYPE_INT)
-    {
-      asm volatile ("ld8 r8=[%0]" : : "r" (resp) : "r8");
-    }
-  else if (rtype == FFI_TYPE_FLOAT)
-    {
-      asm volatile ("ldfs f8=[%0]" : : "r" (resp) : "f8");
-    }
-  else if (rtype == FFI_TYPE_DOUBLE)
-    {
-      asm volatile ("ldfd f8=[%0]" : : "r" (resp) : "f8");
-    }
-  else if (rtype == FFI_IS_SMALL_STRUCT2)
-    {
-      asm volatile ("ld8 r8=[%0]; ld8 r9=[%1]"
-		    : : "r" (resp), "r" (resp+8) : "r8","r9");
-    }
-  else if (rtype == FFI_IS_SMALL_STRUCT3)
-    {
-      asm volatile ("ld8 r8=[%0]; ld8 r9=[%1]; ld8 r10=[%2]"
-		    : : "r" (resp), "r" (resp+8), "r" (resp+16)
-		    : "r8","r9","r10");
-    }
-  else if (rtype == FFI_IS_SMALL_STRUCT4)
-    {
-      asm volatile ("ld8 r8=[%0]; ld8 r9=[%1]; ld8 r10=[%2]; ld8 r11=[%3]"
-		    : : "r" (resp), "r" (resp+8), "r" (resp+16), "r" (resp+24)
-		    : "r8","r9","r10","r11");
-    }
-  else if (rtype != FFI_TYPE_VOID && rtype != FFI_TYPE_STRUCT)
-    {
-      /* Can only happen for homogeneous FP aggregates?	*/
-      abort();
-    }
-}
+  FFI_ASSERT (cif->abi == FFI_UNIX);
 
-static void 
-ffi_prep_incoming_args_UNIX(struct ia64_args *args, void **rvalue,
-			    void **avalue, ffi_cif *cif)
-{
-  register unsigned int i;
-  register unsigned int avn;
-  register void **p_argv;
-  register long *argp = args -> out_regs;
-  unsigned fp_reg_num = 0;
-  register ffi_type **p_arg;
+  /* If we have no spot for a return value, make one.  */
+  if (rvalue == NULL && cif->rtype->type != FFI_TYPE_VOID)
+    rvalue = alloca (cif->rtype->size);
+    
+  /* Allocate the stack frame.  */
+  stack = alloca (cif->bytes);
 
+  gpcount = fpcount = 0;
   avn = cif->nargs;
-  p_argv = avalue;
-
-  for (i = cif->nargs, p_arg = cif->arg_types; i != 0; i--, p_arg++)
+  for (i = 0, p_arg = cif->arg_types; i < avn; i++, p_arg++)
     {
-      size_t z; /* In units of words or argument slots.	*/
-
       switch ((*p_arg)->type)
 	{
 	case FFI_TYPE_SINT8:
+	  stack->gp_regs[gpcount++] = *(SINT8 *)avalue[i];
+	  break;
 	case FFI_TYPE_UINT8:
+	  stack->gp_regs[gpcount++] = *(UINT8 *)avalue[i];
+	  break;
 	case FFI_TYPE_SINT16:
+	  stack->gp_regs[gpcount++] = *(SINT16 *)avalue[i];
+	  break;
 	case FFI_TYPE_UINT16:
+	  stack->gp_regs[gpcount++] = *(UINT16 *)avalue[i];
+	  break;
 	case FFI_TYPE_SINT32:
+	  stack->gp_regs[gpcount++] = *(SINT32 *)avalue[i];
+	  break;
 	case FFI_TYPE_UINT32:
+	  stack->gp_regs[gpcount++] = *(UINT32 *)avalue[i];
+	  break;
 	case FFI_TYPE_SINT64:
 	case FFI_TYPE_UINT64:
+	  stack->gp_regs[gpcount++] = *(UINT64 *)avalue[i];
+	  break;
+
 	case FFI_TYPE_POINTER:
-	  z = 1;
-	  *p_argv = (void *)argp;
+	  stack->gp_regs[gpcount++] = (UINT64)(PTR64) *(void **)avalue[i];
 	  break;
-		  
+
 	case FFI_TYPE_FLOAT:
-	  z = 1;
-	  /* Convert argument back to float in place from the saved value */
-	  if (argp - args->out_regs < 8 && fp_reg_num < 8) {
-	      *(float *)argp = args -> fp_regs[fp_reg_num++];
-	  }
-	  *p_argv = (void *)argp;
+	  if (gpcount < 8 && fpcount < 8)
+	    stf_spill (&stack->fp_regs[fpcount++], *(float *)avalue[i]);
+	  stack->gp_regs[gpcount++] = *(UINT32 *)avalue[i];
 	  break;
 
 	case FFI_TYPE_DOUBLE:
-	  z = 1;
-	  if (argp - args->out_regs < 8 && fp_reg_num < 8) {
-	      *p_argv = args -> fp_regs + fp_reg_num++;
-	  } else {
-	      *p_argv = (void *)argp;
-	  }
+	  if (gpcount < 8 && fpcount < 8)
+	    stf_spill (&stack->fp_regs[fpcount++], *(double *)avalue[i]);
+	  stack->gp_regs[gpcount++] = *(UINT64 *)avalue[i];
+	  break;
+
+	case FFI_TYPE_LONGDOUBLE:
+	  if (gpcount & 1)
+	    gpcount++;
+	  if (LDBL_MANT_DIG == 64 && gpcount < 8 && fpcount < 8)
+	    stf_spill (&stack->fp_regs[fpcount++], *(__float80 *)avalue[i]);
+	  memcpy (&stack->gp_regs[gpcount], avalue[i], 16);
+	  gpcount += 2;
 	  break;
 
 	case FFI_TYPE_STRUCT:
 	  {
-	      size_t sz = (*p_arg)->size;
-	      unsigned short element_type;
-              z = ((*p_arg)->size + FFI_SIZEOF_ARG - 1)/FFI_SIZEOF_ARG;
-	      if (argp - args->out_regs < 8
-		  && is_homogeneous_fp_aggregate(*p_arg, 8, &element_type)) {
-		int nelements = sz/float_type_size(element_type);
-		if (nelements + fp_reg_num >= 8) {
-		  /* hard case NYI.	*/
-		  abort();
-		}
-		if (element_type == FFI_TYPE_DOUBLE) {
-	          *p_argv = args -> fp_regs + fp_reg_num;
-		  fp_reg_num += nelements;
-		  break;
-		}
-		if (element_type == FFI_TYPE_FLOAT) {
-		  int j;
-		  for (j = 0; j < nelements; ++ j) {
-		     ((float *)argp)[j] = args -> fp_regs[fp_reg_num + j];
+	    size_t size = (*p_arg)->size;
+	    size_t align = (*p_arg)->alignment;
+	    int hfa_type = hfa_element_type (*p_arg, 0);
+
+	    FFI_ASSERT (align <= 16);
+	    if (align == 16 && (gpcount & 1))
+	      gpcount++;
+
+	    if (hfa_type != FFI_TYPE_VOID)
+	      {
+		size_t hfa_size = hfa_type_size (hfa_type);
+		size_t offset = 0;
+		size_t gp_offset = gpcount * 8;
+
+		while (fpcount < 8
+		       && offset < size
+		       && gp_offset < 8 * 8)
+		  {
+		    stf_spill (&stack->fp_regs[fpcount],
+			       hfa_type_load (hfa_type, avalue[i] + offset));
+		    offset += hfa_size;
+		    gp_offset += hfa_size;
+		    fpcount += 1;
 		  }
-	          *p_argv = (void *)argp;
-		  fp_reg_num += nelements;
-		  break;
-		}
-		abort();  /* Other fp types NYI */
 	      }
+
+	    memcpy (&stack->gp_regs[gpcount], avalue[i], size);
+	    gpcount += (size + 7) / 8;
 	  }
 	  break;
 
 	default:
-	  FFI_ASSERT(0);
+	  abort ();
 	}
-
-      argp += z;
-      p_argv++;
-
     }
-  
-  return;
+
+  ffi_call_unix (stack, rvalue, fn, cif->flags);
 }
 
+/* Closures represent a pair consisting of a function pointer, and
+   some user data.  A closure is invoked by reinterpreting the closure
+   as a function pointer, and branching to it.  Thus we can make an
+   interpreted function callable as a C function: We turn the
+   interpreter itself, together with a pointer specifying the
+   interpreted procedure, into a closure.
 
-/* Fill in a closure to refer to the specified fun and user_data.	*/
-/* cif specifies the argument and result types for fun.			*/
-/* the cif must already be prep'ed */
+   For IA64, function pointers are already pairs consisting of a code
+   pointer, and a gp pointer.  The latter is needed to access global
+   variables.  Here we set up such a pair as the first two words of
+   the closure (in the "trampoline" area), but we replace the gp
+   pointer with a pointer to the closure itself.  We also add the real
+   gp pointer to the closure.  This allows the function entry code to
+   both retrieve the user data, and to restore the correct gp pointer.  */
 
-/* The layout of a function descriptor.  A C function pointer really 	*/
-/* points to one of these.						*/
-typedef struct ia64_fd_struct {
-    void *code_pointer;
-    void *gp;
-} ia64_fd;
+extern void ffi_closure_unix ();
 
 ffi_status
 ffi_prep_closure (ffi_closure* closure,
@@ -652,20 +395,168 @@ ffi_prep_closure (ffi_closure* closure,
 		  void (*fun)(ffi_cif*,void*,void**,void*),
 		  void *user_data)
 {
-  struct ffi_ia64_trampoline_struct *tramp =
-    (struct ffi_ia64_trampoline_struct *) (closure -> tramp);
-  ia64_fd *fd = (ia64_fd *)(void *)ffi_closure_UNIX;
+  /* The layout of a function descriptor.  A C function pointer really 
+     points to one of these.  */
+  struct ia64_fd
+  {
+    UINT64 code_pointer;
+    UINT64 gp;
+  };
+
+  struct ffi_ia64_trampoline_struct
+  {
+    UINT64 code_pointer;	/* Pointer to ffi_closure_unix.  */
+    UINT64 fake_gp;		/* Pointer to closure, installed as gp.  */
+    UINT64 real_gp;		/* Real gp value.  */
+  };
+
+  struct ffi_ia64_trampoline_struct *tramp;
+  struct ia64_fd *fd;
 
   FFI_ASSERT (cif->abi == FFI_UNIX);
 
-  tramp -> code_pointer = fd -> code_pointer;
-  tramp -> real_gp = fd -> gp;
-  tramp -> fake_gp = closure;
-  closure->cif  = cif;
+  tramp = (struct ffi_ia64_trampoline_struct *)closure->tramp;
+  fd = (struct ia64_fd *)(void *)ffi_closure_unix;
+
+  tramp->code_pointer = fd->code_pointer;
+  tramp->real_gp = fd->gp;
+  tramp->fake_gp = (UINT64)(PTR64)closure;
+  closure->cif = cif;
   closure->user_data = user_data;
-  closure->fun  = fun;
+  closure->fun = fun;
 
   return FFI_OK;
 }
 
 
+UINT64
+ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
+			void *rvalue, void *r8)
+{
+  ffi_cif *cif;
+  void **avalue;
+  ffi_type **p_arg;
+  long i, avn, gpcount, fpcount;
+
+  cif = closure->cif;
+  avn = cif->nargs;
+  avalue = alloca (avn * sizeof (void *));
+
+  /* If the structure return value is passed in memory get that location
+     from r8 so as to pass the value directly back to the caller.  */
+  if (cif->flags == FFI_TYPE_STRUCT)
+    rvalue = r8;
+
+  gpcount = fpcount = 0;
+  for (i = 0, p_arg = cif->arg_types; i < avn; i++, p_arg++)
+    {
+      switch ((*p_arg)->type)
+	{
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT8:
+	  avalue[i] = endian_adjust(&stack->gp_regs[gpcount++], 1);
+	  break;
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_UINT16:
+	  avalue[i] = endian_adjust(&stack->gp_regs[gpcount++], 2);
+	  break;
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_UINT32:
+	  avalue[i] = endian_adjust(&stack->gp_regs[gpcount++], 4);
+	  break;
+	case FFI_TYPE_SINT64:
+	case FFI_TYPE_UINT64:
+	  avalue[i] = &stack->gp_regs[gpcount++];
+	  break;
+	case FFI_TYPE_POINTER:
+	  avalue[i] = endian_adjust(&stack->gp_regs[gpcount++], sizeof(void*));
+	  break;
+
+	case FFI_TYPE_FLOAT:
+	  if (gpcount < 8 && fpcount < 8)
+	    {
+	      void *addr = &stack->fp_regs[fpcount++];
+	      avalue[i] = addr;
+	      *(float *)addr = ldf_fill (addr);
+	    }
+	  else
+	    avalue[i] = endian_adjust(&stack->gp_regs[gpcount], 4);
+	  gpcount++;
+	  break;
+
+	case FFI_TYPE_DOUBLE:
+	  if (gpcount < 8 && fpcount < 8)
+	    {
+	      void *addr = &stack->fp_regs[fpcount++];
+	      avalue[i] = addr;
+	      *(double *)addr = ldf_fill (addr);
+	    }
+	  else
+	    avalue[i] = &stack->gp_regs[gpcount];
+	  gpcount++;
+	  break;
+
+	case FFI_TYPE_LONGDOUBLE:
+	  if (gpcount & 1)
+	    gpcount++;
+	  if (LDBL_MANT_DIG == 64 && gpcount < 8 && fpcount < 8)
+	    {
+	      void *addr = &stack->fp_regs[fpcount++];
+	      avalue[i] = addr;
+	      *(__float80 *)addr = ldf_fill (addr);
+	    }
+	  else
+	    avalue[i] = &stack->gp_regs[gpcount];
+	  gpcount += 2;
+	  break;
+
+	case FFI_TYPE_STRUCT:
+	  {
+	    size_t size = (*p_arg)->size;
+	    size_t align = (*p_arg)->alignment;
+	    int hfa_type = hfa_element_type (*p_arg, 0);
+
+	    FFI_ASSERT (align <= 16);
+	    if (align == 16 && (gpcount & 1))
+	      gpcount++;
+
+	    if (hfa_type != FFI_TYPE_VOID)
+	      {
+		size_t hfa_size = hfa_type_size (hfa_type);
+		size_t offset = 0;
+		size_t gp_offset = gpcount * 8;
+		void *addr = alloca (size);
+
+		avalue[i] = addr;
+
+		while (fpcount < 8
+		       && offset < size
+		       && gp_offset < 8 * 8)
+		  {
+		    hfa_type_store (hfa_type, addr + offset, 
+				    ldf_fill (&stack->fp_regs[fpcount]));
+		    offset += hfa_size;
+		    gp_offset += hfa_size;
+		    fpcount += 1;
+		  }
+
+		if (offset < size)
+		  memcpy (addr + offset, (char *)stack->gp_regs + gp_offset,
+			  size - offset);
+	      }
+	    else
+	      avalue[i] = &stack->gp_regs[gpcount];
+
+	    gpcount += (size + 7) / 8;
+	  }
+	  break;
+
+	default:
+	  abort ();
+	}
+    }
+
+  closure->fun (cif, rvalue, avalue, closure->user_data);
+
+  return cif->flags;
+}
diff --git a/libffi/src/ia64/ffitarget.h b/libffi/src/ia64/ffitarget.h
index 3b7865442d7..2f98d51c429 100644
--- a/libffi/src/ia64/ffitarget.h
+++ b/libffi/src/ia64/ffitarget.h
@@ -45,14 +45,5 @@ typedef enum ffi_abi {
 				/* can be interpreted as a C function	*/
 				/* descriptor:				*/
 
-#ifndef LIBFFI_ASM
-struct ffi_ia64_trampoline_struct {
-    void * code_pointer;	/* Pointer to ffi_closure_UNIX	*/
-    void * fake_gp;		/* Pointer to closure, installed as gp	*/
-    void * real_gp;		/* Real gp value, reinstalled by 	*/
-				/* ffi_closure_UNIX.			*/
-};
-#endif
-
 #endif
 
diff --git a/libffi/src/ia64/ia64_flags.h b/libffi/src/ia64/ia64_flags.h
index 23dbd3e0237..1dd6d7e3feb 100644
--- a/libffi/src/ia64/ia64_flags.h
+++ b/libffi/src/ia64/ia64_flags.h
@@ -25,38 +25,15 @@
    OTHER DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-
-/* Homogeneous Floating Point Aggregates (HFAs) which are returned	*/
-/* in FP registers.  The least significant bits specify the size in 	*/
-/* words.								*/
-#define FFI_IS_FLOAT_FP_AGGREGATE 0x1000
-#define FFI_IS_DOUBLE_FP_AGGREGATE 0x0800
-#define FLOAT_FP_AGGREGATE_BIT 12
-#define DOUBLE_FP_AGGREGATE_BIT 11
-
-/* Small structures containing N words.  If N=1, they are returned	*/
-/* as though they were integers.					*/
-#define FFI_IS_SMALL_STRUCT2	0x40 /* Struct > 8, <=16 bytes	*/
-#define FFI_IS_SMALL_STRUCT3	0x41 /* Struct > 16 <= 24 bytes	*/
-#define FFI_IS_SMALL_STRUCT4	0x42 /* Struct > 24, <=32 bytes	*/
-
-/* Flag values identifying particularly simple cases, which are 	*/
-/* handled specially.  We treat functions as simple if they take all	*/
-/* arguments can be passed as 32 or 64 bit integer quantities, there is	*/
-/* either no return value or it can be treated as a 64bit integer, and	*/
-/* if there are at most 2 arguments.					*/
-/* This is OR'ed with the normal flag values.				*/
-#define FFI_SIMPLE_V 0x10000	/* () -> X	*/
-#define FFI_SIMPLE_I 0x20000	/* (int) -> X	*/
-#define FFI_SIMPLE_L 0x30000	/* (long) -> X	*/
-#define FFI_SIMPLE_II 0x40000	/* (int,int) -> X	*/
-#define FFI_SIMPLE_IL 0x50000	/* (int,long) -> X	*/
-#define FFI_SIMPLE_LI 0x60000	/* (long,int) -> X	*/
-#define FFI_SIMPLE_LL 0x70000	/* (long,long) -> X	*/
-
-/* Mask for all of the FFI_SIMPLE bits:	*/
-#define FFI_SIMPLE 0xf0000
-
-/* An easy way to build FFI_SIMPLE flags from FFI_SIMPLE_V:	*/
-#define FFI_ADD_LONG_ARG(flag) (((flag) << 1) | 0x10000)
-#define FFI_ADD_INT_ARG(flag) ((flag) << 1)
+/* "Type" codes used between assembly and C.  When used as a part of
+   a cif->flags value, the low byte will be these extra type codes,
+   and bits 8-31 will be the actual size of the type.  */
+
+/* Small structures of up to four 8-byte words, returned in r8-r11.  */
+#define FFI_IA64_TYPE_SMALL_STRUCT	(FFI_TYPE_LAST + 1)
+
+/* Homogeneous Floating Point Aggregates (HFAs) which are returned
+   in FP registers f8-f15 (at most eight elements).  */
+#define FFI_IA64_TYPE_HFA_FLOAT		(FFI_TYPE_LAST + 2)
+#define FFI_IA64_TYPE_HFA_DOUBLE	(FFI_TYPE_LAST + 3)
+#define FFI_IA64_TYPE_HFA_LDOUBLE	(FFI_TYPE_LAST + 4)
diff --git a/libffi/src/ia64/unix.S b/libffi/src/ia64/unix.S
index be267f60a9a..7c68b2d3a62 100644
--- a/libffi/src/ia64/unix.S
+++ b/libffi/src/ia64/unix.S
@@ -33,295 +33,542 @@
 #include <ffi.h>
 #include "ia64_flags.h"
 
-/* parameters:	*/
-#define callback	in0
-#define ecifp		in1
-#define bytes		in2
-#define flags		in3
-#define raddr		in4
-#define fn		in5
-
-#define FLOAT_SZ	8 /* in-memory size of fp operands	*/
-
-/* Allocate an ia64_args structure on the stack; call ffi_prep_args	*/
-/* to fill it in with argument values; copy those to the real 		*/
-/* registers, leaving overflow arguments on the stack.  Then call fn	*/
-/* and move the result from registers into *raddr.			*/
 	.pred.safe_across_calls p1-p5,p16-p63
 .text
+
+/* int ffi_call_unix (struct ia64_args *stack, PTR64 rvalue,
+		      void (*fn)(), int flags);
+ */
+
         .align 16
-        .global ffi_call_unix
-        .proc ffi_call_unix
+        .global	ffi_call_unix
+        .proc	ffi_call_unix
 ffi_call_unix:
 	.prologue
-	.save	ar.pfs,r38 /* loc0 */
-	alloc   loc0=ar.pfs,6,6,8,0
-	.save	rp,loc1
-	mov 	loc1=b0;
-	.vframe	loc5
-	mov	loc5=sp;
+	/* Bit o trickiness.  We actually share a stack frame with ffi_call.
+	   Rely on the fact that ffi_call uses a vframe and don't bother
+	   tracking one here at all.  */
+	.fframe	0
+	.save	ar.pfs, r36 // loc0
+	alloc   loc0 = ar.pfs, 4, 3, 8, 0
+	.save	rp, loc1
+	mov 	loc1 = b0
 	.body
-	sub	sp=sp,bytes
-	mov	loc4=r1		/* Save gp 	*/
-	ld8	r8=[callback],8	/* code address of callback	*/
-	;;
-	mov 	out0=sp
-	mov	out1=ecifp
-	mov	out2=bytes
-	ld8	r1=[callback]	/* Set up gp for callback.  Unnecessary? */
-	mov	b6=r8
-	;;
-	br.call.sptk.many b0 = b6	/* call ffi_prep_args		*/
-	cmp.eq	p6,p0=0,r8		/* r8 nonzero ==> need fp regs	*/
- 	;;
-(p6)	add	loc2=32+8*FLOAT_SZ,sp
-(p6)	br.cond.dptk.many	fp_done
-	;;	/* Quiets warning; needed?	*/
-	add	loc2=32,sp
-	add	loc3=32+FLOAT_SZ,sp
-	;;
-	ldfd	f8=[loc2],2*FLOAT_SZ
-	ldfd	f9=[loc3],2*FLOAT_SZ
-	;;
-	ldfd	f10=[loc2],2*FLOAT_SZ
-	ldfd	f11=[loc3],2*FLOAT_SZ
-	;;
-	ldfd	f12=[loc2],2*FLOAT_SZ
-	ldfd	f13=[loc3],2*FLOAT_SZ
-	;;
-	ldfd	f14=[loc2],2*FLOAT_SZ
-	ldfd	f15=[loc3]
-	;;
-fp_done:
-	add	r9=16,sp	/* Pointer to r8_contents	*/
-	/* loc2 points at first integer register value.  */
-	add	loc3=8,loc2
-	;;
-	ld8	r8=[r9]		/* Just in case we return large struct */
-	ld8	out0=[loc2],16
-	ld8	out1=[loc3],16
-	;;
-	ld8	out2=[loc2],16
-	ld8	out3=[loc3],16
-	;;
-	ld8	out4=[loc2],16
-	ld8	out5=[loc3],16
-	;;
-	ld8	out6=[loc2]
-	ld8	out7=[loc3]
-        /* Set sp to 16 bytes below the first stack parameter.  This    */
-        /* is the value currently in loc2.                              */
-	mov	sp=loc2
-	
-	ld8 	r8=[fn],8
-	;;
-	ld8	r1=[fn]		/* Set up gp */
-	mov	b6=r8;;
-	br.call.sptk.many b0 = b6	/* call fn	*/
-	
-	/* Handle return value. */
-	cmp.eq	p6,p0=0,raddr
-	cmp.eq	p7,p0=FFI_TYPE_INT,flags
-	cmp.eq	p10,p0=FFI_IS_SMALL_STRUCT2,flags
-	cmp.eq	p11,p0=FFI_IS_SMALL_STRUCT3,flags
-	cmp.eq	p12,p0=FFI_IS_SMALL_STRUCT4,flags
-	;;
-(p6) 	br.cond.dpnt.few done		/* Dont copy ret values if raddr = 0 */
-(p7)	br.cond.dptk.few copy1
-(p10)	br.cond.dpnt.few copy2
-(p11)	br.cond.dpnt.few copy3
-(p12)	br.cond.dpnt.few copy4
-	cmp.eq	p8,p0=FFI_TYPE_FLOAT,flags
-	cmp.eq	p9,p0=FFI_TYPE_DOUBLE,flags
-	tbit.nz	p6,p0=flags,FLOAT_FP_AGGREGATE_BIT
-	tbit.nz	p7,p0=flags,DOUBLE_FP_AGGREGATE_BIT
-	;;
-(p8)	stfs	[raddr]=f8
-(p9)	stfd	[raddr]=f8
+	add	r16 = 16, in0
+	mov	loc2 = gp
+	mov	r8 = in1
+	;;
+
+	/* Load up all of the argument registers.  */
+	ldf.fill f8 = [in0], 32
+	ldf.fill f9 = [r16], 32
+	;;
+	ldf.fill f10 = [in0], 32
+	ldf.fill f11 = [r16], 32
+	;;
+	ldf.fill f12 = [in0], 32
+	ldf.fill f13 = [r16], 32
+	;;
+	ldf.fill f14 = [in0], 32
+	ldf.fill f15 = [r16], 24
+	;;
+	ld8	out0 = [in0], 16
+	ld8	out1 = [r16], 16
+	;;
+	ld8	out2 = [in0], 16
+	ld8	out3 = [r16], 16
+	;;
+	ld8	out4 = [in0], 16
+	ld8	out5 = [r16], 16
+	;;
+	ld8	out6 = [in0]
+	ld8	out7 = [r16]
+	;;
+
+	/* Deallocate the register save area from the stack frame.  */
+	mov	sp = in0
+
+	/* Call the target function.  */
+	ld8	r16 = [in2], 8
+	;;
+	ld8	gp = [in2]
+	mov	b6 = r16
+	br.call.sptk.many b0 = b6
+	;;
+
+	/* Dispatch to handle return value.  */
+	mov	gp = loc2
+	zxt1	r16 = in3
+	;;
+	mov	ar.pfs = loc0
+	addl	r18 = @ltoffx(.Lst_table), gp
+	;;
+	ld8.mov	r18 = [r18], .Lst_table
+	mov	b0 = loc1
+	;;
+	shladd	r18 = r16, 3, r18
+	;;
+	ld8	r17 = [r18]
+	shr	in3 = in3, 8
+	;;
+	add	r17 = r17, r18
+	;;
+	mov	b6 = r17
+	br	b6
+	;;
+
+.Lst_void:	// nothing to store; b0 and ar.pfs were restored before the dispatch
+	br.ret.sptk.many b0
+	;;
+.Lst_uint8:	// zero-extend low byte of r8, store all 8 bytes at rvalue (in1)
+	zxt1	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_sint8:	// sign-extend low byte of r8, store all 8 bytes at rvalue (in1)
+	sxt1	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_uint16:	// zero-extend low 2 bytes of r8, store 8 bytes at rvalue (in1)
+	zxt2	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_sint16:	// sign-extend low 2 bytes of r8, store 8 bytes at rvalue (in1)
+	sxt2	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_uint32:	// zero-extend low 4 bytes of r8, store 8 bytes at rvalue (in1)
+	zxt4	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_sint32:	// sign-extend low 4 bytes of r8, store 8 bytes at rvalue (in1)
+	sxt4	r8 = r8
+	;;
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_int64:	// store r8 unchanged; also the table target for pointers
+	st8	[in1] = r8
+	br.ret.sptk.many b0
+	;;
+.Lst_float:	// store f8 at rvalue (in1) in single-precision format
+	stfs	[in1] = f8
+	br.ret.sptk.many b0
+	;;
+.Lst_double:	// store f8 at rvalue (in1) in double-precision format
+	stfd	[in1] = f8
+	br.ret.sptk.many b0
+	;;
+.Lst_ldouble:	// store f8 at rvalue (in1) in extended-precision format
+	stfe	[in1] = f8
+	br.ret.sptk.many b0
 	;;
-	.label_state 1
-(p6)	br.cond.dpnt.few handle_float_hfa
-(p7)	br.cond.dpnt.few handle_double_hfa
-	br done
 
-copy4:
-	add	loc3=24,raddr
+.Lst_small_struct:
+	add	sp = -16, sp
+	cmp.lt	p6, p0 = 8, in3
+	cmp.lt	p7, p0 = 16, in3
+	cmp.lt	p8, p0 = 24, in3
+	;;
+	add	r16 = 8, sp
+	add	r17 = 16, sp
+	add	r18 = 24, sp
+	;;
+	st8	[sp] = r8
+(p6)	st8	[r16] = r9
+	mov	out0 = in1
+(p7)	st8	[r17] = r10
+(p8)	st8	[r18] = r11
+	mov	out1 = sp
+	mov	out2 = in3
+	br.call.sptk.many b0 = memcpy#
 	;;
-	st8	[loc3]=r11
-copy3:
-	add	loc3=16,raddr
+	mov	ar.pfs = loc0
+	mov	b0 = loc1
+	mov	gp = loc2
+	br.ret.sptk.many b0
+
+.Lst_hfa_float:	// in3 = HFA size in bytes (flags >> 8); store f8.. as 4-byte floats at in1
+	add	r16 = 4, in1	// r16 walks the odd elements, in1 the even ones
+	cmp.lt	p6, p0 = 4, in3	// pN is set when size exceeds the constant, i.e. the next element exists
+	;;
+	stfs	[in1] = f8, 8
+(p6)	stfs	[r16] = f9, 8
+	cmp.lt	p7, p0 = 8, in3
+	cmp.lt	p8, p0 = 12, in3
+	;;
+(p7)	stfs	[in1] = f10, 8
+(p8)	stfs	[r16] = f11, 8
+	cmp.lt	p9, p0 = 16, in3
+	cmp.lt	p10, p0 = 20, in3
+	;;
+(p9)	stfs	[in1] = f12, 8
+(p10)	stfs	[r16] = f13, 8
+	cmp.lt	p6, p0 = 24, in3
+	cmp.lt	p7, p0 = 28, in3
+	;;
+(p6)	stfs	[in1] = f14
+(p7)	stfs	[r16] = f15
+	br.ret.sptk.many b0
+	;;
+
+.Lst_hfa_double:	// same scheme as .Lst_hfa_float with 8-byte elements
+	add	r16 = 8, in1
+	cmp.lt	p6, p0 = 8, in3
+	;;
+	stfd	[in1] = f8, 16
+(p6)	stfd	[r16] = f9, 16
+	cmp.lt	p7, p0 = 16, in3
+	cmp.lt	p8, p0 = 24, in3
+	;;
+(p7)	stfd	[in1] = f10, 16
+(p8)	stfd	[r16] = f11, 16
+	cmp.lt	p9, p0 = 32, in3
+	cmp.lt	p10, p0 = 40, in3
+	;;
+(p9)	stfd	[in1] = f12, 16
+(p10)	stfd	[r16] = f13, 16
+	cmp.lt	p6, p0 = 48, in3
+	cmp.lt	p7, p0 = 56, in3
+	;;
+(p6)	stfd	[in1] = f14
+(p7)	stfd	[r16] = f15
+	br.ret.sptk.many b0
+	;;
+
+.Lst_hfa_ldouble:	// same scheme with extended-precision stores at a 16-byte element stride
+	add	r16 = 16, in1
+	cmp.lt	p6, p0 = 16, in3
+	;;
+	stfe	[in1] = f8, 32
+(p6)	stfe	[r16] = f9, 32
+	cmp.lt	p7, p0 = 32, in3
+	cmp.lt	p8, p0 = 48, in3
+	;;
+(p7)	stfe	[in1] = f10, 32
+(p8)	stfe	[r16] = f11, 32
+	cmp.lt	p9, p0 = 64, in3
+	cmp.lt	p10, p0 = 80, in3
+	;;
+(p9)	stfe	[in1] = f12, 32
+(p10)	stfe	[r16] = f13, 32
+	cmp.lt	p6, p0 = 96, in3
+	cmp.lt	p7, p0 = 112, in3
+	;;
+(p6)	stfe	[in1] = f14
+(p7)	stfe	[r16] = f15
+	br.ret.sptk.many b0
+	;;
+
+        .endp ffi_call_unix
+
+        .align 16
+        .global ffi_closure_unix
+        .proc ffi_closure_unix
+
+#define FRAME_SIZE	(8*16 + 8*8 + 8*16)
+
+ffi_closure_unix:
+	.prologue
+	.save	ar.pfs, r40 // loc0
+	alloc   loc0 = ar.pfs, 8, 4, 4, 0
+	.fframe	FRAME_SIZE
+	add	r12 = -FRAME_SIZE, r12
+	.save	rp, loc1
+	mov	loc1 = b0
+	.save	ar.unat, loc2
+	mov	loc2 = ar.unat
+	.body
+
+	/* Retrieve closure pointer and real gp.  */
+	mov	out0 = gp
+	add	gp = 16, gp
 	;;
-	st8	[loc3]=r10
-copy2:
-	add	loc3=8,raddr
+	ld8	gp = [gp]
+
+	/* Spill all of the possible argument registers.  */
+	add	r16 = 16 + 8*16, sp
+	add	r17 = 16 + 8*16 + 16, sp
+	;;
+	stf.spill [r16] = f8, 32
+	stf.spill [r17] = f9, 32
+	mov	loc3 = gp
+	;;
+	stf.spill [r16] = f10, 32
+	stf.spill [r17] = f11, 32
+	;;
+	stf.spill [r16] = f12, 32
+	stf.spill [r17] = f13, 32
+	;;
+	stf.spill [r16] = f14, 32
+	stf.spill [r17] = f15, 24
+	;;
+	.mem.offset 0, 0
+	st8.spill [r16] = in0, 16
+	.mem.offset 8, 0
+	st8.spill [r17] = in1, 16
+	add	out1 = 16 + 8*16, sp
+	;;
+	.mem.offset 0, 0
+	st8.spill [r16] = in2, 16
+	.mem.offset 8, 0
+	st8.spill [r17] = in3, 16
+	add	out2 = 16, sp
+	;;
+	.mem.offset 0, 0
+	st8.spill [r16] = in4, 16
+	.mem.offset 8, 0
+	st8.spill [r17] = in5, 16
+	mov	out3 = r8
+	;;
+	.mem.offset 0, 0
+	st8.spill [r16] = in6
+	.mem.offset 8, 0
+	st8.spill [r17] = in7
+
+	/* Invoke ffi_closure_unix_inner for the hard work.  */
+	br.call.sptk.many b0 = ffi_closure_unix_inner
 	;;
-	st8	[loc3]=r9
-copy1:
-	st8	[raddr]=r8
-	/* In the big struct case, raddr was passed as an argument.	*/
-	/* In the void case there was nothing to do.			*/
 
-done:
-	mov	r1=loc4		/* Restore gp	*/
+	/* Dispatch to handle return value.  */
+	mov	gp = loc3
+	zxt1	r16 = r8
+	;;
+	addl	r18 = @ltoffx(.Lld_table), gp
 	mov	ar.pfs = loc0
+	;;
+	ld8.mov	r18 = [r18], .Lld_table
 	mov	b0 = loc1
+	;;
+	shladd	r18 = r16, 3, r18
+	mov	ar.unat = loc2
+	;;
+	ld8	r17 = [r18]
+	shr	r8 = r8, 8
+	;;
+	add	r17 = r17, r18
+	add	r16 = 16, sp
+	;;
+	mov	b6 = r17
+	br	b6
+	;;
+	.label_state 1
+
+.Lld_void:	// no return value; just pop the FRAME_SIZE frame and return
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_int8:	// r16 = sp + 16, the buffer address passed to the inner function in out2
+	.body
+	.copy_state 1
+	ld1	r8 = [r16]	// NOTE(review): shared by UINT8 and SINT8; presumably the buffer holds a pre-extended value - confirm
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_int16:	// load a 2-byte result into r8 (shared by UINT16 and SINT16)
+	.body
+	.copy_state 1
+	ld2	r8 = [r16]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_int32:	// load a 4-byte result into r8 (shared by INT, UINT32 and SINT32)
+	.body
+	.copy_state 1
+	ld4	r8 = [r16]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_int64:	// load an 8-byte result into r8 (also the table target for pointers)
+	.body
+	.copy_state 1
+	ld8	r8 = [r16]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_float:	// load a single-precision result into f8
+	.body
+	.copy_state 1
+	ldfs	f8 = [r16]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
+.Lld_double:
+	.body
+	.copy_state 1
+	ldfd	f8 = [r16]
 	.restore sp
-	mov	sp = loc5
+	add	sp = FRAME_SIZE, sp
 	br.ret.sptk.many b0
+	;;
+.Lld_ldouble:
+	.body
+	.copy_state 1
+	ldfe	f8 = [r16]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
 
-handle_double_hfa:
+.Lld_small_struct:
 	.body
 	.copy_state 1
-	/* Homogeneous floating point array of doubles is returned in	*/
-	/* registers f8-f15.  Save one at a time to return area.	*/
-	and	flags=0xf,flags	/* Retrieve size	*/
-	;;
-	cmp.eq	p6,p0=2,flags
-	cmp.eq	p7,p0=3,flags
-	cmp.eq	p8,p0=4,flags
-	cmp.eq	p9,p0=5,flags
-	cmp.eq	p10,p0=6,flags
-	cmp.eq	p11,p0=7,flags
-	cmp.eq	p12,p0=8,flags
-	;;
-(p6)	br.cond.dptk.few	dhfa2
-(p7)	br.cond.dptk.few	dhfa3
-(p8)	br.cond.dptk.few	dhfa4
-(p9)	br.cond.dptk.few	dhfa5
-(p10)	br.cond.dptk.few	dhfa6
-(p11)	br.cond.dptk.few	dhfa7
-dhfa8:	add 	loc3=7*8,raddr
-	;;
-	stfd	[loc3]=f15
-dhfa7:	add 	loc3=6*8,raddr
-	;;
-	stfd	[loc3]=f14
-dhfa6:	add 	loc3=5*8,raddr
-	;;
-	stfd	[loc3]=f13
-dhfa5:	add 	loc3=4*8,raddr
-	;;
-	stfd	[loc3]=f12
-dhfa4:	add 	loc3=3*8,raddr
-	;;
-	stfd	[loc3]=f11
-dhfa3:	add 	loc3=2*8,raddr
-	;;
-	stfd	[loc3]=f10
-dhfa2:	add 	loc3=1*8,raddr
-	;;
-	stfd	[loc3]=f9
-	stfd	[raddr]=f8
-	br	done
-
-handle_float_hfa:
-	/* Homogeneous floating point array of floats is returned in	*/
-	/* registers f8-f15.  Save one at a time to return area.	*/
-	and	flags=0xf,flags	/* Retrieve size	*/
-	;;
-	cmp.eq	p6,p0=2,flags
-	cmp.eq	p7,p0=3,flags
-	cmp.eq	p8,p0=4,flags
-	cmp.eq	p9,p0=5,flags
-	cmp.eq	p10,p0=6,flags
-	cmp.eq	p11,p0=7,flags
-	cmp.eq	p12,p0=8,flags
-	;;
-(p6)	br.cond.dptk.few	shfa2
-(p7)	br.cond.dptk.few	shfa3
-(p8)	br.cond.dptk.few	shfa4
-(p9)	br.cond.dptk.few	shfa5
-(p10)	br.cond.dptk.few	shfa6
-(p11)	br.cond.dptk.few	shfa7
-shfa8:	add 	loc3=7*4,raddr
-	;;
-	stfd	[loc3]=f15
-shfa7:	add 	loc3=6*4,raddr
-	;;
-	stfd	[loc3]=f14
-shfa6:	add 	loc3=5*4,raddr
-	;;
-	stfd	[loc3]=f13
-shfa5:	add 	loc3=4*4,raddr
-	;;
-	stfd	[loc3]=f12
-shfa4:	add 	loc3=3*4,raddr
-	;;
-	stfd	[loc3]=f11
-shfa3:	add 	loc3=2*4,raddr
-	;;
-	stfd	[loc3]=f10
-shfa2:	add 	loc3=1*4,raddr
-	;;
-	stfd	[loc3]=f9
-	stfd	[raddr]=f8
-	br	done
+	add	r17 = 8, r16
+	cmp.lt	p6, p0 = 8, r8
+	cmp.lt	p7, p0 = 16, r8
+	cmp.lt	p8, p0 = 24, r8
+	;;
+	ld8	r8 = [r16], 16
+(p6)	ld8	r9 = [r17], 16
+	;;
+(p7)	ld8	r10 = [r16]
+(p8)	ld8	r11 = [r17]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
 
-        .endp ffi_call_unix
+.Lld_hfa_float:
+	.body
+	.copy_state 1
+	add	r17 = 4, r16
+	cmp.lt	p6, p0 = 4, r8
+	;;
+	ldfs	f8 = [r16], 8
+(p6)	ldfs	f9 = [r17], 8
+	cmp.lt	p7, p0 = 8, r8
+	cmp.lt	p8, p0 = 12, r8
+	;;
+(p7)	ldfs	f10 = [r16], 8
+(p8)	ldfs	f11 = [r17], 8
+	cmp.lt	p9, p0 = 16, r8
+	cmp.lt	p10, p0 = 20, r8
+	;;
+(p9)	ldfs	f12 = [r16], 8
+(p10)	ldfs	f13 = [r17], 8
+	cmp.lt	p6, p0 = 24, r8
+	cmp.lt	p7, p0 = 28, r8
+	;;
+(p6)	ldfs	f14 = [r16]
+(p7)	ldfs	f15 = [r17]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
 
+.Lld_hfa_double:
+	.body
+	.copy_state 1
+	add	r17 = 8, r16
+	cmp.lt	p6, p0 = 8, r8
+	;;
+	ldfd	f8 = [r16], 16
+(p6)	ldfd	f9 = [r17], 16
+	cmp.lt	p7, p0 = 16, r8
+	cmp.lt	p8, p0 = 24, r8
+	;;
+(p7)	ldfd	f10 = [r16], 16
+(p8)	ldfd	f11 = [r17], 16
+	cmp.lt	p9, p0 = 32, r8
+	cmp.lt	p10, p0 = 40, r8
+	;;
+(p9)	ldfd	f12 = [r16], 16
+(p10)	ldfd	f13 = [r17], 16
+	cmp.lt	p6, p0 = 48, r8
+	cmp.lt	p7, p0 = 56, r8
+	;;
+(p6)	ldfd	f14 = [r16]
+(p7)	ldfd	f15 = [r17]
+	.restore sp
+	add	sp = FRAME_SIZE, sp
+	br.ret.sptk.many b0
+	;;
 
-	.pred.safe_across_calls p1-p5,p16-p63
-.text
-        .align 16
-        .global ffi_closure_UNIX
-        .proc ffi_closure_UNIX
-ffi_closure_UNIX:
-	.prologue
-	.save 	ar.pfs,r40 /* loc0 */
-	alloc   loc0=ar.pfs,8,3,2,0
-	.save	rp,loc1
-	mov	loc1=b0
-	.vframe	loc2
-	mov	loc2=sp
-	/* Retrieve closure pointer and real gp.	*/
-	mov	out0=gp
-	add	gp=16,gp
-	;;
-	ld8	gp=[gp]
-	/* Reserve a structia64_args on the stack such that arguments	*/
-	/* past the first 8 are automatically placed in the right	*/
-	/* slot.  Note that when we start the sp points at 2 8-byte	*/
-	/* scratch words, followed by the extra arguments.		*/
-#	define BASIC_ARGS_SZ (8*FLOAT_SZ+8*8+2*8)
-#	define FIRST_FP_OFFSET (4*8)
-	add	r14=-(BASIC_ARGS_SZ-FIRST_FP_OFFSET),sp
-	add	r15=-(BASIC_ARGS_SZ-FIRST_FP_OFFSET-FLOAT_SZ),sp
-	add	sp=-BASIC_ARGS_SZ,sp
-	/* r14 points to fp_regs[0], r15 points to fp_regs[1]	*/
-	;;
-	stfd	[r14]=f8,2*FLOAT_SZ
-	stfd	[r15]=f9,2*FLOAT_SZ
-	;;
-	stfd	[r14]=f10,2*FLOAT_SZ
-	stfd	[r15]=f11,2*FLOAT_SZ
-	;;
-	stfd	[r14]=f12,2*FLOAT_SZ
-	stfd	[r15]=f13,2*FLOAT_SZ
-	;;
-	stfd	[r14]=f14,2*FLOAT_SZ
-	stfd	[r15]=f15,FLOAT_SZ+8
-	;;
-	/* r14 points to first parameter register area, r15 to second. */
-	st8	[r14]=in0,2*8
-	st8	[r15]=in1,2*8
-	;;
-	st8	[r14]=in2,2*8
-	st8	[r15]=in3,2*8
-	;;
-	st8	[r14]=in4,2*8
-	st8	[r15]=in5,2*8
-	;;
-	st8	[r14]=in6,2*8
-	st8	[r15]=in7,2*8
-	/* Call ffi_closure_UNIX_inner */
-	mov	out1=sp
-	br.call.sptk.many b0=ffi_closure_UNIX_inner
-	;;
-	mov	b0=loc1
-	mov 	ar.pfs=loc0
+.Lld_hfa_ldouble:
+	.body
+	.copy_state 1
+	add	r17 = 16, r16
+	cmp.lt	p6, p0 = 16, r8
+	;;
+	ldfe	f8 = [r16], 32
+(p6)	ldfe	f9 = [r17], 32
+	cmp.lt	p7, p0 = 32, r8
+	cmp.lt	p8, p0 = 48, r8
+	;;
+(p7)	ldfe	f10 = [r16], 32
+(p8)	ldfe	f11 = [r17], 32
+	cmp.lt	p9, p0 = 64, r8
+	cmp.lt	p10, p0 = 80, r8
+	;;
+(p9)	ldfe	f12 = [r16], 32
+(p10)	ldfe	f13 = [r17], 32
+	cmp.lt	p6, p0 = 96, r8
+	cmp.lt	p7, p0 = 112, r8
+	;;
+(p6)	ldfe	f14 = [r16]
+(p7)	ldfe	f15 = [r17]
 	.restore sp
-	mov	sp=loc2
+	add	sp = FRAME_SIZE, sp
 	br.ret.sptk.many b0
-	.endp ffi_closure_UNIX
-	
+	;;
+
+	.endp	ffi_closure_unix
+
+	.section .rodata
+	.align	8
+.Lst_table:	// ffi_call_unix return dispatch: indexed by low byte of flags; each entry is an offset relative to its own slot
+	data8	@pcrel(.Lst_void)		// FFI_TYPE_VOID
+	data8	@pcrel(.Lst_sint32)		// FFI_TYPE_INT
+	data8	@pcrel(.Lst_float)		// FFI_TYPE_FLOAT
+	data8	@pcrel(.Lst_double)		// FFI_TYPE_DOUBLE
+	data8	@pcrel(.Lst_ldouble)		// FFI_TYPE_LONGDOUBLE
+	data8	@pcrel(.Lst_uint8)		// FFI_TYPE_UINT8
+	data8	@pcrel(.Lst_sint8)		// FFI_TYPE_SINT8
+	data8	@pcrel(.Lst_uint16)		// FFI_TYPE_UINT16
+	data8	@pcrel(.Lst_sint16)		// FFI_TYPE_SINT16
+	data8	@pcrel(.Lst_uint32)		// FFI_TYPE_UINT32
+	data8	@pcrel(.Lst_sint32)		// FFI_TYPE_SINT32
+	data8	@pcrel(.Lst_int64)		// FFI_TYPE_UINT64
+	data8	@pcrel(.Lst_int64)		// FFI_TYPE_SINT64
+	data8	@pcrel(.Lst_void)		// FFI_TYPE_STRUCT (nothing stored here after the call)
+	data8	@pcrel(.Lst_int64)		// FFI_TYPE_POINTER
+	data8 	@pcrel(.Lst_small_struct)	// FFI_IA64_TYPE_SMALL_STRUCT
+	data8	@pcrel(.Lst_hfa_float)		// FFI_IA64_TYPE_HFA_FLOAT
+	data8	@pcrel(.Lst_hfa_double)		// FFI_IA64_TYPE_HFA_DOUBLE
+	data8	@pcrel(.Lst_hfa_ldouble)	// FFI_IA64_TYPE_HFA_LDOUBLE
 
+.Lld_table:	// ffi_closure_unix return dispatch: indexed by low byte of the inner function's return value; self-relative entries
+	data8	@pcrel(.Lld_void)		// FFI_TYPE_VOID
+	data8	@pcrel(.Lld_int32)		// FFI_TYPE_INT
+	data8	@pcrel(.Lld_float)		// FFI_TYPE_FLOAT
+	data8	@pcrel(.Lld_double)		// FFI_TYPE_DOUBLE
+	data8	@pcrel(.Lld_ldouble)		// FFI_TYPE_LONGDOUBLE
+	data8	@pcrel(.Lld_int8)		// FFI_TYPE_UINT8
+	data8	@pcrel(.Lld_int8)		// FFI_TYPE_SINT8
+	data8	@pcrel(.Lld_int16)		// FFI_TYPE_UINT16
+	data8	@pcrel(.Lld_int16)		// FFI_TYPE_SINT16
+	data8	@pcrel(.Lld_int32)		// FFI_TYPE_UINT32
+	data8	@pcrel(.Lld_int32)		// FFI_TYPE_SINT32
+	data8	@pcrel(.Lld_int64)		// FFI_TYPE_UINT64
+	data8	@pcrel(.Lld_int64)		// FFI_TYPE_SINT64
+	data8	@pcrel(.Lld_void)		// FFI_TYPE_STRUCT (nothing loaded into registers)
+	data8	@pcrel(.Lld_int64)		// FFI_TYPE_POINTER
+	data8 	@pcrel(.Lld_small_struct)	// FFI_IA64_TYPE_SMALL_STRUCT
+	data8	@pcrel(.Lld_hfa_float)		// FFI_IA64_TYPE_HFA_FLOAT
+	data8	@pcrel(.Lld_hfa_double)		// FFI_IA64_TYPE_HFA_DOUBLE
+	data8	@pcrel(.Lld_hfa_ldouble)	// FFI_IA64_TYPE_HFA_LDOUBLE
diff --git a/libffi/src/types.c b/libffi/src/types.c
index 47ee65637e6..06c9e6939c9 100644
--- a/libffi/src/types.c
+++ b/libffi/src/types.c
@@ -42,23 +42,9 @@ FFI_INTEGRAL_TYPEDEF(uint32, 4, 4, FFI_TYPE_UINT32);
 FFI_INTEGRAL_TYPEDEF(sint32, 4, 4, FFI_TYPE_SINT32);
 FFI_INTEGRAL_TYPEDEF(float, 4, 4, FFI_TYPE_FLOAT);
 
-#if defined ALPHA || defined SPARC64 || defined X86_64 || defined S390X \
-    || defined IA64 || defined POWERPC64
+FFI_INTEGRAL_TYPEDEF(pointer, sizeof(void*), sizeof(void*), FFI_TYPE_POINTER);
 
-FFI_INTEGRAL_TYPEDEF(pointer, 8, 8, FFI_TYPE_POINTER);
-
-#else
-
-FFI_INTEGRAL_TYPEDEF(pointer, 4, 4, FFI_TYPE_POINTER);
-
-#endif
-
-#if defined X86 || defined ARM || defined M68K
-
-FFI_INTEGRAL_TYPEDEF(uint64, 8, 4, FFI_TYPE_UINT64);
-FFI_INTEGRAL_TYPEDEF(sint64, 8, 4, FFI_TYPE_SINT64);
-
-#elif defined SH
+#if defined X86 || defined ARM || defined M68K || defined SH
 
 FFI_INTEGRAL_TYPEDEF(uint64, 8, 4, FFI_TYPE_UINT64);
 FFI_INTEGRAL_TYPEDEF(sint64, 8, 4, FFI_TYPE_SINT64);
@@ -99,7 +85,7 @@ FFI_INTEGRAL_TYPEDEF(longdouble, 16, 16, FFI_TYPE_LONGDOUBLE);
 FFI_INTEGRAL_TYPEDEF(longdouble, 16, 8, FFI_TYPE_LONGDOUBLE);
 #endif
 
-#elif defined X86_64 || defined POWERPC64
+#elif defined X86_64 || defined POWERPC64 || defined IA64
 
 FFI_INTEGRAL_TYPEDEF(double, 8, 8, FFI_TYPE_DOUBLE);
 FFI_INTEGRAL_TYPEDEF(longdouble, 16, 16, FFI_TYPE_LONGDOUBLE);
@@ -110,4 +96,3 @@ FFI_INTEGRAL_TYPEDEF(double, 8, 8, FFI_TYPE_DOUBLE);
 FFI_INTEGRAL_TYPEDEF(longdouble, 8, 8, FFI_TYPE_LONGDOUBLE);
 
 #endif
-
-- 
cgit v1.2.3