1 files changed, 623 insertions, 0 deletions
diff --git a/libgupc/collectives/upc_coll_prefix_reduce.in b/libgupc/collectives/upc_coll_prefix_reduce.in
new file mode 100644
index 00000000000..82903d96f5d
--- /dev/null
+++ b/libgupc/collectives/upc_coll_prefix_reduce.in
@@ -0,0 +1,623 @@
+/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Copyright (c) 2004, Michigan Technological University                    */
+/*  All rights reserved.                                                     */
+/*                                                                           */
+/*  Redistribution and use in source and binary forms, with or without       */
+/*  modification, are permitted provided that the following conditions       */
+/*  are met:                                                                 */
+/*                                                                           */
+/*  * Redistributions of source code must retain the above copyright         */
+/*  notice, this list of conditions and the following disclaimer.            */
+/*  * Redistributions in binary form must reproduce the above                */
+/*  copyright notice, this list of conditions and the following              */
+/*  disclaimer in the documentation and/or other materials provided          */
+/*  with the distribution.                                                   */
+/*  * Neither the name of the Michigan Technological University              */
+/*  nor the names of its contributors may be used to endorse or promote      */
+/*  products derived from this software without specific prior written       */
+/*  permission.                                                              */
+/*                                                                           */
+/*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      */
+/*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        */
+/*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A  */
+/*  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER */
+/*  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
+/*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      */
+/*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       */
+/*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   */
+/*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     */
+/*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       */
+/*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             */
+/*                                                                           */
+/*****************************************************************************/
+
+#include <upc.h>
+#include <upc_collective.h>
+#include <upc_coll.h>
+
+/*****************************************************************************/
+/*                                                                           */
+/*        UPC collective function library, reference implementation          */
+/*                                                                           */
+/*   Steve Seidel, Dept. of Computer Science, Michigan Technological Univ.   */
+/*   steve@mtu.edu                                        March 1, 2004      */
+/*                                                                           */
+/*****************************************************************************/
+
+// NOTE: Contrary to the spec, this implementation assumes that the phases
+//       of the src and dst arguments agree.
+
+/* The true set of function names is in upc_all_collectives.c */
+
+PREPROCESS_BEGIN
+void upc_all_prefix_reduce_GENERIC
+(shared void *dst,
+ shared const void *src,
+ upc_op_t op,
+ size_t nelems,
+ size_t blk_size,
+ _UPC_RED_T (*func) (_UPC_RED_T, _UPC_RED_T), upc_flag_t sync_mode)
+{
+  // Inclusive prefix reduction: dst[i] = src[0] op src[1] op ... op src[i].
+/*
+	This prefix reduce algorithm is linear in the number of array elements/THREADS.
+	The prefixes are calculated in a loop that iterates as many times+1
+	as the src array wraps from thread THREADS-1 to thread 0.  The cost of
+	computing prefixes of an array with a small block size (that wraps many
+	times) is likely to be much higher than the cost of computing prefixes
+	of an array of the same size but with block size [*].  This code favors
+	the affinity of the src array.  Thus, if the affinities of the src and dst
+	arrays are different, there will be many off-thread references (i.e. writes,
+	that is, "pushes") to the dst array.
+
+	Each iteration contains two internal barriers and about THREADS off-thread
+	reads and writes in the best case.  The pref pointer switches between the
+	first and second halves of the array so that one thread can be working
+	an iteration ahead (or behind) of another without interference.  Otherwise,
+	an additional barrier would be needed.
+
+	Allocate pref[2*THREADS]
+	Determine the number of times the array "wraps".
+	Compute the offset address and number of local elements for 1st pass.
+	for (w=0; w<=wraps; ++w)
+		Initialize local prefix "sum"
+		Compute local prefixes
+		if (MYTHREAD < THREADS-1)
+			Write rightmost local prefix to pref[MYTHREAD+1]
+		barrier
+		if (MYTHREAD == THREADS-1)
+			if (w>0)
+				pref[0] = last "sum" from previous iteration
+			Compute prefixes in pref[]
+		barrier
+		"Add" pref[MYTHREAD] to each prefix computed at top of loop
+		if (wraps > w)
+			Swap pointer to first or last half of pref[]
+			Increment offset address and compute n_local for next pass
+	barrier
+	free pref[]
+
+	Thread THREADS-1 was chosen to handle pref[] simply to avoid giving
+	thread 0 more work.  "push" and "pull" versions of this collective
+	function are not distinguished.  As it stands, the writes to pref[]
+	are "pushes" and the reads from pref[] are "pulls".  If the affinities
+	of src and dst differ, this function can be regarded as a "push"
+	because the affinity of the src array is favored.
+*/
+
+  int				// constant over all iterations
+    src_thr,			// source thread
+    dst_thr,			// destination thread
+    phase,			// phase of src and dst array must be identical
+    wraps,			// number of times src array properly wraps around
+    // first iteration only
+    leaders,			// number of vacant positions to left of src
+    // modified on each iteration
+    i, j, k,			// addressing indices
+    w,				// main loop index for "wraps"
+    src_offset,			// offset from src of first local element
+    dst_offset,			// offset from dst of first element
+    first_thr,			// first thread that contains local src elements
+    last_thr,			// last thread that contains local src elements
+    row_elems,			// number of src elements in the row processed
+    n_local,			// number of local src elements
+    rem_elems;			// number of elements remaining to be processed
+
+  shared _UPC_RED_T *pref;	// shared array to hold local prefixes
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+  // A blk_size of 0 is treated as one block holding all nelems elements.
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_PRED);
+#endif
+
+  // Synchronize using barriers in the cases of MYSYNC and ALLSYNC.
+
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+
+    upc_barrier;
+
+  // This array is used to share local prefixes.
+
+  pref = upc_all_alloc (2 * THREADS, sizeof (_UPC_RED_T));
+
+  src_thr = upc_threadof ((shared void *) src);
+  phase = upc_phaseof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+
+  // Total number of elements remaining to be processed.
+
+  rem_elems = nelems;
+
+  // Determine offsets in all threads as if there really are elements in all
+  // threads.  Addresses will be src + offset.  (If MYTHREAD < src_thr, MYTHREAD
+  // has no elements for this iteration but it might have elements for next
+  // iteration.)  Note: offset is sometimes negative because src is addressed
+  // here as if its block size is 1.  Similar comments apply to dst.
+
+  if (MYTHREAD != src_thr)
+    {
+      src_offset = MYTHREAD - src_thr - phase * THREADS;
+      dst_offset = src_offset;
+
+      // The following arithmetic is undocumentable.
+      if (MYTHREAD >= THREADS + src_thr - dst_thr)
+	dst_offset += (blk_size - 1) * THREADS;
+      if (MYTHREAD < src_thr - dst_thr)
+	dst_offset -= (blk_size - 1) * THREADS;
+    }
+  else
+    {
+      src_offset = 0;
+      dst_offset = 0;
+    }
+
+  // first_thr .. last_thr is range of threads that contains src elements of current row
+
+  first_thr = src_thr;
+
+  // Compute n_local, the number of src elements local to this thread,
+
+  if (blk_size == 0 || phase + nelems <= blk_size)
+    {				// All elements are on the src_thr.
+
+      leaders = 0;		// (Not needed here.  Initialize for debugging output.)
+      row_elems = nelems;
+      wraps = 0;
+
+      if (MYTHREAD == src_thr)
+	n_local = nelems;
+      else
+	n_local = 0;
+      last_thr = src_thr;
+    }
+  else				// At least two threads contain elements.
+    {
+      // Determine how many elements are in the first row.
+      leaders = src_thr * blk_size + phase;
+      if ((leaders + nelems) / (blk_size * THREADS) > 0)	//first row is "full"
+	row_elems = blk_size * THREADS - leaders;
+      else
+	row_elems = nelems;
+
+      // Determine how many rows wrap back around to thread 0.
+
+      wraps = (leaders + nelems - 1) / (blk_size * THREADS);
+
+      // Assume most likely situation; modify if necessary
+
+      last_thr = THREADS - 1;
+      n_local = blk_size;
+
+      if (MYTHREAD == src_thr)
+	n_local = blk_size - phase;
+      if (MYTHREAD < src_thr)
+	n_local = 0;
+      if (leaders + nelems < blk_size * THREADS)
+	{
+	  // There are not enough elements to fill the
+	  // end of the first row.  Assert: wraps = 0
+	  last_thr = (leaders + nelems - 1) / blk_size;
+	  if ((MYTHREAD == last_thr) && (last_thr > src_thr))
+	    n_local = (leaders + nelems) - (MYTHREAD * blk_size);
+	  else if (MYTHREAD > last_thr)
+	    n_local = 0;
+	}
+    }
+
+
+//////// Main loop.
+
+  for (w = 0; w <= wraps; ++w)
+    {
+
+      if (n_local > 0)
+	{
+	  // Initialize first element.
+
+	  *((shared _UPC_RED_T *) dst + dst_offset)
+	    = *((shared const _UPC_RED_T *) src + src_offset);
+
+	  // Compute local prefixes.
+
+	  switch (op)
+	    {
+	    case UPC_ADD:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    + *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_MULT:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    * *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+#ifndef _UPC_NONINT_T
+	      // Skip if not integral type, per spec 4.3.1.1
+	      // (See additional comments in upc_collective.c)
+	    case UPC_AND:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    & *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_OR:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    | *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_XOR:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    ^ *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+#endif // _UPC_NONINT_T
+	    case UPC_LOGAND:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    && *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_LOGOR:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = *((shared const _UPC_RED_T *) dst + i)
+		    || *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_MIN:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  if (*((shared const _UPC_RED_T *) dst + i)
+		      < *((shared const _UPC_RED_T *) src + k))
+		    *((shared _UPC_RED_T *) dst + j)
+		      = *((shared const _UPC_RED_T *) dst + i);
+		  else
+		    *((shared _UPC_RED_T *) dst + j)
+		      = *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_MAX:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  if (*((shared const _UPC_RED_T *) dst + i)
+		      > *((shared const _UPC_RED_T *) src + k))
+		    *((shared _UPC_RED_T *) dst + j)
+		      = *((shared const _UPC_RED_T *) dst + i);
+		  else
+		    *((shared _UPC_RED_T *) dst + j)
+		      = *((shared const _UPC_RED_T *) src + k);
+		}
+	      break;
+	    case UPC_FUNC:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = func (*((shared const _UPC_RED_T *) dst + i),
+			    *((shared const _UPC_RED_T *) src + k));
+		}
+	      break;
+	    case UPC_NONCOMM_FUNC:
+	      for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS;
+		   k < (src_offset + n_local * THREADS);
+		   i = j, j += THREADS, k += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + j)
+		    = func (*((shared const _UPC_RED_T *) dst + i),
+			    *((shared const _UPC_RED_T *) src + k));
+		}
+	      break;
+	    }
+
+	  if (MYTHREAD < THREADS - 1)
+	    // Write last prefix to shared array.  (This is spurious,
+	    // e.g., sometimes when n_local < blk_size.)
+
+	    pref[MYTHREAD + 1] = *((shared const _UPC_RED_T *) dst
+				   + dst_offset + (n_local - 1) * THREADS);
+	}
+
+      upc_barrier;
+
+      // Now thread THREADS-1 computes prefixes of pref[first_thr..last_thr]
+      // even if it doesn't contain any elements itself.
+
+      if (MYTHREAD == THREADS - 1)
+	{
+	  if (w > 0)		// (first_thr is always 0 in this case)
+
+	    // Write the last prefix computed on the *previous* iteration.
+
+	    pref[0] =
+	      *((shared const _UPC_RED_T *) dst + dst_offset - THREADS);
+
+	  else			// On the first iteration the source thread has no left neighbor.
+
+	    ++first_thr;
+
+	  // Compute prefixes in pref[first_thr..last_thr].
+
+	  switch (op)
+	    {
+	    case UPC_ADD:
+	      {
+		for (i = first_thr + 1; i <= last_thr; ++i)
+		  pref[i] += pref[i - 1];
+	      }
+	      break;
+	    case UPC_MULT:
+	      {
+		for (i = first_thr + 1; i <= last_thr; ++i)
+		  pref[i] *= pref[i - 1];
+	      }
+	      break;
+#ifndef _UPC_NONINT_T
+	      // Skip if not integral type, per spec 4.3.1.1
+	      // (See additional comments in upc_collective.c)
+	    case UPC_AND:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] &= pref[i - 1];
+	      break;
+	    case UPC_OR:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] |= pref[i - 1];
+	      break;
+	    case UPC_XOR:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] ^= pref[i - 1];
+	      break;
+#endif // _UPC_NONINT_T
+	    case UPC_LOGAND:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] = pref[i - 1] && pref[i];
+	      break;
+	    case UPC_LOGOR:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] = pref[i - 1] || pref[i];
+	      break;
+	    case UPC_MIN:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		if (pref[i - 1] < pref[i])
+		  pref[i] = pref[i - 1];
+	      break;
+	    case UPC_MAX:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		if (pref[i - 1] > pref[i])
+		  pref[i] = pref[i - 1];
+	      break;
+	    case UPC_FUNC:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] = func (pref[i - 1], pref[i]);
+	      break;
+	    case UPC_NONCOMM_FUNC:
+	      for (i = first_thr + 1; i <= last_thr; ++i)
+		pref[i] = func (pref[i - 1], pref[i]);
+	      break;
+	    }
+	}			// Thread THREADS-1 is done doing its special job.
+
+      upc_barrier;
+
+      if ((n_local > 0) && ((w > 0) || (MYTHREAD > src_thr)))
+	{
+	  // "Add" pref[MYTHREAD] to local prefixes.  (On the first
+	  // iteration, only threads beyond src_thr should do this.)
+
+	  switch (op)
+	    {
+	    case UPC_ADD:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + i) += pref[MYTHREAD];
+		}
+	      break;
+	    case UPC_MULT:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		{
+		  *((shared _UPC_RED_T *) dst + i) *= pref[MYTHREAD];
+		}
+	      break;
+#ifndef _UPC_NONINT_T
+	      // Skip if not integral type, per spec 4.3.1.1
+	      // (See additional comments in upc_collective.c)
+	    case UPC_AND:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) &= pref[MYTHREAD];
+	      break;
+	    case UPC_OR:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) |= pref[MYTHREAD];
+	      break;
+	    case UPC_XOR:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) ^= pref[MYTHREAD];
+	      break;
+#endif // _UPC_NONINT_T
+	    case UPC_LOGAND:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) =
+		  *((shared _UPC_RED_T *) dst + i) && pref[MYTHREAD];
+	      break;
+	    case UPC_LOGOR:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) =
+		  *((shared _UPC_RED_T *) dst + i) || pref[MYTHREAD];
+	      break;
+	    case UPC_MIN:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		if (pref[MYTHREAD] < *((shared _UPC_RED_T *) dst + i))
+		  *((shared _UPC_RED_T *) dst + i) = pref[MYTHREAD];
+	      break;
+	    case UPC_MAX:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		if (pref[MYTHREAD] > *((shared _UPC_RED_T *) dst + i))
+		  *((shared _UPC_RED_T *) dst + i) = pref[MYTHREAD];
+	      break;
+	    case UPC_FUNC:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) =
+		  func (pref[MYTHREAD], *((shared _UPC_RED_T *) dst + i));
+	      break;
+	    case UPC_NONCOMM_FUNC:
+	      for (i = dst_offset; i < (dst_offset + n_local * THREADS);
+		   i += THREADS)
+		*((shared _UPC_RED_T *) dst + i) =
+		  func (pref[MYTHREAD], *((shared _UPC_RED_T *) dst + i));
+	      break;
+	    }
+	}
+
+      if (wraps > w)		// Set up for the next iteration.
+	{
+	  // swap between two halves of pref array
+	  if (w % 2 == 0)
+	    pref = pref + THREADS;
+	  else
+	    pref = pref - THREADS;
+
+	  rem_elems -= row_elems;
+
+	  if (rem_elems / (blk_size * THREADS) > 0)	// next row is "full"
+	    row_elems = blk_size * THREADS;
+	  else
+	    row_elems = rem_elems;
+
+	  first_thr = 0;
+	  last_thr = (row_elems - 1) / blk_size;
+
+	  n_local = blk_size;
+	  if ((MYTHREAD == last_thr) && (row_elems % blk_size > 0))
+	    n_local = row_elems % blk_size;
+	  else if (MYTHREAD > last_thr)
+	    n_local = 0;
+
+	  // Handle phase > 0 on first iteration
+
+	  if ((w == 0) && (MYTHREAD == src_thr))
+	    {
+	      src_offset -= phase * THREADS;
+	      dst_offset -= phase * THREADS;
+	    }
+
+	  // All the preceding work makes this easy:
+
+	  src_offset += blk_size * THREADS;
+	  dst_offset += blk_size * THREADS;
+	}
+
+    }				// End of main "wrap" loop
+
+  // Exit synchronization.  A barrier is needed here for every sync_mode,
+  // so it is executed unconditionally:
+  //   - when UPC_OUT_MYSYNC is set or UPC_OUT_NOSYNC is clear, it is the
+  //     synchronization the caller requested;
+  //   - otherwise every thread must still have finished reading pref[]
+  //     before thread THREADS-1 frees it below.
+
+  upc_barrier;
+
+  if (MYTHREAD == THREADS - 1)
+    {
+      if (w % 2 == 0)
+	pref -= THREADS;	/* DOB: be sure we free the original pointer! */
+      upc_free (pref);
+    }
+}