diff options
Diffstat (limited to 'libgupc/collectives/upc_coll_prefix_reduce.in')
-rw-r--r-- | libgupc/collectives/upc_coll_prefix_reduce.in | 623 |
1 files changed, 623 insertions, 0 deletions
diff --git a/libgupc/collectives/upc_coll_prefix_reduce.in b/libgupc/collectives/upc_coll_prefix_reduce.in new file mode 100644 index 00000000000..82903d96f5d --- /dev/null +++ b/libgupc/collectives/upc_coll_prefix_reduce.in @@ -0,0 +1,623 @@ +/* Copyright (C) 2012-2016 Free Software Foundation, Inc. + This file is part of the UPC runtime library. + Written by Gary Funck <gary@intrepid.com> + and Nenad Vukicevic <nenad@intrepid.com> + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/*****************************************************************************/ +/* */ +/* Copyright (c) 2004, Michigan Technological University */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or without */ +/* modification, are permitted provided that the following conditions */ +/* are met: */ +/* */ +/* * Redistributions of source code must retain the above copyright */ +/* notice, this list of conditions and the following disclaimer. */ +/* * Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials provided */ +/* with the distribution. */ +/* * Neither the name of the Michigan Technological University */ +/* nor the names of its contributors may be used to endorse or promote */ +/* products derived from this software without specific prior written */ +/* permission. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */ +/* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */ +/* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A */ +/* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER */ +/* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */ +/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, */ +/* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR */ +/* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING */ +/* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS */ +/* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*****************************************************************************/ + +#include <upc.h> +#include <upc_collective.h> +#include <upc_coll.h> + +/*****************************************************************************/ +/* */ +/* UPC collective function library, reference implementation */ +/* */ +/* Steve Seidel, Dept. of Computer Science, Michigan Technological Univ. */ +/* steve@mtu.edu March 1, 2004 */ +/* */ +/*****************************************************************************/ + +// NOTE: Contrary to the spec, this implementation assumes that the phases +// of the src and dst arguments agree. + +/* The true set of function names is in upc_all_collectives.c */ + +PREPROCESS_BEGIN +void upc_all_prefix_reduce_GENERIC +(shared void *dst, + shared const void *src, + upc_op_t op, + size_t nelems, + size_t blk_size, + _UPC_RED_T (*func) (_UPC_RED_T, _UPC_RED_T), upc_flag_t sync_mode) +{ + +/* + This prefix reduce algorithm is linear in the number of array elements/THREADS. + The prefixes are calculated in a loop that iterates as many times+1 + as the src array wraps from thread THREADS-1 to thread 0. The cost of + computing prefixes of an array with a small block size (that wraps many + times) is likely to be much higher than the cost of computing prefixes + of an array of the same size but with block size [*]. This code favors + the affinity of the src array. Thus, if the affinities of the src and dst + arrays are different, there will many off-thread references (i.e. writes, + that is, "pushes") to the dst array. + + Each iteration contains two internal barriers and about THREADS off-thread + reads and writes in the best case. The pref pointer switches between the + first and second halves of the array so that one thread can be working + an iteration ahead (or behind) of another without interference. Otherwise, + an additional barrier would be needed. + + Allocate pref[2*THREADS] + Determine the number of times the array "wraps". + Compute the offset address and number of local elements for 1st pass. + for (w=0; w<=wraps; ++w) + Initialize local prefix "sum" + Compute local prefixes + if (MYTHREAD < THREADS-1) + Write rightmost local prefix to pref[MYTHREAD+1] + barrier + if (MYTHREAD == THREADS-1) + if (w>0) + pref[0] = last "sum" from previous iteration + Compute prefixes in pref[] + barrier + "Add" pref[MYTHREAD] to each prefix computed at top of loop + if (wraps > w) + Swap pointer to first or last half of pref[] + Increment offset address and compute n_local for next pass + barrier + free pref[] + + Thread THREADS-1 was chosen to handle pref[] simply to avoid giving + thread 0 more work. "push" and "pull" versions of this collective + function are not distinguished. As it stands, the writes to pref[] + are "pushes" and the reads from pref[] are "pulls". If the affinities + of src and dst differ, this function can be regarded as a "push" + because the affinity of the src array is favored. +*/ + + int // constant over all iterations + src_thr, // source thread + dst_thr, // destination thread + phase, // phase of src and dst array must be identical + wraps, // number of times src array properly wraps around + // first iteration only + leaders, // number of vacant positions to left of src + // modified on each iteration + i, j, k, // addressing indices + w, // main loop index for "wraps" + src_offset, // offset from src of first local element + dst_offset, // offset from dst of first element + first_thr, // first thread that contains local src elements + last_thr, // last thread that contains local src elements + row_elems, // number of src elements in the row processed + n_local, // number of local src elements + rem_elems; // number of elements remaining to be processed + + shared _UPC_RED_T *pref; // shared array to hold local prefixes + + if (!upc_coll_init_flag) + upc_coll_init (); + + if (blk_size == 0) + blk_size = nelems; + +#ifdef _UPC_COLL_CHECK_ARGS + upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_PRED); +#endif + + // Synchronize using barriers in the cases of MYSYNC and ALLSYNC. + + if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode)) + + upc_barrier; + + // This array is used to share local prefixes. + + pref = upc_all_alloc (2 * THREADS, sizeof (_UPC_RED_T)); + + src_thr = upc_threadof ((shared void *) src); + phase = upc_phaseof ((shared void *) src); + dst_thr = upc_threadof ((shared void *) dst); + + // Total number of elements remaining to be processed. + + rem_elems = nelems; + + // Determine offsets in all threads as if there really are elements in all + // threads. Addresses will be src + offset. (If MYTHREAD < src_thr, MYTHREAD + // has no elements for this iteration but it might have elements for next + // iteration.) Note: offset is sometimes negative because src is addressed + // here as if its block size is 1. Similar comments apply to dst. + + if (MYTHREAD != src_thr) + { + src_offset = MYTHREAD - src_thr - phase * THREADS; + dst_offset = src_offset; + + // The following arithmetic is undocumentable. + if (MYTHREAD >= THREADS + src_thr - dst_thr) + dst_offset += (blk_size - 1) * THREADS; + if (MYTHREAD < src_thr - dst_thr) + dst_offset -= (blk_size - 1) * THREADS; + } + else + { + src_offset = 0; + dst_offset = 0; + } + + // first_thr .. last_thr is range of threads that contains src elements of current row + + first_thr = src_thr; + + // Compute n_local, the number of src elements local to this thread, + + if (blk_size == 0 || phase + nelems <= blk_size) + { // All elements are on the src_thr. + + leaders = 0; // (Not needed here. Initialize for debugging output.) + row_elems = nelems; + wraps = 0; + + if (MYTHREAD == src_thr) + n_local = nelems; + else + n_local = 0; + last_thr = src_thr; + } + else // At least two threads contain elements. + { + // Determine how many elements are in the first row. + leaders = src_thr * blk_size + phase; + if ((leaders + nelems) / (blk_size * THREADS) > 0) //first row is "full" + row_elems = blk_size * THREADS - leaders; + else + row_elems = nelems; + + // Determine how many rows wrap back around to thread 0. + + wraps = (leaders + nelems - 1) / (blk_size * THREADS); + + // Assume most likely situation; modify if necessary + + last_thr = THREADS - 1; + n_local = blk_size; + + if (MYTHREAD == src_thr) + n_local = blk_size - phase; + if (MYTHREAD < src_thr) + n_local = 0; + if (leaders + nelems < blk_size * THREADS) + { + // There are not enough elements to fill the + // end of the first row. Assert: wraps = 0 + last_thr = (leaders + nelems - 1) / blk_size; + if ((MYTHREAD == last_thr) && (last_thr > src_thr)) + n_local = (leaders + nelems) - (MYTHREAD * blk_size); + else if (MYTHREAD > last_thr) + n_local = 0; + } + } + + +//////// Main loop. + + for (w = 0; w <= wraps; ++w) + { + + if (n_local > 0) + { + // Initialize first element. + + *((shared _UPC_RED_T *) dst + dst_offset) + = *((shared const _UPC_RED_T *) src + src_offset); + + // Compute local prefixes. + + switch (op) + { + case UPC_ADD: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + + *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_MULT: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + * *((shared const _UPC_RED_T *) src + k); + } + break; +#ifndef _UPC_NONINT_T + // Skip if not integral type, per spec 4.3.1.1 + // (See additional comments in upc_collective.c) + case UPC_AND: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + & *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_OR: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + | *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_XOR: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + ^ *((shared const _UPC_RED_T *) src + k); + } + break; +#endif // _UPC_NOINT_T + case UPC_LOGAND: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + && *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_LOGOR: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i) + || *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_MIN: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + if (*((shared const _UPC_RED_T *) dst + i) + < *((shared const _UPC_RED_T *) src + k)) + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i); + else + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_MAX: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + if (*((shared const _UPC_RED_T *) dst + i) + > *((shared const _UPC_RED_T *) src + k)) + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) dst + i); + else + *((shared _UPC_RED_T *) dst + j) + = *((shared const _UPC_RED_T *) src + k); + } + break; + case UPC_FUNC: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = func (*((shared const _UPC_RED_T *) dst + i), + *((shared const _UPC_RED_T *) src + k)); + } + break; + case UPC_NONCOMM_FUNC: + for (i = dst_offset, j = i + THREADS, k = src_offset + THREADS; + k < (src_offset + n_local * THREADS); + i = j, j += THREADS, k += THREADS) + { + *((shared _UPC_RED_T *) dst + j) + = func (*((shared const _UPC_RED_T *) dst + i), + *((shared const _UPC_RED_T *) src + k)); + } + break; + } + + if (MYTHREAD < THREADS - 1) + // Write last prefix to shared array. (This is spurious, + // e.g., sometimes when n_local < blk_size.) + + pref[MYTHREAD + 1] = *((shared const _UPC_RED_T *) dst + + dst_offset + (n_local - 1) * THREADS); + } + + upc_barrier; + + // Now thread THREADS-1 computes prefixes of pref[first_thr..last_thr] + // even if it doesn't contain any elements itself. + + if (MYTHREAD == THREADS - 1) + { + if (w > 0) // (first_thr is always 0 in this case) + + // Write the last prefix computed on the *previous* iteration. + + pref[0] = + *((shared const _UPC_RED_T *) dst + dst_offset - THREADS); + + else // On the first iteration the source thread has no left neighbor. + + ++first_thr; + + // Compute prefixes in pref[first_thr..last_thr]. + + switch (op) + { + case UPC_ADD: + { + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] += pref[i - 1]; + } + break; + case UPC_MULT: + { + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] *= pref[i - 1]; + } + break; +#ifndef _UPC_NONINT_T + // Skip if not integral type, per spec 4.3.1.1 + // (See additional comments in upc_collective.c) + case UPC_AND: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] &= pref[i - 1]; + break; + case UPC_OR: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] |= pref[i - 1]; + break; + case UPC_XOR: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] ^= pref[i - 1]; + break; +#endif // _UPC_NOINT_T + case UPC_LOGAND: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] = pref[i - 1] && pref[i]; + break; + case UPC_LOGOR: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] = pref[i - 1] || pref[i]; + break; + case UPC_MIN: + for (i = first_thr + 1; i <= last_thr; ++i) + if (pref[i - 1] < pref[i]) + pref[i] = pref[i - 1]; + break; + case UPC_MAX: + for (i = first_thr + 1; i <= last_thr; ++i) + if (pref[i - 1] > pref[i]) + pref[i] = pref[i - 1]; + break; + case UPC_FUNC: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] = func (pref[i - 1], pref[i]); + break; + case UPC_NONCOMM_FUNC: + for (i = first_thr + 1; i <= last_thr; ++i) + pref[i] = func (pref[i - 1], pref[i]); + break; + } + } // THREAD-1 is done doing it's special job. + + upc_barrier; + + if ((n_local > 0) && ((w > 0) || (MYTHREAD > src_thr))) + { + // "Add" pref[MYTHREAD] to local prefixes. (On the first + // iteration, only threads beyond src_thr should do this.) + + switch (op) + { + case UPC_ADD: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + { + *((shared _UPC_RED_T *) dst + i) += pref[MYTHREAD]; + } + break; + case UPC_MULT: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + { + *((shared _UPC_RED_T *) dst + i) *= pref[MYTHREAD]; + } + break; +#ifndef _UPC_NONINT_T + // Skip if not integral type, per spec 4.3.1.1 + // (See additional comments in upc_collective.c) + case UPC_AND: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) &= pref[MYTHREAD]; + break; + case UPC_OR: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) |= pref[MYTHREAD]; + break; + case UPC_XOR: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) *= pref[MYTHREAD]; + break; +#endif // _UPC_NOINT_T + case UPC_LOGAND: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) = + *((shared _UPC_RED_T *) dst + i) && pref[MYTHREAD]; + break; + case UPC_LOGOR: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) = + *((shared _UPC_RED_T *) dst + i) || pref[MYTHREAD]; + break; + case UPC_MIN: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + if (pref[MYTHREAD] < *((shared _UPC_RED_T *) dst + i)) + *((shared _UPC_RED_T *) dst + i) = pref[MYTHREAD]; + break; + case UPC_MAX: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + if (pref[MYTHREAD] > *((shared _UPC_RED_T *) dst + i)) + *((shared _UPC_RED_T *) dst + i) = pref[MYTHREAD]; + break; + case UPC_FUNC: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) = + func (pref[MYTHREAD], *((shared _UPC_RED_T *) dst + i)); + break; + case UPC_NONCOMM_FUNC: + for (i = dst_offset; i < (dst_offset + n_local * THREADS); + i += THREADS) + *((shared _UPC_RED_T *) dst + i) = + func (pref[MYTHREAD], *((shared _UPC_RED_T *) dst + i)); + break; + } + } + + if (wraps > w) // Set up for the next iteration. + { + // swap between two halves of pref array + if (w % 2 == 0) + pref = pref + THREADS; + else + pref = pref - THREADS; + + rem_elems -= row_elems; + + if (rem_elems / (blk_size * THREADS) > 0) // next row is "full" + row_elems = blk_size * THREADS; + else + row_elems = rem_elems; + + first_thr = 0; + last_thr = (row_elems - 1) / blk_size; + + n_local = blk_size; + if ((MYTHREAD == last_thr) && (row_elems % blk_size > 0)) + n_local = row_elems % blk_size; + else if (MYTHREAD > last_thr) + n_local = 0; + + // Handle phase > 0 on first iteration + + if ((w == 0) && (MYTHREAD == src_thr)) + { + src_offset -= phase * THREADS; + dst_offset -= phase * THREADS; + } + + // All the preceding work makes this easy: + + src_offset += blk_size * THREADS; + dst_offset += blk_size * THREADS; + } + + } // End of main "wrap" loop + + // Synchronize using barriers in the cases of MYSYNC and ALLSYNC. + + if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode)) + + upc_barrier; + else + // we have to synchronize anyway to free the pref array + upc_barrier; + + if (MYTHREAD == THREADS - 1) + { + if (w % 2 == 0) + pref -= THREADS; /* DOB: be sure we free the original pointer! */ + upc_free (pref); + } +} |