summaryrefslogtreecommitdiff
path: root/openmp
diff options
context:
space:
mode:
authorJonas Hahnfeld <hahnjo@hahnjo.de>2018-09-29 16:02:17 +0000
committerJonas Hahnfeld <hahnjo@hahnjo.de>2018-09-29 16:02:17 +0000
commit452d157626274a344571ba7821ecbf2d592786fd (patch)
tree473f10ec723a37ac2832ee8ac6b7b7001738af22 /openmp
parentee0c0e5d097a454648d395ac5ef42891a5e53dc2 (diff)
[libomptarget-nvptx] Fix number of threads in parallel
If there is no num_threads() clause we must consider the nthreads-var ICV. Its value is set by omp_set_num_threads() and can be queried using omp_get_max_num_threads(). The rewritten code now closely resembles the algorithm given in the OpenMP standard. Differential Revision: https://reviews.llvm.org/D51783
Diffstat (limited to 'openmp')
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu4
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu125
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c102
3 files changed, 147 insertions, 84 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index f2599a4e9c2..929f9db9962 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -61,8 +61,8 @@ EXTERN int omp_get_max_threads(void) {
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
int rc = 1; // default is 1 thread avail
if (!currTaskDescr->InParallelRegion()) {
- // not currently in a parallel region... all are available
- rc = GetNumberOfProcsInTeam();
+ // Not currently in a parallel region, return what was set.
+ rc = currTaskDescr->NThreads();
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
}
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index f0ba41bd18e..13e64e44ac5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -193,25 +193,38 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
-// return number of cuda threads that participate to parallel
-// calculation has to consider simd implementation in nvptx
-// i.e. (num omp threads * num lanes)
-//
-// cudathreads =
-// if(num_threads != 0) {
-// if(thread_limit > 0) {
-// min (num_threads*numLanes ; thread_limit*numLanes);
-// } else {
-// min (num_threads*numLanes; blockDim.x)
-// }
-// } else {
-// if (thread_limit != 0) {
-// min (thread_limit*numLanes; blockDim.x)
-// } else { // no thread_limit, no num_threads, use all cuda threads
-// blockDim.x;
-// }
-// }
-//
+static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+ uint16_t NThreadsICV,
+ uint16_t ThreadLimit) {
+ uint16_t ThreadsRequested = NThreadsICV;
+ if (NumThreadsClause != 0) {
+ ThreadsRequested = NumThreadsClause;
+ }
+
+ uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
+ if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
+ ThreadsAvailable = ThreadLimit;
+ }
+
+ uint16_t NumThreads = ThreadsAvailable;
+ if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
+ NumThreads = ThreadsRequested;
+ }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ // On Volta and newer architectures we require that all lanes in
+ // a warp participate in the parallel region. Round down to a
+ // multiple of WARPSIZE since it is legal to do so in OpenMP.
+ if (NumThreads < WARPSIZE) {
+ NumThreads = 1;
+ } else {
+ NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
+ }
+#endif
+
+ return NumThreads;
+}
+
// This routine is always called by the team master..
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized) {
@@ -234,78 +247,26 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
return;
}
- uint16_t CudaThreadsForParallel = 0;
- uint16_t NumThreadsClause =
+ uint16_t &NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
- // we cannot have more than block size
- uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
-
- // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
- // specified by the thread_limit clause on the target directive.
- // GetNumberOfWorkersInTeam(): This is the number of workers available
- // in this kernel instance.
- //
- // E.g: If thread_limit is 33, the kernel is launched with 33+32=65
- // threads. The last warp is the master warp so in this case
- // GetNumberOfWorkersInTeam() returns 64.
-
- // this is different from ThreadAvail of OpenMP because we may be
- // using some of the CUDA threads as SIMD lanes
- int NumLanes = 1;
- if (NumThreadsClause != 0) {
- // reset request to avoid propagating to successive #parallel
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
- 0;
-
- // assume that thread_limit*numlanes is already <= CudaThreadsAvail
- // because that is already checked on the host side (CUDA offloading rtl)
- if (currTaskDescr->ThreadLimit() != 0)
- CudaThreadsForParallel =
- NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
- ? NumThreadsClause * NumLanes
- : currTaskDescr->ThreadLimit() * NumLanes;
- else {
- CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
- ? CudaThreadsAvail
- : NumThreadsClause * NumLanes;
- }
- } else {
- if (currTaskDescr->ThreadLimit() != 0) {
- CudaThreadsForParallel =
- (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
- ? CudaThreadsAvail
- : currTaskDescr->ThreadLimit() * NumLanes;
- } else
- CudaThreadsForParallel = CudaThreadsAvail;
- }
+ uint16_t NumThreads =
+ determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
+ currTaskDescr->ThreadLimit());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp participate in the parallel region. Round down to a
- // multiple of WARPSIZE since it is legal to do so in OpenMP.
- // CudaThreadsAvail is the number of workers available in this
- // kernel instance and is greater than or equal to
- // currTaskDescr->ThreadLimit().
- if (CudaThreadsForParallel < CudaThreadsAvail) {
- CudaThreadsForParallel =
- (CudaThreadsForParallel < WARPSIZE)
- ? 1
- : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+ if (NumThreadsClause != 0) {
+ // Reset request to avoid propagating to successive #parallel
+ NumThreadsClause = 0;
}
-#endif
- ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
- "bad thread request of %d threads", CudaThreadsForParallel);
+ ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+ NumThreads);
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"only team master can create parallel");
- // set number of threads on work descriptor
- // this is different from the number of cuda threads required for the parallel
- // region
+ // Set number of threads on work descriptor.
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
- workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
- CudaThreadsForParallel / NumLanes);
+ workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
}
// All workers call this function. Deactivate those not needed.
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
new file mode 100644
index 00000000000..4a2f73fee82
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
@@ -0,0 +1,102 @@
+// RUN: %compile-run-and-check
+
+#include <stdio.h>
+#include <omp.h>
+
+const int WarpSize = 32;
+const int NumThreads1 = 1 * WarpSize;
+const int NumThreads2 = 2 * WarpSize;
+const int NumThreads3 = 3 * WarpSize;
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+ int check1[MaxThreads];
+ int check2[MaxThreads];
+ int check3[MaxThreads];
+ int check4[MaxThreads];
+ for (int i = 0; i < MaxThreads; i++) {
+ check1[i] = check2[i] = check3[i] = check4[i] = 0;
+ }
+
+ int maxThreads1 = -1;
+ int maxThreads2 = -1;
+ int maxThreads3 = -1;
+
+ #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
+ map(maxThreads1, maxThreads2, maxThreads3)
+ {
+ #pragma omp parallel num_threads(NumThreads1)
+ {
+ check1[omp_get_thread_num()] += omp_get_num_threads();
+ }
+
+ // API method to set number of threads in parallel regions without
+ // num_threads() clause.
+ omp_set_num_threads(NumThreads2);
+ maxThreads1 = omp_get_max_threads();
+ #pragma omp parallel
+ {
+ check2[omp_get_thread_num()] += omp_get_num_threads();
+ }
+
+ maxThreads2 = omp_get_max_threads();
+
+ // num_threads() clause should override nthreads-var ICV.
+ #pragma omp parallel num_threads(NumThreads3)
+ {
+ check3[omp_get_thread_num()] += omp_get_num_threads();
+ }
+
+ maxThreads3 = omp_get_max_threads();
+
+ // Effect from omp_set_num_threads() should still be visible.
+ #pragma omp parallel
+ {
+ check4[omp_get_thread_num()] += omp_get_num_threads();
+ }
+ }
+
+ // CHECK: maxThreads1 = 64
+ printf("maxThreads1 = %d\n", maxThreads1);
+ // CHECK: maxThreads2 = 64
+ printf("maxThreads2 = %d\n", maxThreads2);
+ // CHECK: maxThreads3 = 64
+ printf("maxThreads3 = %d\n", maxThreads3);
+
+ // CHECK-NOT: invalid
+ for (int i = 0; i < MaxThreads; i++) {
+ if (i < NumThreads1) {
+ if (check1[i] != NumThreads1) {
+ printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
+ }
+ } else if (check1[i] != 0) {
+ printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+ }
+
+ if (i < NumThreads2) {
+ if (check2[i] != NumThreads2) {
+ printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
+ }
+ } else if (check2[i] != 0) {
+ printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+ }
+
+ if (i < NumThreads3) {
+ if (check3[i] != NumThreads3) {
+ printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
+ }
+ } else if (check3[i] != 0) {
+ printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
+ }
+
+ if (i < NumThreads2) {
+ if (check4[i] != NumThreads2) {
+ printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
+ }
+ } else if (check4[i] != 0) {
+ printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
+ }
+ }
+
+ return 0;
+}