diff options
Diffstat (limited to 'libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c')
-rw-r--r-- | libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c | 895 |
1 files changed, 895 insertions, 0 deletions
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c new file mode 100644 index 00000000000..2394ac8cbd6 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/mode-transitions.c @@ -0,0 +1,895 @@ +/* Miscellaneous test cases for gang/worker/vector mode transitions. */ + +#include <assert.h> +#include <stdbool.h> +#include <stdlib.h> +#include <math.h> +#include <openacc.h> + + +/* Test basic vector-partitioned mode transitions. */ + +void t1() +{ + int n = 0, arr[32], i; + + for (i = 0; i < 32; i++) + arr[i] = 0; + + #pragma acc parallel copy(n, arr) \ + num_gangs(1) num_workers(1) vector_length(32) + { + int j; + n++; + #pragma acc loop vector + for (j = 0; j < 32; j++) + arr[j]++; + n++; + } + + assert (n == 2); + + for (i = 0; i < 32; i++) + assert (arr[i] == 1); +} + + +/* Test vector-partitioned, gang-partitioned mode. */ + +void t2() +{ + int n[32], arr[1024], i; + + for (i = 0; i < 1024; i++) + arr[i] = 0; + + for (i = 0; i < 32; i++) + n[i] = 0; + + #pragma acc parallel copy(n, arr) \ + num_gangs(32) num_workers(1) vector_length(32) + { + int j, k; + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + + #pragma acc loop gang + for (j = 0; j < 32; j++) + #pragma acc loop vector + for (k = 0; k < 32; k++) + arr[j * 32 + k]++; + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + } + + for (i = 0; i < 32; i++) + assert (n[i] == 2); + + for (i = 0; i < 1024; i++) + assert (arr[i] == 1); +} + + +/* Test conditions inside vector-partitioned loops. */ + +void t4() +{ + int n[32], arr[1024], i; + + for (i = 0; i < 1024; i++) + arr[i] = i; + + for (i = 0; i < 32; i++) + n[i] = 0; + + #pragma acc parallel copy(n, arr) \ + num_gangs(32) num_workers(1) vector_length(32) + { + int j, k; + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + #pragma acc loop vector + for (k = 0; k < 32; k++) + if ((arr[j * 32 + k] % 2) != 0) + arr[j * 32 + k] *= 2; + } + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + } + + for (i = 0; i < 32; i++) + assert (n[i] == 2); + + for (i = 0; i < 1024; i++) + assert (arr[i] == ((i % 2) == 0 ? i : i * 2)); +} + + +/* Test conditions inside gang-partitioned/vector-partitioned loops. */ + +void t5() +{ + int n[32], arr[1024], i; + + for (i = 0; i < 1024; i++) + arr[i] = i; + + for (i = 0; i < 32; i++) + n[i] = 0; + + #pragma acc parallel copy(n, arr) \ + num_gangs(32) num_workers(1) vector_length(32) + { + int j; + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + + #pragma acc loop gang vector + for (j = 0; j < 1024; j++) + if ((arr[j] % 2) != 0) + arr[j] *= 2; + + #pragma acc loop gang(static:*) + for (j = 0; j < 32; j++) + n[j]++; + } + + for (i = 0; i < 32; i++) + assert (n[i] == 2); + + for (i = 0; i < 1024; i++) + assert (arr[i] == ((i % 2) == 0 ? i : i * 2)); +} + + +/* Test trivial operation of vector-single mode. */ + +void t7() +{ + int n = 0; + #pragma acc parallel copy(n) \ + num_gangs(1) num_workers(1) vector_length(32) + { + n++; + } + assert (n == 1); +} + + +/* Test vector-single, gang-partitioned mode. */ + +void t8() +{ + int arr[1024]; + int gangs; + + for (gangs = 1; gangs <= 1024; gangs <<= 1) + { + int i; + + for (i = 0; i < 1024; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(gangs) num_workers(1) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 1024; j++) + arr[j]++; + } + + for (i = 0; i < 1024; i++) + assert (arr[i] == 1); + } +} + + +/* Test conditions in vector-single mode. */ + +void t9() +{ + int arr[1024]; + int gangs; + + for (gangs = 1; gangs <= 1024; gangs <<= 1) + { + int i; + + for (i = 0; i < 1024; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(gangs) num_workers(1) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 1024; j++) + if ((j % 3) == 0) + arr[j]++; + else + arr[j] += 2; + } + + for (i = 0; i < 1024; i++) + assert (arr[i] == ((i % 3) == 0) ? 1 : 2); + } +} + + +/* Test switch in vector-single mode. */ + +void t10() +{ + int arr[1024]; + int gangs; + + for (gangs = 1; gangs <= 1024; gangs <<= 1) + { + int i; + + for (i = 0; i < 1024; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(gangs) num_workers(1) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 1024; j++) + switch (j % 5) + { + case 0: arr[j] += 1; break; + case 1: arr[j] += 2; break; + case 2: arr[j] += 3; break; + case 3: arr[j] += 4; break; + case 4: arr[j] += 5; break; + default: arr[j] += 99; + } + } + + for (i = 0; i < 1024; i++) + assert (arr[i] == (i % 5) + 1); + } +} + + +/* Test switch in vector-single mode, initialise array on device. */ + +void t11() +{ + int arr[1024]; + int i; + + for (i = 0; i < 1024; i++) + arr[i] = 99; + + #pragma acc parallel copy(arr) \ + num_gangs(1024) num_workers(1) vector_length(32) + { + int j; + + /* This loop and the one following must be distributed to available gangs + in the same way to ensure data dependencies are not violated (hence the + "static" clauses). */ + #pragma acc loop gang(static:*) + for (j = 0; j < 1024; j++) + arr[j] = 0; + + #pragma acc loop gang(static:*) + for (j = 0; j < 1024; j++) + switch (j % 5) + { + case 0: arr[j] += 1; break; + case 1: arr[j] += 2; break; + case 2: arr[j] += 3; break; + case 3: arr[j] += 4; break; + case 4: arr[j] += 5; break; + default: arr[j] += 99; + } + } + + for (i = 0; i < 1024; i++) + assert (arr[i] == (i % 5) + 1); +} + + +/* Test multiple conditions in vector-single mode. */ + +#define NUM_GANGS 4096 +void t12() +{ + bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS]; + int i; + + #pragma acc parallel copyout(fizz, buzz, fizzbuzz) \ + num_gangs(NUM_GANGS) num_workers(1) vector_length(32) + { + int j; + + /* This loop and the one following must be distributed to available gangs + in the same way to ensure data dependencies are not violated (hence the + "static" clauses). */ + #pragma acc loop gang(static:*) + for (j = 0; j < NUM_GANGS; j++) + fizz[j] = buzz[j] = fizzbuzz[j] = 0; + + #pragma acc loop gang(static:*) + for (j = 0; j < NUM_GANGS; j++) + { + if ((j % 3) == 0 && (j % 5) == 0) + fizzbuzz[j] = 1; + else + { + if ((j % 3) == 0) + fizz[j] = 1; + else if ((j % 5) == 0) + buzz[j] = 1; + } + } + } + + for (i = 0; i < NUM_GANGS; i++) + { + assert (fizzbuzz[i] == ((i % 3) == 0 && (i % 5) == 0)); + assert (fizz[i] == ((i % 3) == 0 && (i % 5) != 0)); + assert (buzz[i] == ((i % 3) != 0 && (i % 5) == 0)); + } +} +#undef NUM_GANGS + + +/* Test worker-partitioned/vector-single mode. */ + +void t13() +{ + int arr[32 * 8], i; + + for (i = 0; i < 32 * 8; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + #pragma acc loop worker + for (k = 0; k < 8; k++) + arr[j * 8 + k] += j * 8 + k; + } + } + + for (i = 0; i < 32 * 8; i++) + assert (arr[i] == i); +} + + +/* Test worker-single/worker-partitioned transitions. */ + +void t16() +{ + int n[32], arr[32 * 32], i; + + for (i = 0; i < 32 * 32; i++) + arr[i] = 0; + + for (i = 0; i < 32; i++) + n[i] = 0; + + #pragma acc parallel copy(n, arr) \ + num_gangs(8) num_workers(16) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + + n[j]++; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr[j * 32 + k]++; + + n[j]++; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr[j * 32 + k]++; + + n[j]++; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr[j * 32 + k]++; + + n[j]++; + } + } + + for (i = 0; i < 32; i++) + assert (n[i] == 4); + + for (i = 0; i < 32 * 32; i++) + assert (arr[i] == 3); +} + + +/* Test correct synchronisation between worker-partitioned loops. */ + +void t17() +{ + int arr_a[32 * 32], arr_b[32 * 32], i; + int num_workers, num_gangs; + + for (num_workers = 1; num_workers <= 32; num_workers <<= 1) + for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1) + { + for (i = 0; i < 32 * 32; i++) + arr_a[i] = i; + + #pragma acc parallel copyin(arr_a) copyout(arr_b) \ + num_gangs(num_gangs) num_workers(num_workers) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2; + } + } + + for (i = 0; i < 32 * 32; i++) + assert (arr_b[i] == (i ^ 31) * 8); + } +} + + +/* Test correct synchronisation between worker+vector-partitioned loops. */ + +void t18() +{ + int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i; + int num_workers, num_gangs; + + for (num_workers = 1; num_workers <= 32; num_workers <<= 1) + for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1) + { + for (i = 0; i < 32 * 32 * 32; i++) + arr_a[i] = i; + + #pragma acc parallel copyin(arr_a) copyout(arr_b) \ + num_gangs(num_gangs) num_workers(num_workers) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + + #pragma acc loop worker vector + for (k = 0; k < 32 * 32; k++) + arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2; + + #pragma acc loop worker vector + for (k = 0; k < 32 * 32; k++) + arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2; + + #pragma acc loop worker vector + for (k = 0; k < 32 * 32; k++) + arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2; + } + } + + for (i = 0; i < 32 * 32 * 32; i++) + assert (arr_b[i] == (i ^ 1023) * 8); + } +} + + +/* Test correct synchronisation between vector-partitioned loops in + worker-partitioned mode. */ + +void t19() +{ + int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i; + int num_workers, num_gangs; + + for (num_workers = 1; num_workers <= 32; num_workers <<= 1) + for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1) + { + for (i = 0; i < 32 * 32 * 32; i++) + arr_a[i] = i; + + for (i = 0; i < 32 * 32; i++) + n[i] = 0; + + #pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \ + num_gangs(num_gangs) num_workers(num_workers) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + + #pragma acc loop worker + for (k = 0; k < 32; k++) + { + int m; + + n[j * 32 + k]++; + + #pragma acc loop vector + for (m = 0; m < 32; m++) + { + if (((j * 1024 + k * 32 + m) % 2) == 0) + arr_b[j * 1024 + k * 32 + (31 - m)] + = arr_a[j * 1024 + k * 32 + m] * 2; + else + arr_b[j * 1024 + k * 32 + (31 - m)] + = arr_a[j * 1024 + k * 32 + m] * 3; + } + + /* Test returning to vector-single mode... */ + n[j * 32 + k]++; + + #pragma acc loop vector + for (m = 0; m < 32; m++) + { + if (((j * 1024 + k * 32 + m) % 3) == 0) + arr_a[j * 1024 + k * 32 + (31 - m)] + = arr_b[j * 1024 + k * 32 + m] * 5; + else + arr_a[j * 1024 + k * 32 + (31 - m)] + = arr_b[j * 1024 + k * 32 + m] * 7; + } + + /* ...and back-to-back vector loops. */ + + #pragma acc loop vector + for (m = 0; m < 32; m++) + { + if (((j * 1024 + k * 32 + m) % 2) == 0) + arr_b[j * 1024 + k * 32 + (31 - m)] + = arr_a[j * 1024 + k * 32 + m] * 3; + else + arr_b[j * 1024 + k * 32 + (31 - m)] + = arr_a[j * 1024 + k * 32 + m] * 2; + } + } + } + } + + for (i = 0; i < 32 * 32; i++) + assert (n[i] == 2); + + for (i = 0; i < 32 * 32 * 32; i++) + { + int m = 6 * ((i % 3) == 0 ? 5 : 7); + assert (arr_b[i] == (i ^ 31) * m); + } + } +} + + +/* With -O0, variables are on the stack, not in registers. Check that worker + state propagation handles the stack frame. */ + +void t20() +{ + int w0 = 0; + int w1 = 0; + int w2 = 0; + int w3 = 0; + int w4 = 0; + int w5 = 0; + int w6 = 0; + int w7 = 0; + + int i; + +#pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \ + num_gangs (1) num_workers (8) + { + int internal = 100; + +#pragma acc loop worker + for (i = 0; i < 8; i++) + { + switch (i) + { + case 0: w0 = internal; break; + case 1: w1 = internal; break; + case 2: w2 = internal; break; + case 3: w3 = internal; break; + case 4: w4 = internal; break; + case 5: w5 = internal; break; + case 6: w6 = internal; break; + case 7: w7 = internal; break; + default: break; + } + } + } + + if (w0 != 100 + || w1 != 100 + || w2 != 100 + || w3 != 100 + || w4 != 100 + || w5 != 100 + || w6 != 100 + || w7 != 100) + __builtin_abort (); +} + + +/* Test worker-single/vector-single mode. */ + +void t21() +{ + int arr[32], i; + + for (i = 0; i < 32; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + arr[j]++; + } + + for (i = 0; i < 32; i++) + assert (arr[i] == 1); +} + + +/* Test worker-single/vector-single mode. */ + +void t22() +{ + int arr[32], i; + + for (i = 0; i < 32; i++) + arr[i] = 0; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + #pragma acc atomic + arr[j]++; + } + } + + for (i = 0; i < 32; i++) + assert (arr[i] == 1); +} + + +/* Test condition in worker-single/vector-single mode. */ + +void t23() +{ + int arr[32], i; + + for (i = 0; i < 32; i++) + arr[i] = i; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + if ((arr[j] % 2) != 0) + arr[j]++; + else + arr[j] += 2; + } + + for (i = 0; i < 32; i++) + assert (arr[i] == ((i % 2) != 0) ? i + 1 : i + 2); +} + + +/* Test switch in worker-single/vector-single mode. */ + +void t24() +{ + int arr[32], i; + + for (i = 0; i < 32; i++) + arr[i] = i; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + switch (arr[j] % 5) + { + case 0: arr[j] += 1; break; + case 1: arr[j] += 2; break; + case 2: arr[j] += 3; break; + case 3: arr[j] += 4; break; + case 4: arr[j] += 5; break; + default: arr[j] += 99; + } + } + + for (i = 0; i < 32; i++) + assert (arr[i] == i + (i % 5) + 1); +} + + +/* Test worker-single/vector-partitioned mode. */ + +void t25() +{ + int arr[32 * 32], i; + + for (i = 0; i < 32 * 32; i++) + arr[i] = i; + + #pragma acc parallel copy(arr) \ + num_gangs(8) num_workers(8) vector_length(32) + { + int j; + #pragma acc loop gang + for (j = 0; j < 32; j++) + { + int k; + #pragma acc loop vector + for (k = 0; k < 32; k++) + { + #pragma acc atomic + arr[j * 32 + k]++; + } + } + } + + for (i = 0; i < 32 * 32; i++) + assert (arr[i] == i + 1); +} + + +/* Test worker-single, vector-partitioned, gang-redundant mode. */ + +#define ACTUAL_GANGS 8 +void t27() +{ + int n, arr[32], i; + int ondev; + + for (i = 0; i < 32; i++) + arr[i] = 0; + + n = 0; + + #pragma acc parallel copy(n, arr) copyout(ondev) \ + num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32) + { + int j; + + ondev = acc_on_device (acc_device_not_host); + + #pragma acc atomic + n++; + + #pragma acc loop vector + for (j = 0; j < 32; j++) + { + #pragma acc atomic + arr[j] += 1; + } + + #pragma acc atomic + n++; + } + + int m = ondev ? ACTUAL_GANGS : 1; + + assert (n == m * 2); + + for (i = 0; i < 32; i++) + assert (arr[i] == m); +} +#undef ACTUAL_GANGS + + +/* Check if worker-single variables get broadcastd to vectors. */ + +#pragma acc routine +float t28_routine () +{ + return 2.71; +} + +#define N 32 +void t28() +{ + float threads[N], v1 = 3.14; + + for (int i = 0; i < N; i++) + threads[i] = -1; + +#pragma acc parallel num_gangs (1) vector_length (32) copy (v1) + { + float val = t28_routine (); + +#pragma acc loop vector + for (int i = 0; i < N; i++) + threads[i] = val + v1*i; + } + + for (int i = 0; i < N; i++) + assert (fabs (threads[i] - (t28_routine () + v1*i)) < 0.0001); +} +#undef N + + +int main() +{ + t1(); + t2(); + t4(); + t5(); + t7(); + t8(); + t9(); + t10(); + t11(); + t12(); + t13(); + t16(); + t17(); + t18(); + t19(); + t20(); + t21(); + t22(); + t23(); + t24(); + t25(); + t27(); + t28(); + + return 0; +} |