aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChung-Lin Tang <cltang@codesourcery.com>2017-07-25 15:02:26 +0000
committerChung-Lin Tang <cltang@codesourcery.com>2017-07-25 15:02:26 +0000
commitc0e29b8f412cd1ba87389bf9479429ff2c9b28d5 (patch)
tree295d6ba9799ac4875cc665b65db86e02006a7383
parent0f58e536303c26327154e8e4f9c52cf3bbbd30ab (diff)
2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
gcc/c/ * c-parser.c (c_parser_oacc_clause_wait): Add representation of wait clause without argument as 'wait (GOMP_ASYNC_NOVAL)', adjust comments. gcc/cp/ * parser.c (cp_parser_oacc_clause_wait): Add representation of wait clause without argument as 'wait (GOMP_ASYNC_NOVAL)', adjust comments. gcc/fortran/ * trans-openmp.c (gfc_trans_omp_clauses_1): Add representation of wait clause without argument as 'wait (GOMP_ASYNC_NOVAL)'. gcc/ * omp-low.c (expand_omp_target): Add middle-end support for handling OMP_CLAUSE_WAIT clause with a GOMP_ASYNC_NOVAL(-1) as the argument. gcc/testsuite/ * c-c++-common/goacc/dtype-1.c: Adjust testcase. * gfortran.dg/goacc/dtype-1.f95: Likewise. include/ * gomp-constants.h (GOMP_LAUNCH_OP_MASK): Define. (GOMP_LAUNCH_PACK): Add bitwise-and of GOMP_LAUNCH_OP_MASK. (GOMP_LAUNCH_OP): Likewise. libgomp/ * libgomp-plugin.h (struct goacc_asyncqueue): Declare. (struct goacc_asyncqueue_list): Likewise. (goacc_aq): Likewise. (goacc_aq_list): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. (GOMP_OFFLOAD_openacc_async_test): Remove. (GOMP_OFFLOAD_openacc_async_test_all): Remove. (GOMP_OFFLOAD_openacc_async_wait): Remove. (GOMP_OFFLOAD_openacc_async_wait_async): Remove. (GOMP_OFFLOAD_openacc_async_wait_all): Remove. (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. (GOMP_OFFLOAD_openacc_async_set_async): Remove. (GOMP_OFFLOAD_openacc_exec): Adjust declaration. (GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise. (GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise. (GOMP_OFFLOAD_openacc_async_exec): Declare. (GOMP_OFFLOAD_openacc_async_construct): Declare. (GOMP_OFFLOAD_openacc_async_destruct): Declare. (GOMP_OFFLOAD_openacc_async_test): Declare. (GOMP_OFFLOAD_openacc_async_synchronize): Declare. (GOMP_OFFLOAD_openacc_async_serialize): Declare. (GOMP_OFFLOAD_openacc_async_queue_callback): Declare. (GOMP_OFFLOAD_openacc_async_host2dev): Declare. (GOMP_OFFLOAD_openacc_async_dev2host): Declare. * libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct. (gomp_acc_insert_pointer): Adjust declaration. (gomp_copy_host2dev): New declaration. (gomp_copy_dev2host): Likewise. (gomp_map_vars_async): Likewise. (gomp_unmap_tgt): Likewise. (gomp_unmap_vars_async): Likewise. (gomp_fini_device): Likewise. * oacc-async.c (get_goacc_thread): New function. (get_goacc_thread_device): New function. (lookup_goacc_asyncqueue): New function. (get_goacc_asyncqueue): New function. (acc_async_test): Adjust code to use new async design. (acc_async_test_all): Likewise. (acc_wait): Likewise. (acc_wait_async): Likewise. * oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async design. (acc_set_cuda_stream): Likewise. * oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'. (host_openacc_register_async_cleanup): Remove. (host_openacc_async_exec): New function. (host_openacc_async_test): Adjust parameters. (host_openacc_async_test_all): Remove. (host_openacc_async_wait): Remove. (host_openacc_async_wait_async): Remove. (host_openacc_async_wait_all): Remove. (host_openacc_async_wait_all_async): Remove. (host_openacc_async_set_async): Remove. (host_openacc_async_synchronize): New function. (host_openacc_async_serialize): New function. (host_openacc_async_host2dev): New function. (host_openacc_async_dev2host): New function. (host_openacc_async_queue_callback): New function. (host_openacc_async_construct): New function. (host_openacc_async_destruct): New function. (struct gomp_device_descr host_dispatch): Remove initialization of old interface, add intialization of new async sub-struct. * oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device. (goacc_attach_host_thread_to_device): Remove old async code usage. * oacc-int.h (goacc_init_asyncqueues): New declaration. (goacc_fini_asyncqueues): Likewise. (goacc_async_copyout_unmap_vars): Likewise. (goacc_async_free): Likewise. (get_goacc_asyncqueue): Likewise. (lookup_goacc_asyncqueue): Likewise. * oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async design. (acc_is_present): Explicitly use 1/0 as return value; (present_create_copy): Adjust code to use new async design. (delete_copyout): Likewise. (update_dev_host): Likewise. (gomp_acc_insert_pointer): Add async parameter, adjust code to use new async design. (gomp_acc_remove_pointer): Adjust code to use new async design. * oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async design, adjust profiling bits, interpret launch op as signed 16-bit field. (GOACC_enter_exit_data): Handle -1 as waits num, adjust code to use new async design. (goacc_wait): Adjust code to use new async design. (GOACC_update): Likewise. * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Remove. * target.c (goacc_device_copy_async): New function. (gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter, add goacc_device_copy_async case. (gomp_copy_dev2host): Likewise. (gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code. (gomp_map_pointer): Likewise. (gomp_map_fields_existing): Likewise. (gomp_map_vars): Add function for compatiblity. (gomp_map_vars_async): Adapt from gomp_map_vars, add goacc_asyncqueue parameter. (gomp_unmap_tgt): Remove statis, add attribute_hidden. (gomp_unmap_vars): Add function for compatiblity. (gomp_unmap_vars_async): Adapt from gomp_unmap_vars, add goacc_asyncqueue parameter. (gomp_fini_device): New function. (gomp_exit_data): Adjust gomp_copy_dev2host call. (gomp_load_plugin_for_device): Remove old interface, adjust to load new async interface. (gomp_target_fini): Adjust code to call gomp_fini_device. * plugin/plugin-nvptx.c (struct cuda_map): Remove. (struct ptx_stream): Remove. (struct nvptx_thread): Remove current_stream field. (cuda_map_create): Remove. (cuda_map_destroy): Remove. (map_init): Remove. (map_fini): Remove. (map_pop): Remove. (map_push): Remove. (struct goacc_asyncqueue): Define. (struct nvptx_callback): Define. (struct ptx_free_block): Define. (struct ptx_device): Remove null_stream, active_streams, async_streams, stream_lock, and next fields. (enum ptx_event_type): Remove. (struct ptx_event): Remove. (ptx_event_lock): Remove. (ptx_events): Remove. (init_streams_for_device): Remove. (fini_streams_for_device): Remove. (select_stream_for_async): Remove. (nvptx_init): Remove ptx_events and ptx_event_lock references. (nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED case. (nvptx_open_device): Add free_blocks initialization, remove init_streams_for_device call. (nvptx_close_device): Remove fini_streams_for_device call, add free_blocks destruct code. (event_gc): Remove. (event_add): Remove. (nvptx_exec): Adjust parameters and code. (nvptx_free): Likewise. (nvptx_host2dev): Remove. (nvptx_dev2host): Remove. (nvptx_set_async): Remove. (nvptx_async_test): Remove. (nvptx_async_test_all): Remove. (nvptx_wait): Remove. (nvptx_wait_async): Remove. (nvptx_wait_all): Remove. (nvptx_wait_all_async): Remove. (nvptx_get_cuda_stream): Remove. (nvptx_set_cuda_stream): Remove. (GOMP_OFFLOAD_alloc): Adjust code. (GOMP_OFFLOAD_free): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove. (GOMP_OFFLOAD_openacc_exec): Adjust parameters and code. (GOMP_OFFLOAD_openacc_async_test_all): Remove. (GOMP_OFFLOAD_openacc_async_wait): Remove. (GOMP_OFFLOAD_openacc_async_wait_async): Remove. (GOMP_OFFLOAD_openacc_async_wait_all): Remove. (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove. (GOMP_OFFLOAD_openacc_async_set_async): Remove. (cuda_free_argmem): New function. (GOMP_OFFLOAD_openacc_async_exec): New plugin hook function. (GOMP_OFFLOAD_openacc_create_thread_data): Adjust code. (GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code. (GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code. (GOMP_OFFLOAD_openacc_async_construct): New plugin hook function. (GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function. (GOMP_OFFLOAD_openacc_async_test): Remove and re-implement. (GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function. (GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function. (GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function. (cuda_callback_wrapper): New function. (cuda_memcpy_sanity_check): New function. (GOMP_OFFLOAD_host2dev): Remove and re-implement. (GOMP_OFFLOAD_dev2host): Remove and re-implement. (GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function. (GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function. * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c (cb_enter_data_start): Adjust testcase. * testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Adjust testcase. * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-71.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise. * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise. git-svn-id: https://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@250528 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/ChangeLog.gomp5
-rw-r--r--gcc/c/ChangeLog.gomp5
-rw-r--r--gcc/c/c-parser.c10
-rw-r--r--gcc/cp/ChangeLog.gomp5
-rw-r--r--gcc/cp/parser.c16
-rw-r--r--gcc/fortran/ChangeLog.gomp5
-rw-r--r--gcc/fortran/trans-openmp.c7
-rw-r--r--gcc/omp-low.c16
-rw-r--r--gcc/testsuite/ChangeLog.gomp5
-rw-r--r--gcc/testsuite/c-c++-common/goacc/dtype-1.c6
-rw-r--r--gcc/testsuite/gfortran.dg/goacc/dtype-1.f954
-rw-r--r--include/ChangeLog.gomp6
-rw-r--r--include/gomp-constants.h5
-rw-r--r--libgomp/ChangeLog.gomp189
-rw-r--r--libgomp/libgomp-plugin.h45
-rw-r--r--libgomp/libgomp.h48
-rw-r--r--libgomp/oacc-async.c203
-rw-r--r--libgomp/oacc-cuda.c18
-rw-r--r--libgomp/oacc-host.c83
-rw-r--r--libgomp/oacc-init.c4
-rw-r--r--libgomp/oacc-int.h9
-rw-r--r--libgomp/oacc-mem.c69
-rw-r--r--libgomp/oacc-parallel.c104
-rw-r--r--libgomp/oacc-plugin.c11
-rw-r--r--libgomp/plugin/plugin-nvptx.c1644
-rw-r--r--libgomp/target.c171
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c5
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c2
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c2
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c2
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c16
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c11
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c5
-rw-r--r--libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c2
-rw-r--r--libgomp/testsuite/libgomp.oacc-fortran/lib-12.f901
35 files changed, 1182 insertions, 1557 deletions
diff --git a/gcc/ChangeLog.gomp b/gcc/ChangeLog.gomp
index 301cc83f52f..33da0ec34b7 100644
--- a/gcc/ChangeLog.gomp
+++ b/gcc/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * omp-low.c (expand_omp_target): Add middle-end support for handling
+ OMP_CLAUSE_WAIT clause with a GOMP_ASYNC_NOVAL(-1) as the argument.
+
2017-07-19 Tom de Vries <tom@codesourcery.com>
backport from mainline:
diff --git a/gcc/c/ChangeLog.gomp b/gcc/c/ChangeLog.gomp
index c70003f4e03..679964aa3f7 100644
--- a/gcc/c/ChangeLog.gomp
+++ b/gcc/c/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * c-parser.c (c_parser_oacc_clause_wait): Add representation of wait
+ clause without argument as 'wait (GOMP_ASYNC_NOVAL)', adjust comments.
+
2017-05-17 Thomas Schwinge <thomas@codesourcery.com>
* c-parser.c (c_parser_oacc_data_clause)
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 34f8b176cc1..1873aa43b9b 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11941,7 +11941,7 @@ c_parser_oacc_clause_tile (c_parser *parser, tree list)
}
/* OpenACC:
- wait ( int-expr-list ) */
+ wait [( int-expr-list )] */
static tree
c_parser_oacc_clause_wait (c_parser *parser, tree list)
@@ -11950,7 +11950,15 @@ c_parser_oacc_clause_wait (c_parser *parser, tree list)
if (c_parser_peek_token (parser)->type == CPP_OPEN_PAREN)
list = c_parser_oacc_wait_list (parser, clause_loc, list);
+ else
+ {
+ tree c = build_omp_clause (clause_loc, OMP_CLAUSE_WAIT);
+ OMP_CLAUSE_DECL (c) = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+ OMP_CLAUSE_CHAIN (c) = list;
+ list = c;
+ }
+
return list;
}
diff --git a/gcc/cp/ChangeLog.gomp b/gcc/cp/ChangeLog.gomp
index b0c3dbf9693..3ae1f21f439 100644
--- a/gcc/cp/ChangeLog.gomp
+++ b/gcc/cp/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * parser.c (cp_parser_oacc_clause_wait): Add representation of wait
+ clause without argument as 'wait (GOMP_ASYNC_NOVAL)', adjust comments.
+
2017-05-17 Thomas Schwinge <thomas@codesourcery.com>
* pt.c (tsubst_omp_clauses): Handle "OMP_CLAUSE_FINALIZE".
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index cbb11d022fe..49c530653a8 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -30619,17 +30619,23 @@ cp_parser_oacc_wait_list (cp_parser *parser, location_t clause_loc, tree list)
}
/* OpenACC:
- wait ( int-expr-list ) */
+ wait [( int-expr-list )] */
static tree
cp_parser_oacc_clause_wait (cp_parser *parser, tree list)
{
location_t location = cp_lexer_peek_token (parser->lexer)->location;
- if (cp_lexer_peek_token (parser->lexer)->type != CPP_OPEN_PAREN)
- return list;
-
- list = cp_parser_oacc_wait_list (parser, location, list);
+ if (cp_lexer_peek_token (parser->lexer)->type == CPP_OPEN_PAREN)
+ list = cp_parser_oacc_wait_list (parser, location, list);
+ else
+ {
+ tree c = build_omp_clause (location, OMP_CLAUSE_WAIT);
+
+ OMP_CLAUSE_DECL (c) = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+ OMP_CLAUSE_CHAIN (c) = list;
+ list = c;
+ }
return list;
}
diff --git a/gcc/fortran/ChangeLog.gomp b/gcc/fortran/ChangeLog.gomp
index 37803095a52..221c292ac4f 100644
--- a/gcc/fortran/ChangeLog.gomp
+++ b/gcc/fortran/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * trans-openmp.c (gfc_trans_omp_clauses_1): Add representation of wait
+ clause without argument as 'wait (GOMP_ASYNC_NOVAL)'.
+
2017-06-29 Cesar Philippidis <cesar@codesourcery.com>
Backport from trunk:
diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index ca41903b68f..8018053e63c 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -2962,6 +2962,13 @@ gfc_trans_omp_clauses_1 (stmtblock_t *block, gfc_omp_clauses *clauses,
omp_clauses = c;
}
}
+ else if (clauses->wait)
+ {
+ c = build_omp_clause (where.lb->location, OMP_CLAUSE_WAIT);
+ OMP_CLAUSE_DECL (c) = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+ OMP_CLAUSE_CHAIN (c) = omp_clauses;
+ omp_clauses = c;
+ }
if (clauses->num_gangs_expr)
{
tree num_gangs_var
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 048d9fbabd4..929de45da6b 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -14226,16 +14226,30 @@ expand_omp_target (struct omp_region *region)
/* ... push a placeholder. */
args.safe_push (integer_zero_node);
+ bool noval_seen = false;
+ tree noval = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
+
for (; c; c = OMP_CLAUSE_CHAIN (c))
if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WAIT)
{
+ if (tree_int_cst_compare (OMP_CLAUSE_WAIT_EXPR (c), noval) == 0)
+ {
+ noval_seen = true;
+ continue;
+ }
+
args.safe_push (fold_convert_loc (OMP_CLAUSE_LOCATION (c),
integer_type_node,
OMP_CLAUSE_WAIT_EXPR (c)));
num_waits++;
}
- if (!tagging || num_waits)
+ if (noval_seen && num_waits == 0)
+ args[t_wait_idx] =
+ (tagging
+ ? oacc_launch_pack (GOMP_LAUNCH_WAIT, NULL_TREE, GOMP_ASYNC_NOVAL)
+ : noval);
+ else if (!tagging || num_waits)
{
tree len;
diff --git a/gcc/testsuite/ChangeLog.gomp b/gcc/testsuite/ChangeLog.gomp
index b39cbf39ae5..55573bb6552 100644
--- a/gcc/testsuite/ChangeLog.gomp
+++ b/gcc/testsuite/ChangeLog.gomp
@@ -1,3 +1,8 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * c-c++-common/goacc/dtype-1.c: Adjust testcase.
+ * gfortran.dg/goacc/dtype-1.f95: Likewise.
+
2017-07-19 Tom de Vries <tom@codesourcery.com>
backport from mainline:
diff --git a/gcc/testsuite/c-c++-common/goacc/dtype-1.c b/gcc/testsuite/c-c++-common/goacc/dtype-1.c
index d1337669c11..6dd6ebd8ae1 100644
--- a/gcc/testsuite/c-c++-common/goacc/dtype-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/dtype-1.c
@@ -96,11 +96,11 @@ test ()
/* { dg-final { scan-tree-dump-times "oacc_parallel device_type\\(\\*\\) \\\[ wait\\(10\\) vector_length\\(10\\) num_workers\\(10\\) num_gangs\\(10\\) async\\(10\\) \\\] device_type\\(nvidia\\) \\\[ wait\\(3\\) vector_length\\(128\\) num_workers\\(300\\) num_gangs\\(300\\) async\\(3\\) \\\] wait\\(1\\) vector_length\\(1\\) num_workers\\(1\\) num_gangs\\(1\\) async\\(1\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ async\\(-1\\) \\\]" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ wait\\(-1\\) async\\(-1\\) \\\]" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ wait\\(1\\) async\\(1\\) \\\] async\\(-1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ wait\\(1\\) async\\(1\\) \\\] wait\\(-1\\) async\\(-1\\)" 1 "omplower" } } */
-/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ wait\\(0\\) async\\(0\\) \\\] device_type\\(nvidia\\) \\\[ wait\\(2\\) async\\(2\\) \\\] async\\(-1\\)" 1 "omplower" } } */
+/* { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ wait\\(0\\) async\\(0\\) \\\] device_type\\(nvidia\\) \\\[ wait\\(2\\) async\\(2\\) \\\] wait\\(-1\\) async\\(-1\\)" 1 "omplower" } } */
/* { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\.0\\) private\\(i1\\)" 1 "omplower" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95 b/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
index f24b60fe35a..460922a35df 100644
--- a/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
@@ -175,13 +175,13 @@ end subroutine sr5b
! { dg-final { scan-tree-dump-times "oacc_parallel device_type\\(\\*\\) \\\[ async\\(10\\) wait\\(10\\) num_gangs\\(10\\) num_workers\\(10\\) vector_length\\(10\\) \\\] device_type\\(nvidia_ptx\\) \\\[ async\\(3\\) wait\\(3\\) num_gangs\\(300\\) num_workers\\(300\\) vector_length\\(128\\) \\\] async\\(1\\) wait\\(1\\) num_gangs\\(1\\) num_workers\\(1\\) vector_length\\(1\\)" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ async\\(-1\\) \\\]" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ async\\(-1\\) wait\\(-1\\) \\\]" 1 "omplower" } }
! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(nvidia\\) \\\[ async\\(1\\) wait\\(1\\) \\\]" 1 "omplower" } }
! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ async\\(0\\) wait\\(0\\) \\\] device_type\\(nvidia\\) \\\[ async\\(2\\) wait\\(2\\) \\\]" 1 "omplower" } }
-! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ async\\(0\\) wait\\(0\\) \\\] device_type\\(nvidia_ptx\\) \\\[ async\\(1\\) wait\\(1\\) \\\] async\\(-1\\)" 1 "omplower" } }
+! { dg-final { scan-tree-dump-times "oacc_kernels device_type\\(\\*\\) \\\[ async\\(0\\) wait\\(0\\) \\\] device_type\\(nvidia_ptx\\) \\\[ async\\(1\\) wait\\(1\\) \\\] async\\(-1\\) wait\\(-1\\)" 1 "omplower" } }
! { dg-final { scan-tree-dump-times "acc loop device_type\\(nvidia\\) \\\[ tile\\(1\\) gang \\\] private\\(i1\\) private\\(i1\\.1\\)" 1 "omplower" } }
diff --git a/include/ChangeLog.gomp b/include/ChangeLog.gomp
index b78fa5cc9c9..a2d792a6f4f 100644
--- a/include/ChangeLog.gomp
+++ b/include/ChangeLog.gomp
@@ -1,3 +1,9 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * gomp-constants.h (GOMP_LAUNCH_OP_MASK): Define.
+ (GOMP_LAUNCH_PACK): Add bitwise-and of GOMP_LAUNCH_OP_MASK.
+ (GOMP_LAUNCH_OP): Likewise.
+
2017-04-05 Cesar Philippidis <cesar@codesourcery.com>
* gomp-constants.h (enum gomp_map_kind): Define GOMP_MAP_DECLARE,
diff --git a/include/gomp-constants.h b/include/gomp-constants.h
index 8f17f7817b6..00a0da9f6e0 100644
--- a/include/gomp-constants.h
+++ b/include/gomp-constants.h
@@ -249,13 +249,14 @@ enum gomp_map_kind
#define GOMP_LAUNCH_CODE_SHIFT 28
#define GOMP_LAUNCH_DEVICE_SHIFT 16
#define GOMP_LAUNCH_OP_SHIFT 0
+#define GOMP_LAUNCH_OP_MASK 0xffff
#define GOMP_LAUNCH_PACK(CODE,DEVICE,OP) \
(((CODE) << GOMP_LAUNCH_CODE_SHIFT) \
| ((DEVICE) << GOMP_LAUNCH_DEVICE_SHIFT) \
- | ((OP) << GOMP_LAUNCH_OP_SHIFT))
+ | (((OP) & GOMP_LAUNCH_OP_MASK) << GOMP_LAUNCH_OP_SHIFT))
#define GOMP_LAUNCH_CODE(X) (((X) >> GOMP_LAUNCH_CODE_SHIFT) & 0xf)
#define GOMP_LAUNCH_DEVICE(X) (((X) >> GOMP_LAUNCH_DEVICE_SHIFT) & 0xfff)
-#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
+#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & GOMP_LAUNCH_OP_MASK)
#define GOMP_LAUNCH_OP_MAX 0xffff
/* Bitmask to apply in order to find out the intended device of a target
diff --git a/libgomp/ChangeLog.gomp b/libgomp/ChangeLog.gomp
index 9bbc122da6e..a5d1f031157 100644
--- a/libgomp/ChangeLog.gomp
+++ b/libgomp/ChangeLog.gomp
@@ -1,3 +1,192 @@
+2017-07-25 Chung-Lin Tang <cltang@codesourcery.com>
+
+ * libgomp-plugin.h (struct goacc_asyncqueue): Declare.
+ (struct goacc_asyncqueue_list): Likewise.
+ (goacc_aq): Likewise.
+ (goacc_aq_list): Likewise.
+ (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
+ (GOMP_OFFLOAD_openacc_async_test): Remove.
+ (GOMP_OFFLOAD_openacc_async_test_all): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_async): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_all): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
+ (GOMP_OFFLOAD_openacc_async_set_async): Remove.
+ (GOMP_OFFLOAD_openacc_exec): Adjust declaration.
+ (GOMP_OFFLOAD_openacc_cuda_get_stream): Likewise.
+ (GOMP_OFFLOAD_openacc_cuda_set_stream): Likewise.
+ (GOMP_OFFLOAD_openacc_async_exec): Declare.
+ (GOMP_OFFLOAD_openacc_async_construct): Declare.
+ (GOMP_OFFLOAD_openacc_async_destruct): Declare.
+ (GOMP_OFFLOAD_openacc_async_test): Declare.
+ (GOMP_OFFLOAD_openacc_async_synchronize): Declare.
+ (GOMP_OFFLOAD_openacc_async_serialize): Declare.
+ (GOMP_OFFLOAD_openacc_async_queue_callback): Declare.
+ (GOMP_OFFLOAD_openacc_async_host2dev): Declare.
+ (GOMP_OFFLOAD_openacc_async_dev2host): Declare.
+ * libgomp.h (struct acc_dispatch_t): Define 'async' sub-struct.
+ (gomp_acc_insert_pointer): Adjust declaration.
+ (gomp_copy_host2dev): New declaration.
+ (gomp_copy_dev2host): Likewise.
+ (gomp_map_vars_async): Likewise.
+ (gomp_unmap_tgt): Likewise.
+ (gomp_unmap_vars_async): Likewise.
+ (gomp_fini_device): Likewise.
+ * oacc-async.c (get_goacc_thread): New function.
+ (get_goacc_thread_device): New function.
+ (lookup_goacc_asyncqueue): New function.
+ (get_goacc_asyncqueue): New function.
+ (acc_async_test): Adjust code to use new async design.
+ (acc_async_test_all): Likewise.
+ (acc_wait): Likewise.
+ (acc_wait_async): Likewise.
+ * oacc-cuda.c (acc_get_cuda_stream): Adjust code to use new async
+ design.
+ (acc_set_cuda_stream): Likewise.
+ * oacc-host.c (host_openacc_exec): Adjust parameters, remove 'async'.
+ (host_openacc_register_async_cleanup): Remove.
+ (host_openacc_async_exec): New function.
+ (host_openacc_async_test): Adjust parameters.
+ (host_openacc_async_test_all): Remove.
+ (host_openacc_async_wait): Remove.
+ (host_openacc_async_wait_async): Remove.
+ (host_openacc_async_wait_all): Remove.
+ (host_openacc_async_wait_all_async): Remove.
+ (host_openacc_async_set_async): Remove.
+ (host_openacc_async_synchronize): New function.
+ (host_openacc_async_serialize): New function.
+ (host_openacc_async_host2dev): New function.
+ (host_openacc_async_dev2host): New function.
+ (host_openacc_async_queue_callback): New function.
+ (host_openacc_async_construct): New function.
+ (host_openacc_async_destruct): New function.
+ (struct gomp_device_descr host_dispatch): Remove initialization of old
+ interface, add intialization of new async sub-struct.
+ * oacc-init.c (acc_shutdown_1): Adjust to use gomp_fini_device.
+ (goacc_attach_host_thread_to_device): Remove old async code usage.
+ * oacc-int.h (goacc_init_asyncqueues): New declaration.
+ (goacc_fini_asyncqueues): Likewise.
+ (goacc_async_copyout_unmap_vars): Likewise.
+ (goacc_async_free): Likewise.
+ (get_goacc_asyncqueue): Likewise.
+ (lookup_goacc_asyncqueue): Likewise.
+ * oacc-mem.c (memcpy_tofrom_device): Adjust code to use new async
+ design.
+ (acc_is_present): Explicitly use 1/0 as return value;
+ (present_create_copy): Adjust code to use new async design.
+ (delete_copyout): Likewise.
+ (update_dev_host): Likewise.
+ (gomp_acc_insert_pointer): Add async parameter, adjust code to use new
+ async design.
+ (gomp_acc_remove_pointer): Adjust code to use new async design.
+ * oacc-parallel.c (GOACC_parallel_keyed): Adjust code to use new async
+ design, adjust profiling bits, interpret launch op as signed 16-bit
+ field.
+ (GOACC_enter_exit_data): Handle -1 as waits num, adjust code to use new
+ async design.
+ (goacc_wait): Adjust code to use new async design.
+ (GOACC_update): Likewise.
+ * oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Remove.
+ * target.c (goacc_device_copy_async): New function.
+ (gomp_copy_host2dev): Remove 'static', add goacc_asyncqueue parameter,
+ add goacc_device_copy_async case.
+ (gomp_copy_dev2host): Likewise.
+ (gomp_map_vars_existing): Add goacc_asyncqueue parameter, adjust code.
+ (gomp_map_pointer): Likewise.
+ (gomp_map_fields_existing): Likewise.
+ (gomp_map_vars): Add function for compatiblity.
+ (gomp_map_vars_async): Adapt from gomp_map_vars, add goacc_asyncqueue
+ parameter.
+ (gomp_unmap_tgt): Remove statis, add attribute_hidden.
+ (gomp_unmap_vars): Add function for compatiblity.
+ (gomp_unmap_vars_async): Adapt from gomp_unmap_vars, add
+ goacc_asyncqueue parameter.
+ (gomp_fini_device): New function.
+ (gomp_exit_data): Adjust gomp_copy_dev2host call.
+ (gomp_load_plugin_for_device): Remove old interface, adjust to load
+ new async interface.
+ (gomp_target_fini): Adjust code to call gomp_fini_device.
+ * plugin/plugin-nvptx.c (struct cuda_map): Remove.
+ (struct ptx_stream): Remove.
+ (struct nvptx_thread): Remove current_stream field.
+ (cuda_map_create): Remove.
+ (cuda_map_destroy): Remove.
+ (map_init): Remove.
+ (map_fini): Remove.
+ (map_pop): Remove.
+ (map_push): Remove.
+ (struct goacc_asyncqueue): Define.
+ (struct nvptx_callback): Define.
+ (struct ptx_free_block): Define.
+ (struct ptx_device): Remove null_stream, active_streams, async_streams,
+ stream_lock, and next fields.
+ (enum ptx_event_type): Remove.
+ (struct ptx_event): Remove.
+ (ptx_event_lock): Remove.
+ (ptx_events): Remove.
+ (init_streams_for_device): Remove.
+ (fini_streams_for_device): Remove.
+ (select_stream_for_async): Remove.
+ (nvptx_init): Remove ptx_events and ptx_event_lock references.
+ (nvptx_attach_host_thread_to_device): Remove CUDA_ERROR_NOT_PERMITTED
+ case.
+ (nvptx_open_device): Add free_blocks initialization, remove
+ init_streams_for_device call.
+ (nvptx_close_device): Remove fini_streams_for_device call, add
+ free_blocks destruct code.
+ (event_gc): Remove.
+ (event_add): Remove.
+ (nvptx_exec): Adjust parameters and code.
+ (nvptx_free): Likewise.
+ (nvptx_host2dev): Remove.
+ (nvptx_dev2host): Remove.
+ (nvptx_set_async): Remove.
+ (nvptx_async_test): Remove.
+ (nvptx_async_test_all): Remove.
+ (nvptx_wait): Remove.
+ (nvptx_wait_async): Remove.
+ (nvptx_wait_all): Remove.
+ (nvptx_wait_all_async): Remove.
+ (nvptx_get_cuda_stream): Remove.
+ (nvptx_set_cuda_stream): Remove.
+ (GOMP_OFFLOAD_alloc): Adjust code.
+ (GOMP_OFFLOAD_free): Likewise.
+ (GOMP_OFFLOAD_openacc_register_async_cleanup): Remove.
+ (GOMP_OFFLOAD_openacc_exec): Adjust parameters and code.
+ (GOMP_OFFLOAD_openacc_async_test_all): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_async): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_all): Remove.
+ (GOMP_OFFLOAD_openacc_async_wait_all_async): Remove.
+ (GOMP_OFFLOAD_openacc_async_set_async): Remove.
+ (cuda_free_argmem): New function.
+ (GOMP_OFFLOAD_openacc_async_exec): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_create_thread_data): Adjust code.
+ (GOMP_OFFLOAD_openacc_cuda_get_stream): Adjust code.
+ (GOMP_OFFLOAD_openacc_cuda_set_stream): Adjust code.
+ (GOMP_OFFLOAD_openacc_async_construct): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_async_destruct): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_async_test): Remove and re-implement.
+ (GOMP_OFFLOAD_openacc_async_synchronize): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_async_serialize): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_async_queue_callback): New plugin hook function.
+ (cuda_callback_wrapper): New function.
+ (cuda_memcpy_sanity_check): New function.
+ (GOMP_OFFLOAD_host2dev): Remove and re-implement.
+ (GOMP_OFFLOAD_dev2host): Remove and re-implement.
+ (GOMP_OFFLOAD_openacc_async_host2dev): New plugin hook function.
+ (GOMP_OFFLOAD_openacc_async_dev2host): New plugin hook function.
+ * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+ (cb_enter_data_start): Adjust testcase.
+ * testsuite/libgomp.oacc-c-c++-common/data-2-lib.c: Adjust testcase.
+ * testsuite/libgomp.oacc-c-c++-common/data-2.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/data-3.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/lib-71.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
+ * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
+
2017-07-19 Tom de Vries <tom@codesourcery.com>
backport from mainline:
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 37d9d23280e..c025069b457 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -55,6 +55,20 @@ enum offload_target_type
OFFLOAD_TARGET_TYPE_HSA = 7
};
+/* Opaque type to represent plugin-dependent implementation of an
+ OpenACC asynchronous queue. */
+struct goacc_asyncqueue;
+
+/* Used to keep a list of active asynchronous queues. */
+struct goacc_asyncqueue_list
+{
+ struct goacc_asyncqueue *aq;
+ struct goacc_asyncqueue_list *next;
+};
+
+typedef struct goacc_asyncqueue *goacc_aq;
+typedef struct goacc_asyncqueue_list *goacc_aq_list;
+
/* Auxiliary struct, used for transferring pairs of addresses from plugin
to libgomp. */
struct addr_pair
@@ -99,22 +113,31 @@ extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_can_run (void *);
extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
+
extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
- void **, int, unsigned *, void *);
-extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
-extern int GOMP_OFFLOAD_openacc_async_test (int);
-extern int GOMP_OFFLOAD_openacc_async_test_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait (int);
-extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
-extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
-extern void GOMP_OFFLOAD_openacc_async_set_async (int);
+ void **, unsigned *, void *);
+extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
+ void **, unsigned *, void *,
+ struct goacc_asyncqueue *);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
+extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
+ struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
+ void (*)(void *), void *);
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
-extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int);
-extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *);
+extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
+ void *);
#ifdef __cplusplus
}
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 50916f2726f..2f1ad4cdf02 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -870,19 +870,23 @@ typedef struct acc_dispatch_t
/* Execute. */
__typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
- /* Async cleanup callback registration. */
- __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
- *register_async_cleanup_func;
-
- /* Asynchronous routines. */
- __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
- __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
- *async_wait_all_async_func;
- __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
+ struct {
+ gomp_mutex_t lock;
+ int nasyncqueue;
+ struct goacc_asyncqueue **asyncqueue;
+ struct goacc_asyncqueue_list *active;
+
+ __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
+
+ __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
+ } async;
/* Create/destroy TLS data. */
__typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
@@ -974,17 +978,31 @@ enum gomp_map_vars_kind
GOMP_MAP_VARS_ENTER_DATA
};
-extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
+extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
unsigned short *);
-
+extern void gomp_copy_host2dev (struct gomp_device_descr *,
+ struct goacc_asyncqueue *,
+ void *, const void *, size_t);
+extern void gomp_copy_dev2host (struct gomp_device_descr *,
+ struct goacc_asyncqueue *,
+ void *, const void *, size_t);
extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
size_t, void **, void **,
size_t *, void *, bool,
enum gomp_map_vars_kind);
+extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
+ struct goacc_asyncqueue *,
+ size_t, void **, void **,
+ size_t *, void *, bool,
+ enum gomp_map_vars_kind);
+extern void gomp_unmap_tgt (struct target_mem_desc *);
extern void gomp_unmap_vars (struct target_mem_desc *, bool);
+extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
+ struct goacc_asyncqueue *);
extern void gomp_init_device (struct gomp_device_descr *);
+extern bool gomp_fini_device (struct gomp_device_descr *);
extern void gomp_unload_device (struct gomp_device_descr *);
extern bool gomp_offload_target_available_p (int);
extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index dbad95d1b4b..b0587b55ac3 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -27,10 +27,85 @@
<http://www.gnu.org/licenses/>. */
#include <assert.h>
+#include <string.h>
#include "openacc.h"
#include "libgomp.h"
#include "oacc-int.h"
+static struct goacc_thread *
+get_goacc_thread (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+ return thr;
+}
+
+static struct gomp_device_descr *
+get_goacc_thread_device (void)
+{
+ struct goacc_thread *thr = goacc_thread ();
+
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
+ return thr->dev;
+}
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+ /* The special value acc_async_noval (-1) maps to the thread-specific
+ default async stream. */
+ if (async == acc_async_noval)
+ async = thr->default_async;
+
+ if (async == acc_async_sync)
+ return NULL;
+
+ if (async < 0)
+ gomp_fatal ("bad async %d", async);
+
+ struct gomp_device_descr *dev = thr->dev;
+
+ if (!create
+ && (async >= dev->openacc.async.nasyncqueue
+ || !dev->openacc.async.asyncqueue[async]))
+ return NULL;
+
+ gomp_mutex_lock (&dev->openacc.async.lock);
+ if (async >= dev->openacc.async.nasyncqueue)
+ {
+ int diff = async + 1 - dev->openacc.async.nasyncqueue;
+ dev->openacc.async.asyncqueue
+ = gomp_realloc (dev->openacc.async.asyncqueue,
+ sizeof (goacc_aq) * (async + 1));
+ memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+ 0, sizeof (goacc_aq) * diff);
+ dev->openacc.async.nasyncqueue = async + 1;
+ }
+
+ if (!dev->openacc.async.asyncqueue[async])
+ {
+ dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+
+ /* Link new async queue into active list. */
+ goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+ n->aq = dev->openacc.async.asyncqueue[async];
+ n->next = dev->openacc.async.active;
+ dev->openacc.async.active = n;
+ }
+ gomp_mutex_unlock (&dev->openacc.async.lock);
+ return dev->openacc.async.asyncqueue[async];
+}
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+ struct goacc_thread *thr = get_goacc_thread ();
+ return lookup_goacc_asyncqueue (thr, true, async);
+}
+
int
acc_async_test (int async)
{
@@ -39,6 +114,9 @@ acc_async_test (int async)
struct goacc_thread *thr = goacc_thread ();
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
+
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_setup_p
@@ -51,10 +129,8 @@ acc_async_test (int async)
prof_info.async_queue = prof_info.async;
}
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
-
- int res = thr->dev->openacc.async_test_func (async);
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+ int res = thr->dev->openacc.async.test_func (aq);
if (profiling_setup_p)
{
@@ -69,6 +145,8 @@ int
acc_async_test_all (void)
{
struct goacc_thread *thr = goacc_thread ();
+ if (!thr || !thr->dev)
+ gomp_fatal ("no device active");
acc_prof_info prof_info;
acc_api_info api_info;
@@ -76,18 +154,22 @@ acc_async_test_all (void)
= __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
false);
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
-
- int res = thr->dev->openacc.async_test_all_func ();
+ int ret = 1;
+ gomp_mutex_lock (&thr->dev->openacc.async.lock);
+ for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+ if (!thr->dev->openacc.async.test_func (l->aq))
+ {
+ ret = 0;
+ break;
+ }
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_setup_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
-
- return res;
+ return ret;
}
void
@@ -113,7 +195,8 @@ acc_wait (int async)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- thr->dev->openacc.async_wait_func (async);
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
+ thr->dev->openacc.async.synchronize_func (aq);
if (profiling_setup_p)
{
@@ -153,7 +236,15 @@ acc_wait_async (int async1, int async2)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- thr->dev->openacc.async_wait_async_func (async1, async2);
+ goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
+ goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
+ if (!aq1)
+ gomp_fatal ("invalid async 1");
+ if (aq1 == aq2)
+ gomp_fatal ("identical parameters");
+
+ thr->dev->openacc.async.synchronize_func (aq1);
+ thr->dev->openacc.async.serialize_func (aq1, aq2);
if (profiling_setup_p)
{
@@ -176,7 +267,12 @@ acc_wait_all (void)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- thr->dev->openacc.async_wait_all_func ();
+ struct gomp_device_descr *dev = get_goacc_thread_device ();
+
+ gomp_mutex_lock (&dev->openacc.async.lock);
+ for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
+ dev->openacc.async.synchronize_func (l->aq);
+ gomp_mutex_unlock (&dev->openacc.async.lock);
if (profiling_setup_p)
{
@@ -219,7 +315,16 @@ acc_wait_all_async (int async)
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- thr->dev->openacc.async_wait_all_async_func (async);
+ goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
+
+ gomp_mutex_lock (&thr->dev->openacc.async.lock);
+ for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+ {
+ thr->dev->openacc.async.synchronize_func (l->aq);
+ if (waiting_queue)
+ thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+ }
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_setup_p)
{
@@ -251,10 +356,72 @@ acc_set_default_async (int async)
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
- struct goacc_thread *thr = goacc_thread ();
+ struct goacc_thread *thr = get_goacc_thread ();
+ thr->default_async = async;
+}
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
+static void
+goacc_async_unmap_tgt (void *ptr)
+{
+ struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
- thr->default_async = async;
+ if (tgt->refcount > 1)
+ tgt->refcount--;
+ else
+ gomp_unmap_tgt (tgt);
+}
+
+attribute_hidden void
+goacc_async_copyout_unmap_vars (struct target_mem_desc *tgt,
+ struct goacc_asyncqueue *aq)
+{
+ struct gomp_device_descr *devicep = tgt->device_descr;
+
+ /* Increment reference to delay freeing of device memory until callback
+ has triggered. */
+ tgt->refcount++;
+ gomp_unmap_vars_async (tgt, true, aq);
+ devicep->openacc.async.queue_callback_func (aq, goacc_async_unmap_tgt,
+ (void *) tgt);
+}
+
+attribute_hidden void
+goacc_async_free (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, void *ptr)
+{
+ if (!aq)
+ free (ptr);
+ else
+ devicep->openacc.async.queue_callback_func (aq, free, ptr);
+}
+
+attribute_hidden void
+goacc_init_asyncqueues (struct gomp_device_descr *devicep)
+{
+ gomp_mutex_init (&devicep->openacc.async.lock);
+ devicep->openacc.async.nasyncqueue = 0;
+ devicep->openacc.async.asyncqueue = NULL;
+ devicep->openacc.async.active = NULL;
+}
+
+attribute_hidden bool
+goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
+{
+ bool ret = true;
+ if (devicep->openacc.async.nasyncqueue > 0)
+ {
+ goacc_aq_list next;
+ for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+ {
+ ret &= devicep->openacc.async.destruct_func (l->aq);
+ next = l->next;
+ free (l);
+ }
+ free (devicep->openacc.async.asyncqueue);
+ devicep->openacc.async.nasyncqueue = 0;
+ devicep->openacc.async.asyncqueue = NULL;
+ devicep->openacc.async.active = NULL;
+ }
+ gomp_mutex_destroy (&devicep->openacc.async.lock);
+ return ret;
}
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index 325fc8dd29b..b6a89a843ec 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -99,17 +99,12 @@ acc_get_cuda_stream (int async)
prof_info.async_queue = prof_info.async;
}
- void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
- ret = thr->dev->openacc.cuda.get_stream_func (async);
-
- if (profiling_setup_p)
{
- thr->prof_info = NULL;
- thr->api_info = NULL;
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+ return aq ? thr->dev->openacc.cuda.get_stream_func (aq) : NULL;
}
-
- return ret;
+ return NULL;
}
int
@@ -138,7 +133,12 @@ acc_set_cuda_stream (int async, void *stream)
int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
- ret = thr->dev->openacc.cuda.set_stream_func (async, stream);
+ {
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ gomp_mutex_lock (&thr->dev->openacc.async.lock);
+ ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+ }
if (profiling_setup_p)
{
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 7f2f04112a5..4c25a9ac4c9 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -140,7 +140,6 @@ host_openacc_exec (void (*fn) (void *),
size_t mapnum __attribute__ ((unused)),
void **hostaddrs,
void **devaddrs __attribute__ ((unused)),
- int async __attribute__ ((unused)),
unsigned *dims __attribute__ ((unused)),
void *targ_mem_desc __attribute__ ((unused)))
{
@@ -148,47 +147,79 @@ host_openacc_exec (void (*fn) (void *),
}
static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
- int async __attribute__ ((unused)))
+host_openacc_async_exec (void (*fn) (void *),
+ size_t mapnum __attribute__ ((unused)),
+ void **hostaddrs,
+ void **devaddrs __attribute__ ((unused)),
+ unsigned *dims __attribute__ ((unused)),
+ void *targ_mem_desc __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq __attribute__ ((unused)))
{
+ fn (hostaddrs);
}
static int
-host_openacc_async_test (int async __attribute__ ((unused)))
+host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
{
return 1;
}
-static int
-host_openacc_async_test_all (void)
+static void
+host_openacc_async_synchronize (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
- return 1;
}
static void
-host_openacc_async_wait (int async __attribute__ ((unused)))
+host_openacc_async_serialize (struct goacc_asyncqueue *aq1
+ __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq2
+ __attribute__ ((unused)))
{
}
-static void
-host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
- int async2 __attribute__ ((unused)))
+static bool
+host_openacc_async_host2dev (int ord __attribute__ ((unused)),
+ void *dst __attribute__ ((unused)),
+ const void *src __attribute__ ((unused)),
+ size_t n __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
+ return true;
}
-static void
-host_openacc_async_wait_all (void)
+static bool
+host_openacc_async_dev2host (int ord __attribute__ ((unused)),
+ void *dst __attribute__ ((unused)),
+ const void *src __attribute__ ((unused)),
+ size_t n __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
+ return true;
}
static void
-host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
+host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)),
+ void (*callback_fn)(void *)
+ __attribute__ ((unused)),
+ void *userptr __attribute__ ((unused)))
{
}
-static void
-host_openacc_async_set_async (int async __attribute__ ((unused)))
+static struct goacc_asyncqueue *
+host_openacc_async_construct (void)
{
+ return NULL;
+}
+
+static bool
+host_openacc_async_destruct (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
+{
+ return true;
}
static void *
@@ -235,15 +266,17 @@ static struct gomp_device_descr host_dispatch =
.exec_func = host_openacc_exec,
- .register_async_cleanup_func = host_openacc_register_async_cleanup,
-
- .async_test_func = host_openacc_async_test,
- .async_test_all_func = host_openacc_async_test_all,
- .async_wait_func = host_openacc_async_wait,
- .async_wait_async_func = host_openacc_async_wait_async,
- .async_wait_all_func = host_openacc_async_wait_all,
- .async_wait_all_async_func = host_openacc_async_wait_all_async,
- .async_set_async_func = host_openacc_async_set_async,
+ .async = {
+ .construct_func = host_openacc_async_construct,
+ .destruct_func = host_openacc_async_destruct,
+ .test_func = host_openacc_async_test,
+ .synchronize_func = host_openacc_async_synchronize,
+ .serialize_func = host_openacc_async_serialize,
+ .queue_callback_func = host_openacc_async_queue_callback,
+ .exec_func = host_openacc_async_exec,
+ .dev2host_func = host_openacc_async_dev2host,
+ .host2dev_func = host_openacc_async_host2dev,
+ },
.create_thread_data_func = host_openacc_create_thread_data,
.destroy_thread_data_func = host_openacc_destroy_thread_data,
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index c262caa1444..e77253a64a6 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -390,7 +390,7 @@ acc_shutdown_1 (acc_device_t d)
if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
{
devices_active = true;
- ret &= acc_dev->fini_device_func (acc_dev->target_id);
+ ret &= gomp_fini_device (acc_dev);
acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
}
gomp_mutex_unlock (&acc_dev->lock);
@@ -513,8 +513,6 @@ goacc_attach_host_thread_to_device (int ord)
= acc_dev->openacc.create_thread_data_func (ord);
thr->default_async = acc_async_default;
-
- acc_dev->openacc.async_set_async_func (acc_async_sync);
}
/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 7f8351684a9..7a7fe3b1196 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -109,6 +109,15 @@ void goacc_restore_bind (void);
void goacc_lazy_initialize (void);
void goacc_host_init (void);
+void goacc_init_asyncqueues (struct gomp_device_descr *);
+bool goacc_fini_asyncqueues (struct gomp_device_descr *);
+void goacc_async_copyout_unmap_vars (struct target_mem_desc *,
+ struct goacc_asyncqueue *);
+void goacc_async_free (struct gomp_device_descr *,
+ struct goacc_asyncqueue *, void *);
+struct goacc_asyncqueue *get_goacc_asyncqueue (int);
+struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool, int);
+
void goacc_profiling_initialize (void);
bool goacc_profiling_setup_p (struct goacc_thread *,
acc_prof_info *, acc_api_info *);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 59a6f935c85..e155deffc56 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -224,18 +224,11 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
goto out;
}
- if (async > acc_async_sync)
- thr->dev->openacc.async_set_async_func (async);
-
- bool ret = (from
- ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
- : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
-
- if (async > acc_async_sync)
- thr->dev->openacc.async_set_async_func (acc_async_sync);
-
- if (!ret)
- gomp_fatal ("error in %s", libfnname);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ if (from)
+ gomp_copy_dev2host (thr->dev, aq, h, d, s);
+ else
+ gomp_copy_host2dev (thr->dev, aq, d, h, s);
out:
if (profiling_setup_p)
@@ -381,7 +374,7 @@ acc_is_present (void *h, size_t s)
gomp_mutex_unlock (&acc_dev->lock);
- return n != NULL;
+ return (n ? 1 : 0);
}
/* Create a mapping for host [H,+S] -> device [D,+S] */
@@ -613,17 +606,13 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
+ goacc_aq aq = get_goacc_asyncqueue (async);
- tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
- GOMP_MAP_VARS_OPENACC);
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
+ &kinds, true, GOMP_MAP_VARS_OPENACC);
/* Initialize dynamic refcount. */
tgt->list[0].key->dynamic_refcount = 1;
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
gomp_mutex_lock (&acc_dev->lock);
d = tgt->to_free;
@@ -798,11 +787,8 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (f & FLAG_COPYOUT)
{
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
- acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ gomp_copy_dev2host (acc_dev, aq, h, d, s);
}
gomp_remove_var (acc_dev, n);
}
@@ -904,19 +890,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
d = (void *) (n->tgt->tgt_start + n->tgt_offset
+ (uintptr_t) h - n->host_start);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
+ goacc_aq aq = get_goacc_asyncqueue (async);
if (is_dev)
- acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+ gomp_copy_host2dev (acc_dev, aq, d, h, s);
else
- acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
-
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
+ gomp_copy_dev2host (acc_dev, aq, h, d, s);
gomp_mutex_unlock (&acc_dev->lock);
-
+
if (profiling_setup_p)
{
thr->prof_info = NULL;
@@ -978,7 +960,7 @@ gomp_acc_declare_allocate (bool allocate, size_t mapnum, void **hostaddrs,
void
gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
- void *kinds)
+ void *kinds, int async)
{
struct target_mem_desc *tgt;
struct goacc_thread *thr = goacc_thread ();
@@ -1008,8 +990,9 @@ gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
}
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
- tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
- NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
+ NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
/* Initialize dynamic refcount. */
@@ -1098,11 +1081,15 @@ gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
t->list[i].copy_from = force_copyfrom ? 1 : 0;
break;
}
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
- gomp_unmap_vars (t, true);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+ /* If running synchronously, unmap immediately. */
+ if (async < acc_async_noval)
+ gomp_unmap_vars (t, true);
+ else
+ {
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ goacc_async_copyout_unmap_vars (t, aq);
+ }
}
gomp_mutex_unlock (&acc_dev->lock);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 622c71135ed..9de855b8c48 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -215,7 +215,9 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
fn (hostaddrs);
goto out;
}
-
+ else if (profiling_dispatch_p)
+ api_info.device_api = acc_device_api_cuda;
+
/* Default: let the runtime choose. */
for (i = 0; i != GOMP_DIM_MAX; i++)
dims[i] = 0;
@@ -260,10 +262,14 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
case GOMP_LAUNCH_WAIT:
{
- unsigned num_waits = GOMP_LAUNCH_OP (tag);
+ /* Be careful to cast the op field as a signed 16-bit, and
+ sign-extend to full integer. */
+ int num_waits = ((signed short) GOMP_LAUNCH_OP (tag));
- if (num_waits)
+ if (num_waits > 0)
goacc_wait (async, num_waits, &ap);
+ else if (num_waits == acc_async_noval)
+ acc_wait_all_async (async);
break;
}
@@ -274,8 +280,6 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
}
va_end (ap);
- acc_dev->openacc.async_set_async_func (async);
-
if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
{
k.host_start = (uintptr_t) fn;
@@ -307,8 +311,11 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
- tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
- GOMP_MAP_VARS_OPENACC);
+
+ goacc_aq aq = get_goacc_asyncqueue (async);
+
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+ true, GOMP_MAP_VARS_OPENACC);
if (profiling_dispatch_p)
{
prof_info.event_type = acc_ev_enter_data_end;
@@ -329,14 +336,10 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
devaddrs[i] = NULL;
}
- acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
- async, dims, tgt);
-
- /* If running synchronously, unmap immediately. */
- bool copyfrom = true;
- if (async < acc_async_noval)
+ if (aq == NULL)
{
- unmap:
+ acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+ dims, tgt);
if (profiling_dispatch_p)
{
prof_info.event_type = acc_ev_exit_data_start;
@@ -346,7 +349,8 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
&api_info);
}
- gomp_unmap_vars (tgt, copyfrom);
+ /* If running synchronously, unmap immediately. */
+ gomp_unmap_vars (tgt, true);
if (profiling_dispatch_p)
{
prof_info.event_type = acc_ev_exit_data_end;
@@ -358,27 +362,11 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
}
else
{
- bool async_unmap = false;
- for (size_t i = 0; i < tgt->list_count; i++)
- {
- splay_tree_key k = tgt->list[i].key;
- if (k && k->refcount == 1)
- {
- async_unmap = true;
- break;
- }
- }
- if (async_unmap)
- tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
- else
- {
- copyfrom = false;
- goto unmap;
- }
+ acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+ dims, tgt, aq);
+ goacc_async_copyout_unmap_vars (tgt, aq);
}
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
out:
if (profiling_dispatch_p)
{
@@ -724,7 +712,7 @@ GOACC_enter_exit_data (int device, size_t mapnum,
goto out;
}
- if (num_waits)
+ if (num_waits > 0)
{
va_list ap;
@@ -732,8 +720,8 @@ GOACC_enter_exit_data (int device, size_t mapnum,
goacc_wait (async, num_waits, &ap);
va_end (ap);
}
-
- acc_dev->openacc.async_set_async_func (async);
+ else if (num_waits == acc_async_noval)
+ acc_wait_all_async (async);
/* In c, non-pointers and arrays are represented by a single data clause.
Dynamically allocated arrays and subarrays are represented by a data
@@ -783,7 +771,7 @@ GOACC_enter_exit_data (int device, size_t mapnum,
&sizes[i], &kinds[i]);
else
gomp_acc_insert_pointer (pointer, &hostaddrs[i],
- &sizes[i], &kinds[i]);
+ &sizes[i], &kinds[i], async);
/* Increment 'i' by two because OpenACC requires fortran
arrays to be contiguous, so each PSET is associated with
one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -808,18 +796,18 @@ GOACC_enter_exit_data (int device, size_t mapnum,
if (acc_is_present (hostaddrs[i], sizes[i]))
{
if (finalize)
- acc_delete_finalize (hostaddrs[i], sizes[i]);
+ acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
else
- acc_delete (hostaddrs[i], sizes[i]);
+ acc_delete_async (hostaddrs[i], sizes[i], async);
}
break;
case GOMP_MAP_DECLARE_DEALLOCATE:
case GOMP_MAP_FROM:
case GOMP_MAP_FORCE_FROM:
if (finalize)
- acc_copyout_finalize (hostaddrs[i], sizes[i]);
+ acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
else
- acc_copyout (hostaddrs[i], sizes[i]);
+ acc_copyout_async (hostaddrs[i], sizes[i], async);
break;
default:
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -844,8 +832,6 @@ GOACC_enter_exit_data (int device, size_t mapnum,
}
}
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
out:
if (profiling_dispatch_p)
{
@@ -868,18 +854,22 @@ goacc_wait (int async, int num_waits, va_list *ap)
while (num_waits--)
{
int qid = va_arg (*ap, int);
-
- if (acc_async_test (qid))
+ goacc_aq aq = get_goacc_asyncqueue (qid);
+ if (acc_dev->openacc.async.test_func (aq))
continue;
-
if (async == acc_async_sync)
- acc_wait (qid);
+ acc_dev->openacc.async.synchronize_func (aq);
else if (qid == async)
- ;/* If we're waiting on the same asynchronous queue as we're
+ /* If we're waiting on the same asynchronous queue as we're
launching on, the queue itself will order work as
required, so there's no need to wait explicitly. */
+ ;
else
- acc_dev->openacc.async_wait_async_func (qid, async);
+ {
+ goacc_aq aq2 = get_goacc_asyncqueue (async);
+ acc_dev->openacc.async.synchronize_func (aq);
+ acc_dev->openacc.async.serialize_func (aq, aq2);
+ }
}
}
@@ -957,7 +947,7 @@ GOACC_update (int device, size_t mapnum,
goto out;
}
- if (num_waits)
+ if (num_waits > 0)
{
va_list ap;
@@ -965,8 +955,8 @@ GOACC_update (int device, size_t mapnum,
goacc_wait (async, num_waits, &ap);
va_end (ap);
}
-
- acc_dev->openacc.async_set_async_func (async);
+ else if (num_waits == acc_async_noval)
+ acc_wait_all_async (async);
bool update_device = false;
for (i = 0; i < mapnum; ++i)
@@ -1007,7 +997,7 @@ GOACC_update (int device, size_t mapnum,
/* Fallthru */
case GOMP_MAP_FORCE_TO:
update_device = true;
- acc_update_device (hostaddrs[i], sizes[i]);
+ acc_update_device_async (hostaddrs[i], sizes[i], async);
break;
case GOMP_MAP_FROM:
@@ -1019,7 +1009,7 @@ GOACC_update (int device, size_t mapnum,
/* Fallthru */
case GOMP_MAP_FORCE_FROM:
update_device = false;
- acc_update_self (hostaddrs[i], sizes[i]);
+ acc_update_self_async (hostaddrs[i], sizes[i], async);
break;
default:
@@ -1028,8 +1018,6 @@ GOACC_update (int device, size_t mapnum,
}
}
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
out:
if (profiling_dispatch_p)
{
@@ -1075,7 +1063,7 @@ GOACC_wait (int async, int num_waits, ...)
else if (async == acc_async_sync)
acc_wait_all ();
else if (async == acc_async_noval)
- thr->dev->openacc.async_wait_all_async_func (acc_async_noval);
+ acc_wait_all_async (async);
if (profiling_setup_p)
{
diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c
index 9707b4827ec..055d48e6f06 100644
--- a/libgomp/oacc-plugin.c
+++ b/libgomp/oacc-plugin.c
@@ -30,17 +30,6 @@
#include "oacc-plugin.h"
#include "oacc-int.h"
-void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
-{
- struct target_mem_desc *tgt = ptr;
- struct gomp_device_descr *devicep = tgt->device_descr;
-
- devicep->openacc.async_set_async_func (async);
- gomp_unmap_vars (tgt, true);
- devicep->openacc.async_set_async_func (acc_async_sync);
-}
-
/* Return the target-specific part of the TLS data for the current thread. */
void *
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 2bfc617e83f..9e8289b333c 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -96,200 +96,31 @@ cuda_error (CUresult r)
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
-struct cuda_map
+/* NVPTX/CUDA specific definition of asynchronous queues. */
+struct goacc_asyncqueue
{
- CUdeviceptr d;
- size_t size;
- bool active;
- struct cuda_map *next;
+ CUstream cuda_stream;
+ pthread_mutex_t lock;
};
-struct ptx_stream
+struct nvptx_callback
{
- CUstream stream;
- pthread_t host_thread;
- bool multithreaded;
- struct cuda_map *map;
- struct ptx_stream *next;
+ void (*fn) (void *);
+ void *ptr;
+ struct goacc_asyncqueue *aq;
+ struct nvptx_callback *next;
};
/* Thread-specific data for PTX. */
struct nvptx_thread
{
- struct ptx_stream *current_stream;
+ /* We currently have this embedded inside the plugin because libgomp manages
+ devices through integer target_ids. This might be better if using an
+ opaque target-specific pointer directly from gomp_device_descr. */
struct ptx_device *ptx_dev;
};
-static struct cuda_map *
-cuda_map_create (struct goacc_thread *thr, size_t size)
-{
- struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
-
- assert (map);
-
- map->next = NULL;
- map->size = size;
- map->active = false;
-
- CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
- assert (map->d);
-
- bool profiling_dispatch_p
- = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
-
- prof_info->event_type = acc_ev_alloc;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- /* Always implicit for "data mapping arguments for cuLaunchKernel". */
- data_event_info.data_event.implicit = 1;
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = size;
- data_event_info.data_event.host_ptr = NULL;
- data_event_info.data_event.device_ptr = (void *) map->d;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- return map;
-}
-
-static void
-cuda_map_destroy (struct goacc_thread *thr, struct cuda_map *map)
-{
- CUDA_CALL_ASSERT (cuMemFree, map->d);
-
- bool profiling_dispatch_p
- = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
-
- prof_info->event_type = acc_ev_free;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- /* Always implicit for "data mapping arguments for cuLaunchKernel". */
- data_event_info.data_event.implicit = 1;
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = map->size;
- data_event_info.data_event.host_ptr = NULL;
- data_event_info.data_event.device_ptr = (void *) map->d;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- free (map);
-}
-
-/* The following map_* routines manage the CUDA device memory that
- contains the data mapping arguments for cuLaunchKernel. Each
- asynchronous PTX stream may have multiple pending kernel
- invocations, which are launched in a FIFO order. As such, the map
- routines maintains a queue of cuLaunchKernel arguments.
-
- Calls to map_push and map_pop must be guarded by ptx_event_lock.
- Likewise, calls to map_init and map_fini are guarded by
- ptx_dev_lock inside GOMP_OFFLOAD_init_device and
- GOMP_OFFLOAD_fini_device, respectively. */
-
-static bool
-map_init (struct goacc_thread *thr, struct ptx_stream *s)
-{
- int size = getpagesize ();
-
- assert (s);
-
- s->map = cuda_map_create (thr, size);
-
- return true;
-}
-
-static bool
-map_fini (struct goacc_thread *thr, struct ptx_stream *s)
-{
- assert (s->map->next == NULL);
- assert (!s->map->active);
-
- cuda_map_destroy (thr, s->map);
-
- return true;
-}
-
-static void
-map_pop (struct goacc_thread *thr, struct ptx_stream *s)
-{
- struct cuda_map *next;
-
- assert (s != NULL);
-
- if (s->map->next == NULL)
- {
- s->map->active = false;
- return;
- }
-
- next = s->map->next;
- cuda_map_destroy (thr, s->map);
- s->map = next;
-}
-
-static CUdeviceptr
-map_push (struct goacc_thread *thr, struct ptx_stream *s, size_t size)
-{
- struct cuda_map *map = NULL, *t = NULL;
-
- assert (s);
- assert (s->map);
-
- /* Each PTX stream requires a separate data region to store the
- launch arguments for cuLaunchKernel. Allocate a new
- cuda_map and push it to the end of the list. */
- if (s->map->active)
- {
- map = cuda_map_create (thr, size);
-
- for (t = s->map; t->next != NULL; t = t->next)
- ;
-
- t->next = map;
- }
- else if (s->map->size < size)
- {
- cuda_map_destroy (thr, s->map);
- map = cuda_map_create (thr, size);
- }
- else
- map = s->map;
-
- s->map = map;
- s->map->active = true;
-
- return s->map->d;
-}
-
/* Target data function launch information. */
struct targ_fn_launch
@@ -342,22 +173,18 @@ struct ptx_image_data
struct ptx_image_data *next;
};
+struct ptx_free_block
+{
+ void *ptr;
+ struct ptx_free_block *next;
+};
+
struct ptx_device
{
CUcontext ctx;
bool ctx_shared;
CUdevice dev;
- struct ptx_stream *null_stream;
- /* All non-null streams associated with this device (actually context),
- either created implicitly or passed in from the user (via
- acc_set_cuda_stream). */
- struct ptx_stream *active_streams;
- struct {
- struct ptx_stream **arr;
- int size;
- } async_streams;
- /* A lock for use when manipulating the above stream list and array. */
- pthread_mutex_t stream_lock;
+
int ord;
bool overlap;
bool map;
@@ -381,32 +208,13 @@ struct ptx_device
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
-
- struct ptx_device *next;
-};
-
-enum ptx_event_type
-{
- PTX_EVT_MEM,
- PTX_EVT_KNL,
- PTX_EVT_SYNC,
- PTX_EVT_ASYNC_CLEANUP
-};
-struct ptx_event
-{
- CUevent *evt;
- int type;
- void *addr;
- int ord;
- int val;
+ struct ptx_free_block *free_blocks;
+ pthread_mutex_t free_blocks_lock;
- struct ptx_event *next;
+ struct ptx_device *next;
};
-static pthread_mutex_t ptx_event_lock;
-static struct ptx_event *ptx_events;
-
static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
@@ -415,190 +223,6 @@ nvptx_thread (void)
return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
-static bool
-init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
-{
- int i;
- struct ptx_stream *null_stream
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
- null_stream->stream = NULL;
- null_stream->host_thread = pthread_self ();
- null_stream->multithreaded = true;
- if (!map_init (NULL, null_stream))
- return false;
-
- ptx_dev->null_stream = null_stream;
- ptx_dev->active_streams = NULL;
- pthread_mutex_init (&ptx_dev->stream_lock, NULL);
-
- if (concurrency < 1)
- concurrency = 1;
-
- /* This is just a guess -- make space for as many async streams as the
- current device is capable of concurrently executing. This can grow
- later as necessary. No streams are created yet. */
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
- ptx_dev->async_streams.size = concurrency;
-
- for (i = 0; i < concurrency; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-
- return true;
-}
-
-static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
-{
- free (ptx_dev->async_streams.arr);
-
- bool ret = true;
- while (ptx_dev->active_streams != NULL)
- {
- struct ptx_stream *s = ptx_dev->active_streams;
- ptx_dev->active_streams = ptx_dev->active_streams->next;
-
- ret &= map_fini (NULL, s);
-
- CUresult r = cuStreamDestroy (s->stream);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
- ret = false;
- }
- free (s);
- }
-
- ret &= map_fini (NULL, ptx_dev->null_stream);
- free (ptx_dev->null_stream);
- return ret;
-}
-
-/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
- thread THREAD (and also current device/context). If CREATE is true, create
- the stream if it does not exist (or use EXISTING if it is non-NULL), and
- associate the stream with the same thread argument. Returns stream to use
- as result. */
-
-static struct ptx_stream *
-select_stream_for_async (int async, pthread_t thread, bool create,
- CUstream existing)
-{
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
- /* Local copy of TLS variable. */
- struct ptx_device *ptx_dev = nvthd->ptx_dev;
- struct ptx_stream *stream = NULL;
- int orig_async = async;
-
- /* The special value acc_async_noval (-1) maps to the thread-specific
- default async stream. */
- if (async == acc_async_noval)
- async = GOMP_PLUGIN_acc_thread_default_async ();
-
- if (create)
- pthread_mutex_lock (&ptx_dev->stream_lock);
-
- /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
- null stream, and in fact better performance may be obtainable if it doesn't
- (because the null stream enforces overly-strict synchronisation with
- respect to other streams for legacy reasons, and that's probably not
- needed with OpenACC). Maybe investigate later. */
- if (async == acc_async_sync)
- stream = ptx_dev->null_stream;
- else if (async >= 0 && async < ptx_dev->async_streams.size
- && ptx_dev->async_streams.arr[async] && !(create && existing))
- stream = ptx_dev->async_streams.arr[async];
- else if (async >= 0 && create)
- {
- if (async >= ptx_dev->async_streams.size)
- {
- int i, newsize = ptx_dev->async_streams.size * 2;
-
- if (async >= newsize)
- newsize = async + 1;
-
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
- newsize * sizeof (struct ptx_stream *));
-
- for (i = ptx_dev->async_streams.size; i < newsize; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-
- ptx_dev->async_streams.size = newsize;
- }
-
- /* Create a new stream on-demand if there isn't one already, or if we're
- setting a particular async value to an existing (externally-provided)
- stream. */
- if (!ptx_dev->async_streams.arr[async] || existing)
- {
- CUresult r;
- struct ptx_stream *s
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
- if (existing)
- s->stream = existing;
- else
- {
- r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
- if (r != CUDA_SUCCESS)
- {
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
- cuda_error (r));
- }
- }
-
- /* If CREATE is true, we're going to be queueing some work on this
- stream. Associate it with the current host thread. */
- s->host_thread = thread;
- s->multithreaded = false;
-
- if (!map_init (thr, s))
- {
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("map_init fail");
- }
-
- s->next = ptx_dev->active_streams;
- ptx_dev->active_streams = s;
- ptx_dev->async_streams.arr[async] = s;
- }
-
- stream = ptx_dev->async_streams.arr[async];
- }
- else if (async < 0)
- {
- if (create)
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("bad async %d", async);
- }
-
- if (create)
- {
- assert (stream != NULL);
-
- /* If we're trying to use the same stream from different threads
- simultaneously, set stream->multithreaded to true. This affects the
- behaviour of acc_async_test_all and acc_wait_all, which are supposed to
- only wait for asynchronous launches from the same host thread they are
- invoked on. If multiple threads use the same async value, we make note
- of that here and fall back to testing/waiting for all threads in those
- functions. */
- if (thread != stream->host_thread)
- stream->multithreaded = true;
-
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- }
- else if (stream && !stream->multithreaded
- && !pthread_equal (stream->host_thread, thread))
- GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
-
- return stream;
-}
-
/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
should be locked on entry and remains locked on exit. */
@@ -611,9 +235,6 @@ nvptx_init (void)
return true;
CUDA_CALL (cuInit, 0);
- ptx_events = NULL;
- pthread_mutex_init (&ptx_event_lock, NULL);
-
CUDA_CALL (cuDeviceGetCount, &ndevs);
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
* ndevs);
@@ -632,6 +253,11 @@ nvptx_attach_host_thread_to_device (int n)
CUcontext thd_ctx;
r = cuCtxGetDevice (&dev);
+ if (r == CUDA_ERROR_NOT_PERMITTED)
+ {
+ /* Assume we're in a CUDA callback, just return true. */
+ return true;
+ }
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -759,6 +385,9 @@ nvptx_open_device (int n)
ptx_dev->images = NULL;
pthread_mutex_init (&ptx_dev->image_lock, NULL);
+ ptx_dev->free_blocks = NULL;
+ pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
+
GOMP_PLUGIN_debug (0, "Nvidia device %d:\n\tGPU_OVERLAP = %d\n"
"\tCAN_MAP_HOST_MEMORY = %d\n\tCONCURRENT_KERNELS = %d\n"
"\tCOMPUTE_MODE = %d\n\tINTEGRATED = %d\n"
@@ -775,9 +404,6 @@ nvptx_open_device (int n)
ptx_dev->max_registers_per_multiprocessor,
ptx_dev->max_shared_memory_per_multiprocessor);
- if (!init_streams_for_device (ptx_dev, async_engines))
- return NULL;
-
return ptx_dev;
}
@@ -787,9 +413,15 @@ nvptx_close_device (struct ptx_device *ptx_dev)
if (!ptx_dev)
return true;
- if (!fini_streams_for_device (ptx_dev))
- return false;
-
+ for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
+ {
+ struct ptx_free_block *b_next = b->next;
+ CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
+ free (b);
+ b = b_next;
+ }
+
+ pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
pthread_mutex_destroy (&ptx_dev->image_lock);
if (!ptx_dev->ctx_shared)
@@ -913,134 +545,14 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
}
static void
-event_gc (bool memmap_lockable)
-{
- struct ptx_event *ptx_event = ptx_events;
- struct ptx_event *async_cleanups = NULL;
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
-
- pthread_mutex_lock (&ptx_event_lock);
-
- while (ptx_event != NULL)
- {
- CUresult r;
- struct ptx_event *e = ptx_event;
-
- ptx_event = ptx_event->next;
-
- if (e->ord != nvthd->ptx_dev->ord)
- continue;
-
- r = cuEventQuery (*e->evt);
- if (r == CUDA_SUCCESS)
- {
- bool append_async = false;
- CUevent *te;
-
- te = e->evt;
-
- switch (e->type)
- {
- case PTX_EVT_MEM:
- case PTX_EVT_SYNC:
- break;
-
- case PTX_EVT_KNL:
- map_pop (thr, e->addr);
- break;
-
- case PTX_EVT_ASYNC_CLEANUP:
- {
- /* The function gomp_plugin_async_unmap_vars needs to claim the
- memory-map splay tree lock for the current device, so we
- can't call it when one of our callers has already claimed
- the lock. In that case, just delay the GC for this event
- until later. */
- if (!memmap_lockable)
- continue;
-
- append_async = true;
- }
- break;
- }
-
- cuEventDestroy (*te);
- free ((void *)te);
-
- /* Unlink 'e' from ptx_events list. */
- if (ptx_events == e)
- ptx_events = ptx_events->next;
- else
- {
- struct ptx_event *e_ = ptx_events;
- while (e_->next != e)
- e_ = e_->next;
- e_->next = e_->next->next;
- }
-
- if (append_async)
- {
- e->next = async_cleanups;
- async_cleanups = e;
- }
- else
- free (e);
- }
- }
-
- pthread_mutex_unlock (&ptx_event_lock);
-
- /* We have to do these here, after ptx_event_lock is released. */
- while (async_cleanups)
- {
- struct ptx_event *e = async_cleanups;
- async_cleanups = async_cleanups->next;
-
- GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
- free (e);
- }
-}
-
-static void
-event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
-{
- struct ptx_event *ptx_event;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
- || type == PTX_EVT_ASYNC_CLEANUP);
-
- ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
- ptx_event->type = type;
- ptx_event->evt = e;
- ptx_event->addr = h;
- ptx_event->ord = nvthd->ptx_dev->ord;
- ptx_event->val = val;
-
- pthread_mutex_lock (&ptx_event_lock);
-
- ptx_event->next = ptx_events;
- ptx_events = ptx_event;
-
- pthread_mutex_unlock (&ptx_event_lock);
-}
-
-static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- int async, unsigned *dims, void *targ_mem_desc)
+ unsigned *dims, void *targ_mem_desc,
+ CUdeviceptr dp, CUstream stream)
{
struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
CUfunction function;
- CUresult r;
int i;
- struct ptx_stream *dev_str;
void *kargs[1];
- void *hp;
- CUdeviceptr dp;
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
- const char *maybe_abort_msg = "(perhaps abort was called)";
int cpu_size = nvptx_thread ()->ptx_dev->max_threads_per_multiprocessor;
int block_size = nvptx_thread ()->ptx_dev->max_threads_per_block;
int dev_size = nvptx_thread ()->ptx_dev->multiprocessor_count;
@@ -1052,9 +564,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
function = targ_fn->fn;
- dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
- assert (dev_str == nvthd->current_stream);
-
/* Initialize the launch dimensions. Typically this is constant,
provided by the device compiler, but we must permit runtime
values. */
@@ -1185,61 +694,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
threads_per_block);
}
- /* This reserves a chunk of a pre-allocated page of memory mapped on both
- the host and the device. HP is a host pointer to the new chunk, and DP is
- the corresponding device pointer. */
- pthread_mutex_lock (&ptx_event_lock);
- dp = map_push (thr, dev_str, mapnum * sizeof (void *));
- pthread_mutex_unlock (&ptx_event_lock);
-
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-
- /* Copy the array of arguments to the mapped page. */
- hp = alloca(sizeof(void *) * mapnum);
- for (i = 0; i < mapnum; i++)
- ((void **) hp)[i] = devaddrs[i] != 0 ? devaddrs[i] : hostaddrs[i];
-
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
- fact have the same value on a unified-memory system). */
-
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_start;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- /* Always implicit for "data mapping arguments for cuLaunchKernel". */
- data_event_info.data_event.implicit = 1;
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = mapnum * sizeof (void *);
- data_event_info.data_event.host_ptr = hp;
- data_event_info.data_event.device_ptr = (void *) dp;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
- mapnum * sizeof (void *));
-
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_end;
- data_event_info.data_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" gangs=%u, workers=%u, vectors=%u\n",
__FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1251,7 +705,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
// num_workers ntid.y
// vector length ntid.x
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
acc_event_info enqueue_launch_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
if (profiling_dispatch_p)
{
prof_info->event_type = acc_ev_enqueue_launch_start;
@@ -1279,11 +737,13 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
api_info);
}
+
kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
- 0, dev_str->stream, kargs, 0);
+ 0, stream, kargs, 0);
+
if (profiling_dispatch_p)
{
prof_info->event_type = acc_ev_enqueue_launch_end;
@@ -1293,91 +753,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
api_info);
}
- acc_event_info wait_event_info;
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_wait_start;
-
- wait_event_info.other_event.event_type = prof_info->event_type;
- wait_event_info.other_event.valid_bytes
- = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
- wait_event_info.other_event.parent_construct
- /* TODO = compute_construct_event_info.other_event.parent_construct */
- = acc_construct_parallel; //TODO: kernels...
- wait_event_info.other_event.implicit = 1;
- wait_event_info.other_event.tool_info = NULL;
-
- api_info->device_api = acc_device_api_cuda;
- }
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
- {
- if (profiling_dispatch_p)
- {
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
- r = cuStreamSynchronize (dev_str->stream);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_wait_end;
- wait_event_info.other_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
- }
- else
- {
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
- event_gc (true);
-
- CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
-
- event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
- }
-#else
- if (profiling_dispatch_p)
- {
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
- r = cuCtxSynchronize ();
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_wait_end;
- wait_event_info.other_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-#endif
-
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
targ_fn->launch->fn);
-
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
-#endif
- map_pop (thr, dev_str);
}
void * openacc_get_current_cuda_context (void);
@@ -1420,491 +797,35 @@ nvptx_alloc (size_t s)
}
static bool
-nvptx_free (void *p)
+nvptx_free (void *p, struct ptx_device *ptx_dev)
{
- CUdeviceptr pb;
- size_t ps;
-
- CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
- if ((CUdeviceptr) p != pb)
+ /* Assume callback context if this is null. */
+ if (GOMP_PLUGIN_goacc_thread () == NULL)
{
- GOMP_PLUGIN_error ("invalid device address");
- return false;
+ struct ptx_free_block *n
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+ n->ptr = p;
+ pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+ n->next = ptx_dev->free_blocks;
+ ptx_dev->free_blocks = n;
+ pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+ return true;
}
- CUDA_CALL (cuMemFree, (CUdeviceptr) p);
-
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- acc_prof_info *prof_info = thr->prof_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_free;
-
- acc_event_info data_event_info;
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- data_event_info.data_event.implicit = 1; //TODO
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = ps;
- data_event_info.data_event.host_ptr = NULL;
- data_event_info.data_event.device_ptr = p;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- return true;
-}
-
-
-static bool
-nvptx_host2dev (void *d, const void *h, size_t s)
-{
- CUdeviceptr pb;
- size_t ps;
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
-
- if (!s)
- return true;
- if (!d)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
-
- CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
- if (!pb)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
- if (!h)
- {
- GOMP_PLUGIN_error ("invalid host address");
- return false;
- }
- if (d == h)
- {
- GOMP_PLUGIN_error ("invalid host or device address");
- return false;
- }
- if ((void *)(d + s) > (void *)(pb + ps))
- {
- GOMP_PLUGIN_error ("invalid size");
- return false;
- }
-
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_start;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- data_event_info.data_event.implicit = 1; //TODO
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = s;
- data_event_info.data_event.host_ptr = /* TODO */ (void *) h;
- data_event_info.data_event.device_ptr = d;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
-#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- event_gc (false);
- CUDA_CALL (cuMemcpyHtoDAsync,
- (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
- CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_MEM, e, (void *)h, 0);
- }
- else
-#endif
- CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
-
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_upload_end;
- data_event_info.data_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
- return true;
-}
-
-static bool
-nvptx_dev2host (void *h, const void *d, size_t s)
-{
CUdeviceptr pb;
size_t ps;
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
-
- if (!s)
- return true;
- if (!d)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
- CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
- if (!pb)
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
+ if ((CUdeviceptr) p != pb)
{
GOMP_PLUGIN_error ("invalid device address");
return false;
}
- if (!h)
- {
- GOMP_PLUGIN_error ("invalid host address");
- return false;
- }
- if (d == h)
- {
- GOMP_PLUGIN_error ("invalid host or device address");
- return false;
- }
- if ((void *)(d + s) > (void *)(pb + ps))
- {
- GOMP_PLUGIN_error ("invalid size");
- return false;
- }
-
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info data_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_download_start;
-
- data_event_info.data_event.event_type = prof_info->event_type;
- data_event_info.data_event.valid_bytes
- = _ACC_DATA_EVENT_INFO_VALID_BYTES;
- data_event_info.data_event.parent_construct
- = acc_construct_parallel; //TODO
- data_event_info.data_event.implicit = 1; //TODO
- data_event_info.data_event.tool_info = NULL;
- data_event_info.data_event.var_name = NULL; //TODO
- data_event_info.data_event.bytes = s;
- data_event_info.data_event.host_ptr = h;
- data_event_info.data_event.device_ptr = /* TODO */ (void *) d;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
-
-#ifndef DISABLE_ASYNC
- if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- event_gc (false);
- CUDA_CALL (cuMemcpyDtoHAsync,
- h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
- CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_MEM, e, (void *)h, 0);
- }
- else
-#endif
- CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
-
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_enqueue_download_end;
- data_event_info.data_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
- api_info);
- }
+ CUDA_CALL (cuMemFree, (CUdeviceptr) p);
return true;
}
-static void
-nvptx_set_async (int async)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- nvthd->current_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-}
-
-static int
-nvptx_async_test (int async)
-{
- CUresult r;
- struct ptx_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
-
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- {
- /* The oacc-parallel.c:goacc_wait function calls this hook to determine
- whether all work has completed on this stream, and if so omits the call
- to the wait hook. If that happens, event_gc might not get called
- (which prevents variables from getting unmapped and their associated
- device storage freed), so call it here. */
- event_gc (true);
- return 1;
- }
- else if (r == CUDA_ERROR_NOT_READY)
- return 0;
-
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
- return 0;
-}
-
-static int
-nvptx_async_test_all (void)
-{
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if ((s->multithreaded || pthread_equal (s->host_thread, self))
- && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
- {
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- return 0;
- }
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-
- return 1;
-}
-
-static void
-nvptx_wait (int async)
-{
- struct ptx_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- if (!s)
- GOMP_PLUGIN_fatal ("unknown async %d", async);
-
- GOMP_PLUGIN_debug (0, " %s: waiting on async=%d\n", __FUNCTION__, async);
-
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- bool profiling_dispatch_p
- = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
- acc_event_info wait_event_info;
- if (profiling_dispatch_p)
- {
- acc_prof_info *prof_info = thr->prof_info;
- acc_api_info *api_info = thr->api_info;
-
- prof_info->event_type = acc_ev_wait_start;
-
- wait_event_info.other_event.event_type = prof_info->event_type;
- wait_event_info.other_event.valid_bytes
- = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
- wait_event_info.other_event.parent_construct
- /* TODO = compute_construct_event_info.other_event.parent_construct */
- = acc_construct_parallel; //TODO: kernels...
- wait_event_info.other_event.implicit = 1;
- wait_event_info.other_event.tool_info = NULL;
-
- api_info->device_api = acc_device_api_cuda;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
- CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
- if (profiling_dispatch_p)
- {
- acc_prof_info *prof_info = thr->prof_info;
- acc_api_info *api_info = thr->api_info;
-
- prof_info->event_type = acc_ev_wait_end;
-
- wait_event_info.other_event.event_type = prof_info->event_type;
-
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
- api_info);
- }
-
- event_gc (true);
-}
-
-static void
-nvptx_wait_async (int async1, int async2)
-{
- CUevent *e;
- struct ptx_stream *s1, *s2;
- pthread_t self = pthread_self ();
-
- /* The stream that is waiting (rather than being waited for) doesn't
- necessarily have to exist already. */
- s2 = select_stream_for_async (async2, self, true, NULL);
-
- s1 = select_stream_for_async (async1, self, false, NULL);
- if (!s1)
- GOMP_PLUGIN_fatal ("invalid async 1\n");
-
- if (s1 == s2)
- GOMP_PLUGIN_fatal ("identical parameters");
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
- event_gc (true);
-
- CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
-
- event_add (PTX_EVT_SYNC, e, NULL, 0);
-
- CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
-}
-
-static void
-nvptx_wait_all (void)
-{
- CUresult r;
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- acc_prof_info *prof_info = thr->prof_info;
- acc_event_info wait_event_info;
- acc_api_info *api_info = thr->api_info;
- bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
- if (profiling_dispatch_p)
- {
- wait_event_info.other_event.valid_bytes
- = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
- wait_event_info.other_event.parent_construct
- /* TODO = compute_construct_event_info.other_event.parent_construct */
- = acc_construct_parallel; //TODO: kernels...
- wait_event_info.other_event.implicit = 1;
- wait_event_info.other_event.tool_info = NULL;
-
- api_info->device_api = acc_device_api_cuda;
- }
-
- /* Wait for active streams initiated by this thread (or by multiple threads)
- to complete. */
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if (s->multithreaded || pthread_equal (s->host_thread, self))
- {
- r = cuStreamQuery (s->stream);
- if (r == CUDA_SUCCESS)
- continue;
- else if (r != CUDA_ERROR_NOT_READY)
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_wait_start;
- wait_event_info.other_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
- &wait_event_info,
- api_info);
- }
- CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
- if (profiling_dispatch_p)
- {
- prof_info->event_type = acc_ev_wait_end;
- wait_event_info.other_event.event_type = prof_info->event_type;
- GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
- &wait_event_info,
- api_info);
- }
- }
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-}
-
-static void
-nvptx_wait_all_async (int async)
-{
- struct ptx_stream *waiting_stream, *other_stream;
- CUevent *e;
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_t self = pthread_self ();
-
- /* The stream doing the waiting. This could be the first mention of the
- stream, so create it if necessary. */
- waiting_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-
- /* Launches on the null stream already block on other streams in the
- context. */
- if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
- return;
-
- event_gc (true);
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (other_stream = nvthd->ptx_dev->active_streams;
- other_stream != NULL;
- other_stream = other_stream->next)
- {
- if (!other_stream->multithreaded
- && !pthread_equal (other_stream->host_thread, self))
- continue;
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
- /* Record an event on the waited-for stream. */
- CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
-
- event_add (PTX_EVT_SYNC, e, NULL, 0);
-
- CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-}
-
static void *
nvptx_get_current_cuda_device (void)
{
@@ -1927,70 +848,6 @@ nvptx_get_current_cuda_context (void)
return nvthd->ptx_dev->ctx;
}
-static void *
-nvptx_get_cuda_stream (int async)
-{
- struct ptx_stream *s;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- return s ? s->stream : NULL;
-}
-
-static int
-nvptx_set_cuda_stream (int async, void *stream)
-{
- struct ptx_stream *oldstream;
- pthread_t self = pthread_self ();
- struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
- struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
-
- if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- /* We have a list of active streams and an array mapping async values to
- entries of that list. We need to take "ownership" of the passed-in stream,
- and add it to our list, removing the previous entry also (if there was one)
- in order to prevent resource leaks. Note the potential for surprise
- here: maybe we should keep track of passed-in streams and leave it up to
- the user to tidy those up, but that doesn't work for stream handles
- returned from acc_get_cuda_stream above... */
-
- oldstream = select_stream_for_async (async, self, false, NULL);
-
- if (oldstream)
- {
- if (nvthd->ptx_dev->active_streams == oldstream)
- nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
- else
- {
- struct ptx_stream *s = nvthd->ptx_dev->active_streams;
- while (s->next != oldstream)
- s = s->next;
- s->next = s->next->next;
- }
-
- CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
-
- if (!map_fini (thr, oldstream))
- GOMP_PLUGIN_fatal ("error when freeing host memory");
-
- free (oldstream);
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- (void) select_stream_for_async (async, self, true, (CUstream) stream);
-
- return 1;
-}
-
/* Plugin entry points. */
const char *
@@ -2233,91 +1090,189 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
- return nvptx_alloc (size);
-}
-bool
-GOMP_OFFLOAD_free (int ord, void *ptr)
-{
- return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_free (ptr));
-}
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ struct ptx_free_block *blocks, *tmp;
-bool
-GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
-{
- return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_dev2host (dst, src, n));
+ pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+ blocks = ptx_dev->free_blocks;
+ ptx_dev->free_blocks = NULL;
+ pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+
+ while (blocks)
+ {
+ tmp = blocks->next;
+ nvptx_free (blocks->ptr, ptx_dev);
+ free (blocks);
+ blocks = tmp;
+ }
+
+ return nvptx_alloc (size);
}
bool
-GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+GOMP_OFFLOAD_free (int ord, void *ptr)
{
return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_host2dev (dst, src, n));
+ && nvptx_free (ptr, ptx_devices[ord]));
}
-void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
-
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
void **hostaddrs, void **devaddrs,
- int async, unsigned *dims, void *targ_mem_desc)
+ unsigned *dims, void *targ_mem_desc)
{
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
-}
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+ void **hp = NULL;
+ CUdeviceptr dp = 0;
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
-}
+ if (mapnum > 0)
+ {
+ hp = alloca (mapnum * sizeof (void *));
+ for (int i = 0; i < mapnum; i++)
+ hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+ CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+ }
-int
-GOMP_OFFLOAD_openacc_async_test (int async)
-{
- return nvptx_async_test (async);
-}
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info data_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_start;
-int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
- return nvptx_async_test_all ();
-}
+ data_event_info.data_event.event_type = prof_info->event_type;
+ data_event_info.data_event.valid_bytes
+ = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+ data_event_info.data_event.parent_construct
+ = acc_construct_parallel; //TODO
+ /* Always implicit for "data mapping arguments for cuLaunchKernel". */
+ data_event_info.data_event.implicit = 1;
+ data_event_info.data_event.tool_info = NULL;
+ data_event_info.data_event.var_name = NULL; //TODO
+ data_event_info.data_event.bytes = mapnum * sizeof (void *);
+ data_event_info.data_event.host_ptr = hp;
+ data_event_info.data_event.device_ptr = (void *) dp;
-void
-GOMP_OFFLOAD_openacc_async_wait (int async)
-{
- nvptx_wait (async);
-}
+ api_info->device_api = acc_device_api_cuda;
-void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
-{
- nvptx_wait_async (async1, async2);
-}
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
-void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
-{
- nvptx_wait_all ();
+ if (mapnum > 0)
+ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
+ mapnum * sizeof (void *));
+
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_end;
+ data_event_info.data_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+ dp, NULL);
+
+ CUresult r = cuStreamSynchronize (NULL);
+ const char *maybe_abort_msg = "(perhaps abort was called)";
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuMemFree, dp);
}
-void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
+static void
+cuda_free_argmem (void *ptr)
{
- nvptx_wait_all_async (async);
+ void **block = (void **) ptr;
+ nvptx_free (block[0], (struct ptx_device *) block[1]);
+ free (block);
}
void
-GOMP_OFFLOAD_openacc_async_set_async (int async)
+GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, void **devaddrs,
+ unsigned *dims, void *targ_mem_desc,
+ struct goacc_asyncqueue *aq)
{
- nvptx_set_async (async);
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
+
+ void **hp = NULL;
+ CUdeviceptr dp = 0;
+ void **block = NULL;
+
+ if (mapnum > 0)
+ {
+ block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
+ hp = block + 2;
+ for (int i = 0; i < mapnum; i++)
+ hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+ CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+ }
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info data_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_start;
+
+ data_event_info.data_event.event_type = prof_info->event_type;
+ data_event_info.data_event.valid_bytes
+ = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+ data_event_info.data_event.parent_construct
+ = acc_construct_parallel; //TODO
+ /* Always implicit for "data mapping arguments for cuLaunchKernel". */
+ data_event_info.data_event.implicit = 1;
+ data_event_info.data_event.tool_info = NULL;
+ data_event_info.data_event.var_name = NULL; //TODO
+ data_event_info.data_event.bytes = mapnum * sizeof (void *);
+ data_event_info.data_event.host_ptr = hp;
+ data_event_info.data_event.device_ptr = (void *) dp;
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
+ if (mapnum > 0)
+ {
+ CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
+ mapnum * sizeof (void *), aq->cuda_stream);
+ block[0] = (void *) dp;
+
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
+ block[1] = (void *) nvthd->ptx_dev;
+ }
+
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_enqueue_upload_end;
+ data_event_info.data_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+ api_info);
+ }
+
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+ dp, aq->cuda_stream);
+
+ if (mapnum > 0)
+ GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}
void *
@@ -2339,7 +1294,6 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
if (!thd_ctx)
CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
- nvthd->current_stream = ptx_dev->null_stream;
nvthd->ptx_dev = ptx_dev;
return (void *) nvthd;
@@ -2364,17 +1318,169 @@ GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer. */
-
void *
-GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
+GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
- return nvptx_get_cuda_stream (async);
+ return (void *) aq->cuda_stream;
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer. */
+int
+GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
+{
+ if (aq->cuda_stream)
+ {
+ CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+ CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
+ }
+
+ aq->cuda_stream = (CUstream) stream;
+ return 1;
+}
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (void)
+{
+ struct goacc_asyncqueue *aq
+ = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
+ CUDA_CALL_ASSERT (cuStreamCreate, &aq->cuda_stream, CU_STREAM_DEFAULT);
+ return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+ CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+ free (aq);
+ return true;
+}
int
-GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+{
+ CUresult r = cuStreamQuery (aq->cuda_stream);
+ if (r == CUDA_SUCCESS)
+ return 1;
+ if (r == CUDA_ERROR_NOT_READY)
+ return 0;
+
+ GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
+ return -1;
+}
+
+void
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+ CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+ struct goacc_asyncqueue *aq2)
+{
+ CUevent e;
+ CUDA_CALL_ASSERT (cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
+ CUDA_CALL_ASSERT (cuEventRecord, e, aq1->cuda_stream);
+ CUDA_CALL_ASSERT (cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+}
+
+static void
+cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
+{
+ if (res != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+ struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
+ cb->fn (cb->ptr);
+ free (ptr);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+ void (*callback_fn)(void *),
+ void *userptr)
+{
+ struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
+ b->fn = callback_fn;
+ b->ptr = userptr;
+ b->aq = aq;
+ CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
+ cuda_callback_wrapper, (void *) b, 0);
+}
+
+static bool
+cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
+{
+ CUdeviceptr pb;
+ size_t ps;
+ if (!s)
+ return true;
+ if (!d)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
+ if (!pb)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ if (!h)
+ {
+ GOMP_PLUGIN_error ("invalid host address");
+ return false;
+ }
+ if (d == h)
+ {
+ GOMP_PLUGIN_error ("invalid host or device address");
+ return false;
+ }
+ if ((void *)(d + s) > (void *)(pb + ps))
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (src, dst, n))
+ return false;
+ CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
- return nvptx_set_cuda_stream (async, stream);
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (src, dst, n))
+ return false;
+ CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
+ return true;
}
diff --git a/libgomp/target.c b/libgomp/target.c
index 2cffa4986f9..58fa82ea965 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -187,18 +187,44 @@ gomp_device_copy (struct gomp_device_descr *devicep,
}
}
-static void
+static inline void
+goacc_device_copy_async (struct gomp_device_descr *devicep,
+ bool (*copy_func) (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *),
+ const char *dst, void *dstaddr,
+ const char *src, const void *srcaddr,
+ size_t size, struct goacc_asyncqueue *aq)
+{
+ if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
+ src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
+ }
+}
+
+attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq,
void *d, const void *h, size_t sz)
{
- gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
+ if (aq)
+ goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
+ "dev", d, "host", h, sz, aq);
+ else
+ gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
-static void
+attribute_hidden void
gomp_copy_dev2host (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq,
void *h, const void *d, size_t sz)
{
- gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
+ if (aq)
+ goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
+ "host", h, "dev", d, sz, aq);
+ else
+ gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
static void
@@ -216,7 +242,8 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
Helper function of gomp_map_vars. */
static inline void
-gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
+gomp_map_vars_existing (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, splay_tree_key oldn,
splay_tree_key newn, struct target_var_desc *tgt_var,
unsigned char kind)
{
@@ -238,7 +265,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
}
if (GOMP_MAP_ALWAYS_TO_P (kind))
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
@@ -256,8 +283,8 @@ get_kind (bool short_mapkind, void *kinds, int idx)
}
static void
-gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
- uintptr_t target_offset, uintptr_t bias)
+gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
+ uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -268,7 +295,7 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
{
cur_node.tgt_offset = (uintptr_t) NULL;
/* FIXME: see comment about coalescing host/dev transfers below. */
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset,
sizeof (void *));
@@ -291,7 +318,7 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
/* FIXME: see comment about coalescing host/dev transfers below. */
- gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
+ gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset, sizeof (void *));
}
@@ -329,9 +356,9 @@ gomp_map_pset (struct target_mem_desc *tgt, uintptr_t host_ptr,
}
static void
-gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
- size_t first, size_t i, void **hostaddrs,
- size_t *sizes, void *kinds)
+gomp_map_fields_existing (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
+ splay_tree_key n, size_t first, size_t i,
+ void **hostaddrs, size_t *sizes, void *kinds)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -348,7 +375,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node,
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node,
&tgt->list[i], kind & typemask);
return;
}
@@ -364,7 +391,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->host_start - n->host_start
== n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
kind & typemask);
return;
}
@@ -376,7 +403,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
&& n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
kind & typemask);
return;
}
@@ -547,6 +574,18 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
{
+ struct target_mem_desc *tgt;
+ tgt = gomp_map_vars_async (devicep, NULL, mapnum, hostaddrs, devaddrs,
+ sizes, kinds, short_mapkind, pragma_kind);
+ return tgt;
+}
+
+attribute_hidden struct target_mem_desc *
+gomp_map_vars_async (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+ bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+{
size_t i, tgt_align, tgt_size, not_found_cnt = 0;
bool has_firstprivate = false;
const int rshift = short_mapkind ? 8 : 3;
@@ -665,7 +704,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
continue;
}
for (i = first; i <= last; i++)
- gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+ gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds);
i--;
continue;
@@ -722,7 +761,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
else
n = splay_tree_lookup (mem_map, &cur_node);
if (n && n->refcount != REFCOUNT_LINK)
- gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
kind & typemask);
else
{
@@ -790,7 +829,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
if (n)
{
assert (n->refcount != REFCOUNT_LINK);
- gomp_map_vars_existing (devicep, n, &cur_node, row_desc,
+ gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
kind & typemask);
}
else
@@ -866,7 +905,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt->list[i].offset = tgt_size;
len = sizes[i];
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + tgt_size),
(void *) hostaddrs[i], len);
tgt_size += len;
@@ -900,7 +939,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
continue;
}
for (i = first; i <= last; i++)
- gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+ gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds);
i--;
continue;
@@ -920,7 +959,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i];
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (n->tgt->tgt_start
+ n->tgt_offset
+ cur_node.host_start
@@ -950,7 +989,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
k->host_end = k->host_start + sizeof (void *);
splay_tree_key n = splay_tree_lookup (mem_map, k);
if (n && n->refcount != REFCOUNT_LINK)
- gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
kind & typemask);
else
{
@@ -1006,14 +1045,15 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* FIXME: Perhaps add some smarts, like if copying
several adjacent fields from host to target, use some
host buffer to avoid sending each var individually. */
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
k->host_end - k->host_start);
break;
case GOMP_MAP_POINTER:
- gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
+ gomp_map_pointer (tgt, aq,
+ (uintptr_t) *(void **) k->host_start,
k->tgt_offset, sizes[i]);
break;
case GOMP_MAP_TO_PSET:
@@ -1042,7 +1082,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
sizes[j]);
tptr = *(uintptr_t *) hostaddrs[i];
*(uintptr_t *) hostaddrs[i]= toffset;
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
@@ -1052,7 +1092,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
found_pointer = true;
}
if (!found_pointer)
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
@@ -1079,7 +1119,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
break;
case GOMP_MAP_FORCE_DEVICEPTR:
assert (k->host_end - k->host_start == sizeof (void *));
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
@@ -1096,9 +1136,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
/* Set link pointer on target to the device address of the
mapped object. */
void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
- devicep->host2dev_func (devicep->target_id,
- (void *) n->tgt_offset,
- &tgt_addr, sizeof (void *));
+ gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
+ &tgt_addr, sizeof (void *));
}
array++;
}
@@ -1142,7 +1181,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
if (n)
{
assert (n->refcount != REFCOUNT_LINK);
- gomp_map_vars_existing (devicep, n, &cur_node, row_desc,
+ gomp_map_vars_existing (devicep, aq, n, &cur_node, row_desc,
kind & typemask);
target_row_addr = n->tgt->tgt_start + n->tgt_offset;
}
@@ -1166,7 +1205,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
row_desc->copy_from
= GOMP_MAP_COPY_FROM_P (kind & typemask);
row_desc->always_copy_from
- = GOMP_MAP_COPY_FROM_P (kind & typemask);
+ = GOMP_MAP_ALWAYS_FROM_P (kind & typemask);
row_desc->offset = 0;
row_desc->length = da->data_row_size;
@@ -1175,7 +1214,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
splay_tree_insert (mem_map, array);
if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) tgt->tgt_start + k->tgt_offset,
(void *) k->host_start,
da->data_row_size);
@@ -1191,9 +1230,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
{
void *ptrblock = gomp_dynamic_array_create_ptrblock
(da, target_ptrblock, target_data_rows + row_start);
- gomp_copy_host2dev (devicep, target_ptrblock, ptrblock,
+ gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
da->ptrblock_size);
- free (ptrblock);
+ /* Freeing of the ptrblock must be scheduled after the host2dev
+ copy completes. */
+ goacc_async_free (devicep, aq, ptrblock);
}
row_start += da->data_row_num;
@@ -1213,7 +1254,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
{
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
/* FIXME: see above FIXME comment. */
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *));
}
@@ -1232,7 +1273,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
return tgt;
}
-static void
+attribute_hidden void
gomp_unmap_tgt (struct target_mem_desc *tgt)
{
/* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */
@@ -1267,6 +1308,13 @@ gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
attribute_hidden void
gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
{
+ gomp_unmap_vars_async (tgt, do_copyfrom, NULL);
+}
+
+attribute_hidden void
+gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
+ struct goacc_asyncqueue *aq)
+{
struct gomp_device_descr *devicep = tgt->device_descr;
if (tgt->list_count == 0)
@@ -1302,7 +1350,7 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
|| tgt->list[i].always_copy_from)
- gomp_copy_dev2host (devicep,
+ gomp_copy_dev2host (devicep, aq,
(void *) (k->host_start + tgt->list[i].offset),
(void *) (k->tgt->tgt_start + k->tgt_offset
+ tgt->list[i].offset),
@@ -1368,9 +1416,9 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t size = cur_node.host_end - cur_node.host_start;
if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+ gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
- gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
+ gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
}
}
gomp_mutex_unlock (&devicep->lock);
@@ -1691,9 +1739,21 @@ gomp_init_device (struct gomp_device_descr *devicep)
false);
}
+ /* Initialize OpenACC asynchronous queues. */
+ goacc_init_asyncqueues (devicep);
+
devicep->state = GOMP_DEVICE_INITIALIZED;
}
+attribute_hidden bool
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+ devicep->state = GOMP_DEVICE_FINALIZED;
+ bool ret = goacc_fini_asyncqueues (devicep);
+ ret &= devicep->fini_device_func (devicep->target_id);
+ return ret;
+}
+
attribute_hidden void
gomp_unload_device (struct gomp_device_descr *devicep)
{
@@ -2222,7 +2282,7 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
if ((kind == GOMP_MAP_FROM && k->refcount == 0)
|| kind == GOMP_MAP_ALWAYS_FROM)
- gomp_copy_dev2host (devicep, (void *) cur_node.host_start,
+ gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset
+ cur_node.host_start
- k->host_start),
@@ -2848,20 +2908,20 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
{
if (!DLSYM_OPT (openacc.exec, openacc_exec)
- || !DLSYM_OPT (openacc.register_async_cleanup,
- openacc_register_async_cleanup)
- || !DLSYM_OPT (openacc.async_test, openacc_async_test)
- || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
- || !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
- || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
- || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
- || !DLSYM_OPT (openacc.async_wait_all_async,
- openacc_async_wait_all_async)
- || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
|| !DLSYM_OPT (openacc.create_thread_data,
openacc_create_thread_data)
|| !DLSYM_OPT (openacc.destroy_thread_data,
- openacc_destroy_thread_data))
+ openacc_destroy_thread_data)
+ || !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
+ || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
+ || !DLSYM_OPT (openacc.async.test, openacc_async_test)
+ || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
+ || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
+ || !DLSYM_OPT (openacc.async.queue_callback,
+ openacc_async_queue_callback)
+ || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
+ || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
+ || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
{
/* Require all the OpenACC handlers if we have
GOMP_OFFLOAD_CAP_OPENACC_200. */
@@ -2912,10 +2972,7 @@ gomp_target_fini (void)
struct gomp_device_descr *devicep = &devices[i];
gomp_mutex_lock (&devicep->lock);
if (devicep->state == GOMP_DEVICE_INITIALIZED)
- {
- ret = devicep->fini_device_func (devicep->target_id);
- devicep->state = GOMP_DEVICE_FINALIZED;
- }
+ ret = gomp_fini_device (devicep);
gomp_mutex_unlock (&devicep->lock);
if (!ret)
gomp_fatal ("device finalization failed");
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
index de263237a9d..ef37ae99bee 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -206,11 +206,6 @@ void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info,
assert (event_info->other_event.implicit == 1);
assert (event_info->other_event.tool_info == NULL);
- if (acc_device_type == acc_device_host
- || state < 100) //TODO
- assert (api_info->device_api == acc_device_api_none);
- else
- assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
index e1aa2c931ff..1694f582363 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2-lib.c
@@ -151,7 +151,7 @@ main (int argc, char **argv)
d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
#pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N], N) \
- async (4)
+ wait (1, 2, 3) async (4)
for (int ii = 0; ii < N; ii++)
e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index 0228fc3ba18..fd795ca44aa 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -162,7 +162,7 @@ main (int argc, char **argv)
d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
#pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
- wait (1) async (4)
+ wait (1, 2, 3) async (4)
for (int ii = 0; ii < N; ii++)
e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
index 0bf706a1b5d..5ec50b808a7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
@@ -138,7 +138,7 @@ main (int argc, char **argv)
d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
#pragma acc parallel present (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) \
- wait (1,5) async (4)
+ wait (1, 2, 3, 5) async (4)
for (int ii = 0; ii < N; ii++)
e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
index 3458c09aa2e..2eeaecb15d9 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
@@ -46,16 +46,22 @@ main (int argc, char **argv)
abort ();
}
- fprintf (stderr, "CheCKpOInT\n");
- if (acc_async_test (1) != 0)
+ if (acc_async_test (0) != 0)
{
fprintf (stderr, "asynchronous operation not running\n");
abort ();
}
+ /* Test unseen async number. */
+ if (acc_async_test (1) != 1)
+ {
+ fprintf (stderr, "acc_async_test failed on unseen number\n");
+ abort ();
+ }
+
sleep (1);
- if (acc_async_test (1) != 1)
+ if (acc_async_test (0) != 1)
{
fprintf (stderr, "found asynchronous operation still running\n");
abort ();
@@ -65,7 +71,3 @@ main (int argc, char **argv)
return 0;
}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
index a28ec49b08e..c8a474ccabb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
@@ -72,14 +72,13 @@ main (int argc, char **argv)
abort ();
}
- fprintf (stderr, "CheCKpOInT\n");
- acc_wait (1);
+ acc_wait (0);
gettimeofday (&tv2, NULL);
t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
- if (t2 > t1)
+ if (t2 - t1 > 100)
{
fprintf (stderr, "too long 1\n");
abort ();
@@ -87,7 +86,7 @@ main (int argc, char **argv)
gettimeofday (&tv1, NULL);
- acc_wait (1);
+ acc_wait (0);
gettimeofday (&tv2, NULL);
@@ -103,7 +102,3 @@ main (int argc, char **argv)
return 0;
}
-
-/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
-/* { dg-output "unknown async \[0-9\]+" } */
-/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index 572358834d4..457d5fde23e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -84,6 +84,7 @@ main (int argc, char **argv)
for (i = 0; i < N; i++)
{
+ stream = (CUstream) acc_get_cuda_stream (i & 1);
r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
if (r != CUDA_SUCCESS)
{
@@ -92,10 +93,10 @@ main (int argc, char **argv)
}
}
- acc_wait_async (0, 1);
-
if (acc_async_test (0) != 0)
abort ();
+
+ acc_wait_async (0, 1);
if (acc_async_test (1) != 0)
abort ();
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index 77de9ba2904..bf15508e13f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -109,7 +109,7 @@ main (int argc, char **argv)
for (i = 0; i <= N; i++)
{
- if (acc_async_test (i) != 0)
+ if (acc_async_test (i) == 0)
abort ();
}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
index 914a3698846..fe88a121e42 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
@@ -1,4 +1,5 @@
! { dg-do run }
+! { dg-xfail-run-if "n/a" { openacc_host_selected } }
program main
use openacc