aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Schwinge <thomas@codesourcery.com>2017-05-15 06:50:17 +0000
committerThomas Schwinge <thomas@codesourcery.com>2017-05-15 06:50:17 +0000
commit999d7eff338acc9c75d4815f01f778e70e5b113c (patch)
tree8bb3ef658524d938c279652ed22d8b38b33d438a
parentb8f82e0e46aea0bead3fe25289df49315e58a63f (diff)
More OpenACC 2.5 Profiling Interface
libgomp/ * oacc-async.c (acc_async_test, acc_async_test_all, acc_wait) (acc_wait_async, acc_wait_all, acc_wait_all_async): Set up profiling. * oacc-cuda.c (acc_get_current_cuda_device) (acc_get_current_cuda_context, acc_get_cuda_stream) (acc_set_cuda_stream): Likewise. * oacc-init.c (acc_set_device_type, acc_get_device_type) (acc_get_device_num): Likewise. * oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device) (acc_map_data, acc_unmap_data, present_create_copy) (delete_copyout, update_dev_host): Likewise. * oacc-parallel.c (GOACC_data_start, GOACC_data_end) (GOACC_enter_exit_data, GOACC_update, GOACC_wait): Likewise. * oacc-profiling.c (goacc_profiling_setup_p): New function. (goacc_profiling_dispatch_p): Add a "bool" formal parameter. Adjust all users. * oacc-int.h (goacc_profiling_setup_p) (goacc_profiling_dispatch_p): Update. * plugin/plugin-nvptx.c (nvptx_exec, nvptx_wait, nvptx_wait_all): Generate more profiling events. * libgomp.texi (OpenACC Profiling Interface): Update. git-svn-id: https://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@248042 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--libgomp/ChangeLog.gomp24
-rw-r--r--libgomp/libgomp.texi74
-rw-r--r--libgomp/oacc-async.c110
-rw-r--r--libgomp/oacc-cuda.c82
-rw-r--r--libgomp/oacc-init.c102
-rw-r--r--libgomp/oacc-int.h4
-rw-r--r--libgomp/oacc-mem.c154
-rw-r--r--libgomp/oacc-parallel.c357
-rw-r--r--libgomp/oacc-profiling.c100
-rw-r--r--libgomp/plugin/plugin-nvptx.c113
10 files changed, 1056 insertions, 64 deletions
diff --git a/libgomp/ChangeLog.gomp b/libgomp/ChangeLog.gomp
index 5dc0889e97a..23882cf1dbe 100644
--- a/libgomp/ChangeLog.gomp
+++ b/libgomp/ChangeLog.gomp
@@ -1,3 +1,27 @@
+2017-05-15 Thomas Schwinge <thomas@codesourcery.com>
+
+ * oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
+ (acc_wait_async, acc_wait_all, acc_wait_all_async): Set up
+ profiling.
+ * oacc-cuda.c (acc_get_current_cuda_device)
+ (acc_get_current_cuda_context, acc_get_cuda_stream)
+ (acc_set_cuda_stream): Likewise.
+ * oacc-init.c (acc_set_device_type, acc_get_device_type)
+ (acc_get_device_num): Likewise.
+ * oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
+ (acc_map_data, acc_unmap_data, present_create_copy)
+ (delete_copyout, update_dev_host): Likewise.
+ * oacc-parallel.c (GOACC_data_start, GOACC_data_end)
+ (GOACC_enter_exit_data, GOACC_update, GOACC_wait): Likewise.
+ * oacc-profiling.c (goacc_profiling_setup_p): New function.
+ (goacc_profiling_dispatch_p): Add a "bool" formal parameter.
+ Adjust all users.
+ * oacc-int.h (goacc_profiling_setup_p)
+ (goacc_profiling_dispatch_p): Update.
+ * plugin/plugin-nvptx.c (nvptx_exec, nvptx_wait, nvptx_wait_all):
+ Generate more profiling events.
+ * libgomp.texi (OpenACC Profiling Interface): Update.
+
2017-05-14 Thomas Schwinge <thomas@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: New
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 93365cd4cce..b3fa1395f47 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -3207,12 +3207,19 @@ Will be @code{acc_construct_parallel} for OpenACC kernels constructs;
should be @code{acc_construct_kernels}.
@item
+Will be @code{acc_construct_enter_data} or
+@code{acc_construct_exit_data} when processing variable mappings
+specified in OpenACC declare directives; should be
+@code{acc_construct_declare}.
+
+@item
For implicit @code{acc_ev_device_init_start},
@code{acc_ev_device_init_end}, and explicit as well as implicit
@code{acc_ev_alloc}, @code{acc_ev_free},
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
-@code{acc_ev_enqueue_download_start}, and
-@code{acc_ev_enqueue_download_end}, will be
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, will be
@code{acc_construct_parallel}; should reflect the real parent
construct.
@@ -3221,8 +3228,9 @@ construct.
@item @code{acc_event_info.*.implicit}
For @code{acc_ev_alloc}, @code{acc_ev_free},
@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
-@code{acc_ev_enqueue_download_start}, and
-@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, this currently will be @code{1}
also for explicit usage.
@item @code{acc_event_info.data_event.var_name}
@@ -3289,6 +3297,20 @@ it's not clear if they should be.
@end itemize
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@itemize
+
+@item
+Callbacks for these event types will also be invoked for OpenACC
+host_data constructs; it's not clear if they should be.
+
+@item
+Callbacks for these event types will also be invoked when processing
+variable mappings specified in OpenACC declare directives; it's not
+clear if they should be.
+
+@end itemize
+
@end table
Callbacks for the following event types will be invoked, but dispatch
@@ -3297,8 +3319,10 @@ and information provided therein has not yet been thoroughly reviewed:
@itemize
@item @code{acc_ev_alloc}
@item @code{acc_ev_free}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
@end itemize
During device initialization, and finalization, respectively,
@@ -3309,14 +3333,6 @@ callbacks for the following event types will not yet be invoked:
@item @code{acc_ev_free}
@end itemize
-Callbacks for the following event types will currently only be invoked
-for (implicit) events within compute constructs:
-
-@itemize
-@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}
-@item @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
-@end itemize
-
Callbacks for the following event types have not yet been implemented,
so currently won't be invoked:
@@ -3324,8 +3340,38 @@ so currently won't be invoked:
@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
@item @code{acc_ev_runtime_shutdown}
@item @code{acc_ev_create}, @code{acc_ev_delete}
-@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
-@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+For the following runtime library functions, not all expected
+callbacks will be invoked (mostly concerning implicit device
+initialization):
+
+@itemize
+@item @code{acc_get_num_devices}
+@item @code{acc_set_device_type}
+@item @code{acc_get_device_type}
+@item @code{acc_set_device_num}
+@item @code{acc_get_device_num}
+@item @code{acc_init}
+@item @code{acc_shutdown}
+@end itemize
+
+Aside from implicit device initialization, for the following runtime
+library functions, no callbacks will be invoked for shared-memory
+offloading devices (it's not clear if they should be):
+
+@itemize
+@item @code{acc_malloc}
+@item @code{acc_free}
+@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
+@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
+@item @code{acc_copyout}, @code{acc_copyout_async}
+@item @code{acc_delete}, @code{acc_delete_async}
+@item @code{acc_update_device}, @code{acc_update_device_async}
+@item @code{acc_update_self}, @code{acc_update_self_async}
+@item @code{acc_map_data}, @code{acc_unmap_data}
+@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
+@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
@end itemize
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 921f94366af..7cefa0fa8cf 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -39,10 +39,30 @@ acc_async_test (int async)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- return thr->dev->openacc.async_test_func (async);
+ int res = thr->dev->openacc.async_test_func (async);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return res;
}
int
@@ -50,10 +70,24 @@ acc_async_test_all (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- return thr->dev->openacc.async_test_all_func ();
+ int res = thr->dev->openacc.async_test_all_func ();
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return res;
}
void
@@ -64,10 +98,28 @@ acc_wait (int async)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
thr->dev->openacc.async_wait_func (async);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -75,10 +127,28 @@ acc_wait_async (int async1, int async2)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async2; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
thr->dev->openacc.async_wait_async_func (async1, async2);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -86,10 +156,22 @@ acc_wait_all (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
thr->dev->openacc.async_wait_all_func ();
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -100,15 +182,36 @@ acc_wait_all_async (int async)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
if (!thr || !thr->dev)
gomp_fatal ("no device active");
thr->dev->openacc.async_wait_all_async_func (async);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
int
acc_get_default_async (void)
{
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
@@ -120,6 +223,9 @@ acc_get_default_async (void)
void
acc_set_default_async (int async)
{
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
if (async < acc_async_sync)
gomp_fatal ("invalid async argument: %d", async);
diff --git a/libgomp/oacc-cuda.c b/libgomp/oacc-cuda.c
index 86a2a775727..325fc8dd29b 100644
--- a/libgomp/oacc-cuda.c
+++ b/libgomp/oacc-cuda.c
@@ -36,10 +36,23 @@ acc_get_current_cuda_device (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
+ void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
- return thr->dev->openacc.cuda.get_current_device_func ();
+ ret = thr->dev->openacc.cuda.get_current_device_func ();
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
- return NULL;
+ return ret;
}
void *
@@ -47,10 +60,23 @@ acc_get_current_cuda_context (void)
{
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
+ void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
- return thr->dev->openacc.cuda.get_current_context_func ();
-
- return NULL;
+ ret = thr->dev->openacc.cuda.get_current_context_func ();
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return ret;
}
void *
@@ -61,10 +87,29 @@ acc_get_cuda_stream (int async)
if (async < 0)
return NULL;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
+ void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
- return thr->dev->openacc.cuda.get_stream_func (async);
+ ret = thr->dev->openacc.cuda.get_stream_func (async);
- return NULL;
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
+ return ret;
}
int
@@ -79,8 +124,27 @@ acc_set_cuda_stream (int async, void *stream)
thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
+ int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
- return thr->dev->openacc.cuda.set_stream_func (async, stream);
+ ret = thr->dev->openacc.cuda.set_stream_func (async, stream);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
- return -1;
+ return ret;
}
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 415c0faff19..c262caa1444 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -220,8 +220,23 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
static struct gomp_device_descr *
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
+ bool check_not_nested_p;
+ if (implicit)
+ {
+ /* In the implicit case, something should (TODO: must?) already have
+ been set up for an outer construct. */
+ check_not_nested_p = false;
+ }
+ else
+ {
+ check_not_nested_p = true;
+ /* TODO: should we set "thr->prof_info" etc. in this case (acc_init)?
+ The problem is that we don't have "thr" yet.  (So,
+ "check_not_nested_p = true" actually is pointless, too.) */
+ }
bool profiling_dispatch_p
- = __builtin_expect (goacc_profiling_dispatch_p (), false);
+ = __builtin_expect (goacc_profiling_dispatch_p (check_not_nested_p),
+ false);
acc_prof_info prof_info;
if (profiling_dispatch_p)
@@ -536,11 +551,21 @@ ialias (acc_shutdown)
int
acc_get_num_devices (acc_device_t d)
{
+#if 0 //TODO
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ prof_info.device_type = d; //TODO
+#endif
+
int n = 0;
struct gomp_device_descr *acc_dev;
if (d == acc_device_none)
- return 0;
+ goto out;
gomp_init_targets_once ();
@@ -549,12 +574,21 @@ acc_get_num_devices (acc_device_t d)
gomp_mutex_unlock (&acc_device_lock);
if (!acc_dev)
- return 0;
+ goto out;
n = acc_dev->get_num_devices_func ();
if (n < 0)
n = 0;
+ out:
+#if 0 //TODO
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+#endif
+
return n;
}
@@ -570,6 +604,14 @@ acc_set_device_type (acc_device_t d)
struct gomp_device_descr *base_dev, *acc_dev;
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ prof_info.device_type = d; //TODO
+
gomp_mutex_lock (&acc_device_lock);
if (!cached_base_dev)
@@ -595,6 +637,12 @@ acc_set_device_type (acc_device_t d)
}
goacc_attach_host_thread_to_device (-1);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
ialias (acc_set_device_type)
@@ -610,12 +658,25 @@ acc_get_device_type (void)
res = acc_device_type (thr->base_dev->type);
else
{
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr,
+ &prof_info, &api_info),
+ false);
+
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (acc_device_default, true);
gomp_mutex_unlock (&acc_device_lock);
res = acc_device_type (dev->type);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
assert (res != acc_device_default
@@ -632,6 +693,14 @@ acc_get_device_num (acc_device_t d)
const struct gomp_device_descr *dev;
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ prof_info.device_type = d; //TODO
+
if (d >= _ACC_device_hwm)
gomp_fatal ("unknown device type %u", (unsigned) d);
@@ -642,6 +711,12 @@ acc_get_device_num (acc_device_t d)
dev = resolve_device (d, true);
gomp_mutex_unlock (&acc_device_lock);
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
if (thr && thr->base_dev == dev && thr->dev)
return thr->dev->target_id;
@@ -653,6 +728,19 @@ ialias (acc_get_device_num)
void
acc_set_device_num (int ord, acc_device_t d)
{
+#if 0 //TODO
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.device_type = d; //TODO
+ prof_info.device_number = ord; //TODO
+ }
+#endif
+
struct gomp_device_descr *base_dev, *acc_dev;
int num_devices;
@@ -691,6 +779,14 @@ acc_set_device_num (int ord, acc_device_t d)
}
goacc_device_num = ord;
+
+#if 0 //TODO
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+#endif
}
ialias (acc_set_device_num)
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 8a620291425..7f8351684a9 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -110,7 +110,9 @@ void goacc_lazy_initialize (void);
void goacc_host_init (void);
void goacc_profiling_initialize (void);
-bool goacc_profiling_dispatch_p (void);
+bool goacc_profiling_setup_p (struct goacc_thread *,
+ acc_prof_info *, acc_api_info *);
+bool goacc_profiling_dispatch_p (bool);
void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
acc_api_info *);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 17e02b27ca8..fd0dac46e09 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -103,12 +103,30 @@ acc_malloc (size_t s)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
assert (thr->dev);
+ void *ret;
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
- return malloc (s);
+ {
+ /* TODO: Should we also generate acc_ev_alloc here? */
+ ret = malloc (s);
+ }
+ else
+ ret = thr->dev->alloc_func (thr->dev->target_id, s);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
- return thr->dev->alloc_func (thr->dev->target_id, s);
+ return ret;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -124,12 +142,23 @@ acc_free (void *d)
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
assert (thr && thr->dev);
struct gomp_device_descr *acc_dev = thr->dev;
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
- return free (d);
+ {
+ /* TODO: Should we also generate acc_ev_free here? */
+ free (d);
+
+ goto out;
+ }
gomp_mutex_lock (&acc_dev->lock);
@@ -151,6 +180,13 @@ acc_free (void *d)
if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
+
+ out:
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
static void
@@ -161,15 +197,31 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
assert (thr && thr->dev);
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
+ /* TODO: Should we also generate
+ acc_ev_enqueue_upload_start/acc_ev_enqueue_upload_end or
+ acc_ev_enqueue_download_start/acc_ev_enqueue_download_end here? */
if (from)
memmove (h, d, s);
else
memmove (d, h, s);
- return;
+
+ goto out;
}
if (async > acc_async_sync)
@@ -184,6 +236,13 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
if (!ret)
gomp_fatal ("error in %s", libfnname);
+
+ out:
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -228,6 +287,9 @@ acc_deviceptr (void *h)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&dev->lock);
n = lookup_host (dev, h, 1);
@@ -265,6 +327,9 @@ acc_hostptr (void *d)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return d;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
@@ -302,6 +367,9 @@ acc_is_present (void *h, size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h != NULL;
+ /* In the following, no OpenACC Profiling Interface events can possibly be
+ generated. */
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -333,6 +401,12 @@ acc_map_data (void *h, void *d, size_t s)
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
if (d != h)
@@ -372,6 +446,12 @@ acc_map_data (void *h, void *d, size_t s)
tgt->prev = acc_dev->openacc.data_environ;
acc_dev->openacc.data_environ = tgt;
gomp_mutex_unlock (&acc_dev->lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -386,6 +466,12 @@ acc_unmap_data (void *h)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+
size_t host_size;
gomp_mutex_lock (&acc_dev->lock);
@@ -436,6 +522,12 @@ acc_unmap_data (void *h)
gomp_mutex_unlock (&acc_dev->lock);
gomp_unmap_vars (t, true);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
#define FLAG_PRESENT (1 << 0)
@@ -459,6 +551,18 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -518,6 +622,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock);
}
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
+
return d;
}
@@ -582,6 +692,18 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
@@ -622,6 +744,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", libfnname);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
@@ -664,6 +792,18 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_mutex_lock (&acc_dev->lock);
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
n = lookup_host (acc_dev, h, s);
if (!n)
@@ -687,6 +827,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
acc_dev->openacc.async_set_async_func (acc_async_sync);
gomp_mutex_unlock (&acc_dev->lock);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index de70ac0b6aa..bff62ba5abf 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -143,7 +143,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
acc_dev = thr->dev;
bool profiling_dispatch_p
- = __builtin_expect (goacc_profiling_dispatch_p (), false);
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
acc_prof_info prof_info;
if (profiling_dispatch_p)
@@ -407,18 +407,86 @@ GOACC_data_start (int device, size_t mapnum,
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_enter_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1; //TODO
+ prof_info.async = acc_async_sync; /* Always synchronous. */
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ prof_info.src_file = NULL; //TODO
+ prof_info.func_name = NULL; //TODO
+ prof_info.line_no = -1; //TODO
+ prof_info.end_line_no = -1; //TODO
+ prof_info.func_line_no = -1; //TODO
+ prof_info.func_end_line_no = -1; //TODO
+ }
+ acc_event_info enter_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ enter_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ enter_data_event_info.other_event.parent_construct = acc_construct_data;
+ for (int i = 0; i < mapnum; ++i)
+ if (kinds[i] == GOMP_MAP_USE_DEVICE_PTR)
+ {
+ /* If there is one such data mapping kind, then this is actually an
+ OpenACC host_data construct. (GCC maps the OpenACC host_data
+ construct to the OpenACC data construct.) Apart from artificial
+ test cases (such as an OpenACC host_data construct's (implicit)
+ device initialization when no device data has been set up
+ before...), no meaningful events can really be generated from
+ OpenACC host_data constructs, though. */
+ enter_data_event_info.other_event.parent_construct
+ = acc_construct_host_data;
+ break;
+ }
+ enter_data_event_info.other_event.implicit = 0;
+ enter_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1; //TODO
+ api_info.device_handle = NULL; //TODO
+ api_info.context_handle = NULL; //TODO
+ api_info.async_handle = NULL; //TODO
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
/* Host fallback or 'do nothing'. */
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| host_fallback)
{
+ //TODO
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
GOMP_MAP_VARS_OPENACC);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
-
- return;
+ goto out;
}
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
@@ -427,18 +495,92 @@ GOACC_data_start (int device, size_t mapnum,
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_enter_data_end;
+ enter_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
/* Implement the end of an OpenACC data region: make the current thread's
   previous mapping (TGT->prev) the current one again, and unmap TGT, the
   mapping that was current on entry.  If events are to be dispatched for the
   OpenACC Profiling Interface, acc_ev_exit_data_start is dispatched before
   and acc_ev_exit_data_end after doing so, both with acc_construct_data as
   the parent construct.  */
void
GOACC_data_end (void)
{
struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *acc_dev = thr->dev; /* Only read for profiling. */
struct target_mem_desc *tgt = thr->mapped_data;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info; /* Only filled in if dispatching. */
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info; /* Registered until the end of this function. */
+
+ prof_info.event_type = acc_ev_exit_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1; //TODO
+ prof_info.async = acc_async_sync; /* Always synchronous. */
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ prof_info.src_file = NULL; //TODO
+ prof_info.func_name = NULL; //TODO
+ prof_info.line_no = -1; //TODO
+ prof_info.end_line_no = -1; //TODO
+ prof_info.func_line_no = -1; //TODO
+ prof_info.func_end_line_no = -1; //TODO
+ }
+ acc_event_info exit_data_event_info; /* Only filled in if dispatching. */
+ if (profiling_dispatch_p)
+ {
+ exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ exit_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ exit_data_event_info.other_event.parent_construct = acc_construct_data;
+ exit_data_event_info.other_event.implicit = 0;
+ exit_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info; /* Only filled in if dispatching. */
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info; /* Registered until the end of this function. */
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1; //TODO
+ api_info.device_handle = NULL; //TODO
+ api_info.context_handle = NULL; //TODO
+ api_info.async_handle = NULL; //TODO
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
/* NOTE(review): TGT is dereferenced without a check; presumably a matching
   GOACC_data_start has always set THR->mapped_data before -- confirm.  */
thr->mapped_data = tgt->prev;
gomp_unmap_vars (tgt, true);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
+
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_exit_data_end; /* Reuse the structures set up above. */
+ exit_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
+ thr->prof_info = NULL; /* Don't leave pointers to our locals behind. */
+ thr->api_info = NULL;
+ }
}
void
@@ -452,26 +594,6 @@ GOACC_enter_exit_data (int device, size_t mapnum,
bool data_enter = false;
size_t i;
- goacc_lazy_initialize ();
-
- thr = goacc_thread ();
- acc_dev = thr->dev;
-
- if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
- || host_fallback)
- return;
-
- if (num_waits)
- {
- va_list ap;
-
- va_start (ap, num_waits);
- goacc_wait (async, num_waits, &ap);
- va_end (ap);
- }
-
- acc_dev->openacc.async_set_async_func (async);
-
/* Determine if this is an "acc enter data". */
for (i = 0; i < mapnum; ++i)
{
@@ -501,6 +623,86 @@ GOACC_enter_exit_data (int device, size_t mapnum,
kind);
}
+ goacc_lazy_initialize ();
+
+ thr = goacc_thread ();
+ acc_dev = thr->dev;
+
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type
+ = data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1; //TODO
+ prof_info.async = async;
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ prof_info.src_file = NULL; //TODO
+ prof_info.func_name = NULL; //TODO
+ prof_info.line_no = -1; //TODO
+ prof_info.end_line_no = -1; //TODO
+ prof_info.func_line_no = -1; //TODO
+ prof_info.func_end_line_no = -1; //TODO
+ }
+ acc_event_info enter_exit_data_event_info;
+ if (profiling_dispatch_p)
+ {
+ enter_exit_data_event_info.other_event.event_type
+ = prof_info.event_type;
+ enter_exit_data_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ enter_exit_data_event_info.other_event.parent_construct
+ = data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+ enter_exit_data_event_info.other_event.implicit = 0;
+ enter_exit_data_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1; //TODO
+ api_info.device_handle = NULL; //TODO
+ api_info.context_handle = NULL; //TODO
+ api_info.async_handle = NULL; //TODO
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+
+ if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ || host_fallback)
+ {
+ //TODO
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
+ goto out;
+ }
+
+ if (num_waits)
+ {
+ va_list ap;
+
+ va_start (ap, num_waits);
+ goacc_wait (async, num_waits, &ap);
+ va_end (ap);
+ }
+
+ acc_dev->openacc.async_set_async_func (async);
+
/* In c, non-pointers and arrays are represented by a single data clause.
Dynamically allocated arrays and subarrays are represented by a data
clause followed by an internal GOMP_MAP_POINTER.
@@ -603,6 +805,18 @@ GOACC_enter_exit_data (int device, size_t mapnum,
}
acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = data_enter ? acc_ev_enter_data_end: acc_ev_exit_data_end;
+ enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+ &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
static void
@@ -642,9 +856,66 @@ GOACC_update (int device, size_t mapnum,
struct goacc_thread *thr = goacc_thread ();
struct gomp_device_descr *acc_dev = thr->dev;
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+ acc_prof_info prof_info;
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = &prof_info;
+
+ prof_info.event_type = acc_ev_update_start;
+ prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info.version = _ACC_PROF_INFO_VERSION;
+ prof_info.device_type = acc_device_type (acc_dev->type);
+ prof_info.device_number = acc_dev->target_id;
+ prof_info.thread_id = -1; //TODO
+ prof_info.async = async;
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ prof_info.src_file = NULL; //TODO
+ prof_info.func_name = NULL; //TODO
+ prof_info.line_no = -1; //TODO
+ prof_info.end_line_no = -1; //TODO
+ prof_info.func_line_no = -1; //TODO
+ prof_info.func_end_line_no = -1; //TODO
+ }
+ acc_event_info update_event_info;
+ if (profiling_dispatch_p)
+ {
+ update_event_info.other_event.event_type
+ = prof_info.event_type;
+ update_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ update_event_info.other_event.parent_construct = acc_construct_update;
+ update_event_info.other_event.implicit = 0;
+ update_event_info.other_event.tool_info = NULL;
+ }
+ acc_api_info api_info;
+ if (profiling_dispatch_p)
+ {
+ thr->api_info = &api_info;
+
+ api_info.device_api = acc_device_api_none;
+ api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info.device_type = prof_info.device_type;
+ api_info.vendor = -1; //TODO
+ api_info.device_handle = NULL; //TODO
+ api_info.context_handle = NULL; //TODO
+ api_info.async_handle = NULL; //TODO
+ }
+
+ if (profiling_dispatch_p)
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|| host_fallback)
- return;
+ {
+ //TODO
+ prof_info.device_type = acc_device_host;
+ api_info.device_type = prof_info.device_type;
+ goto out;
+ }
if (num_waits)
{
@@ -718,11 +989,41 @@ GOACC_update (int device, size_t mapnum,
}
acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+ out:
+ if (profiling_dispatch_p)
+ {
+ prof_info.event_type = acc_ev_update_end;
+ update_event_info.other_event.event_type = prof_info.event_type;
+ goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
void
GOACC_wait (int async, int num_waits, ...)
{
+ goacc_lazy_initialize ();
+
+ struct goacc_thread *thr = goacc_thread ();
+
+ /* No nesting. */
+ assert (thr->prof_info == NULL);
+ assert (thr->api_info == NULL);
+ acc_prof_info prof_info;
+ acc_api_info api_info;
+ bool profiling_setup_p
+ = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info),
+ false);
+ if (profiling_setup_p)
+ {
+ prof_info.async = async;
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info.async_queue = prof_info.async;
+ }
+
if (num_waits)
{
va_list ap;
@@ -734,7 +1035,13 @@ GOACC_wait (int async, int num_waits, ...)
else if (async == acc_async_sync)
acc_wait_all ();
else if (async == acc_async_noval)
- goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval);
+ thr->dev->openacc.async_wait_all_async_func (acc_async_noval);
+
+ if (profiling_setup_p)
+ {
+ thr->prof_info = NULL;
+ thr->api_info = NULL;
+ }
}
int
diff --git a/libgomp/oacc-profiling.c b/libgomp/oacc-profiling.c
index a4671f9ceb9..35d652c6561 100644
--- a/libgomp/oacc-profiling.c
+++ b/libgomp/oacc-profiling.c
@@ -485,10 +485,90 @@ acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
gomp_mutex_unlock (&goacc_prof_lock);
}
+/* Set up to dispatch events?
+
+   Returns true only if this call has registered PROF_INFO and API_INFO in
+   THR and has filled them with default values; the event type is left at -1
+   and must be set later.  A caller that gets true is expected to reset
+   THR->prof_info and THR->api_info to NULL once done, as GOACC_wait does.
+
+   Returns false, without writing to PROF_INFO or API_INFO, if THR is NULL,
+   if information has already been registered in THR for an outer construct,
+   or if goacc_profiling_dispatch_p reports that no events are to be
+   dispatched.  goacc_profiling_dispatch_p is called with "false", that is,
+   without its no-nesting check, because nesting is handled here.  */
+
+bool
+goacc_profiling_setup_p (struct goacc_thread *thr,
+ acc_prof_info *prof_info, acc_api_info *api_info)
+{
+ //TODO NOTE(review): THR is passed for "%p" without a cast to "void *".
+ gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
+
+ /* If we don't have any per-thread state yet, we can't register prof_info and
+ api_info. */
+ /* TODO: In this case, should we actually call goacc_lazy_initialize here,
+ and return the "thr" from goacc_profiling_setup_p? */
+ if (__builtin_expect (thr == NULL, false))
+ {
+ //TODO
+ gomp_debug (0, "Can't generate OpenACC Profiling Interface events for"
+ " the current call, construct, or directive\n");
+ return false;
+ }
+
+ bool profiling_dispatch_p
+ = __builtin_expect (goacc_profiling_dispatch_p (false), false);
+ if (thr->prof_info != NULL)
+ {
+ assert (profiling_dispatch_p); //TODO
+ /* Profiling has already been set up for an outer construct. In this
+ case, we continue to use the existing information, and thus return
+ "false" here.
+
+ This can happen, for example, for an enter data directive, which sets
+ up profiling, then calls into acc_copyin, which should not again set
+ up profiling, should not overwrite the existing information. */
+ //TODO: Is this all kosher?
+ return false;
+ }
+
+ if (profiling_dispatch_p)
+ {
+ thr->prof_info = prof_info; /* Register the caller-provided storage. */
+
+ prof_info->event_type = -1; /* Must be set later. */
+ prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+ prof_info->version = _ACC_PROF_INFO_VERSION;
+ //TODO
+ if (thr->dev)
+ {
+ prof_info->device_type = acc_device_type (thr->dev->type);
+ prof_info->device_number = thr->dev->target_id;
+ }
+ else
+ {
+ prof_info->device_type = -1; /* THR has no device set. */
+ prof_info->device_number = -1;
+ }
+ prof_info->thread_id = -1; //TODO
+ prof_info->async = acc_async_sync; //TODO
+ /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */
+ prof_info->async_queue = prof_info->async;
+ prof_info->src_file = NULL; //TODO
+ prof_info->func_name = NULL; //TODO
+ prof_info->line_no = -1; //TODO
+ prof_info->end_line_no = -1; //TODO
+ prof_info->func_line_no = -1; //TODO
+ prof_info->func_end_line_no = -1; //TODO
+
+ thr->api_info = api_info; /* Likewise, register the caller-provided storage. */
+
+ api_info->device_api = acc_device_api_none; //TODO
+ api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
+ api_info->device_type = prof_info->device_type; /* Same as in PROF_INFO. */
+ api_info->vendor = -1; //TODO
+ api_info->device_handle = NULL; //TODO
+ api_info->context_handle = NULL; //TODO
+ api_info->async_handle = NULL; //TODO
+ }
+
+ return profiling_dispatch_p;
+}
+
/* Prepare to dispatch events? */
bool
-goacc_profiling_dispatch_p (void)
+goacc_profiling_dispatch_p (bool check_not_nested_p)
{
//TODO
gomp_debug (0, "%s\n", __FUNCTION__);
@@ -504,11 +584,21 @@ goacc_profiling_dispatch_p (void)
//TODO
gomp_debug (0, " %s: don't have any per-thread state yet\n", __FUNCTION__);
}
- else if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+ else
{
- //TODO
- gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__);
- return false;
+ if (check_not_nested_p)
+ {
+ /* No nesting. */
+ assert (thr->prof_info == NULL);
+ assert (thr->api_info == NULL);
+ }
+
+ if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+ {
+ //TODO
+ gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__);
+ return false;
+ }
}
gomp_mutex_lock (&goacc_prof_lock);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index dbea9da07fd..a9d1f163799 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1275,10 +1275,38 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
api_info);
}
+ acc_event_info wait_event_info;
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_wait_start;
+
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ wait_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ wait_event_info.other_event.parent_construct
+ /* TODO = compute_construct_event_info.other_event.parent_construct */
+ = acc_construct_parallel; //TODO: kernels...
+ wait_event_info.other_event.implicit = 1;
+ wait_event_info.other_event.tool_info = NULL;
+
+ api_info->device_api = acc_device_api_cuda;
+ }
#ifndef DISABLE_ASYNC
if (async < acc_async_noval)
{
+ if (profiling_dispatch_p)
+ {
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
r = cuStreamSynchronize (dev_str->stream);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_wait_end;
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@@ -1305,7 +1333,19 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
}
#else
+ if (profiling_dispatch_p)
+ {
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
r = cuCtxSynchronize ();
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_wait_end;
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@@ -1664,7 +1704,44 @@ nvptx_wait (int async)
GOMP_PLUGIN_debug (0, " %s: waiting on async=%d\n", __FUNCTION__, async);
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ bool profiling_dispatch_p
+ = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+ acc_event_info wait_event_info;
+ if (profiling_dispatch_p)
+ {
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_api_info *api_info = thr->api_info;
+
+ prof_info->event_type = acc_ev_wait_start;
+
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ wait_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ wait_event_info.other_event.parent_construct
+ /* TODO = compute_construct_event_info.other_event.parent_construct */
+ = acc_construct_parallel; //TODO: kernels...
+ wait_event_info.other_event.implicit = 1;
+ wait_event_info.other_event.tool_info = NULL;
+
+ api_info->device_api = acc_device_api_cuda;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
+ if (profiling_dispatch_p)
+ {
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_api_info *api_info = thr->api_info;
+
+ prof_info->event_type = acc_ev_wait_end;
+
+ wait_event_info.other_event.event_type = prof_info->event_type;
+
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &wait_event_info,
+ api_info);
+ }
event_gc (true);
}
@@ -1706,10 +1783,28 @@ nvptx_wait_all (void)
CUresult r;
struct ptx_stream *s;
pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
+ struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+ struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+ acc_prof_info *prof_info = thr->prof_info;
+ acc_event_info wait_event_info;
+ acc_api_info *api_info = thr->api_info;
+ bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+ if (profiling_dispatch_p)
+ {
+ wait_event_info.other_event.valid_bytes
+ = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+ wait_event_info.other_event.parent_construct
+ /* TODO = compute_construct_event_info.other_event.parent_construct */
+ = acc_construct_parallel; //TODO: kernels...
+ wait_event_info.other_event.implicit = 1;
+ wait_event_info.other_event.tool_info = NULL;
+
+ api_info->device_api = acc_device_api_cuda;
+ }
+
/* Wait for active streams initiated by this thread (or by multiple threads)
to complete. */
for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
@@ -1722,7 +1817,23 @@ nvptx_wait_all (void)
else if (r != CUDA_ERROR_NOT_READY)
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_wait_start;
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+ &wait_event_info,
+ api_info);
+ }
CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
+ if (profiling_dispatch_p)
+ {
+ prof_info->event_type = acc_ev_wait_end;
+ wait_event_info.other_event.event_type = prof_info->event_type;
+ GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+ &wait_event_info,
+ api_info);
+ }
}
}