summary | refs | log | tree | commit | diff
path: root/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'parallel-libs/streamexecutor/examples/CUDASaxpy.cpp')
-rw-r--r-- parallel-libs/streamexecutor/examples/CUDASaxpy.cpp 11
1 files changed, 8 insertions, 3 deletions
diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
index eab0cbe69d6..5fb3dba26a7 100644
--- a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
+++ b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
@@ -115,6 +115,11 @@ int main() {
cg::SaxpyKernel Kernel =
getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+ se::RegisteredHostMemory<float> RegisteredX =
+ getOrDie(Device->registerHostMemory<float>(HostX));
+ se::RegisteredHostMemory<float> RegisteredY =
+ getOrDie(Device->registerHostMemory<float>(HostY));
+
// Allocate memory on the device.
se::GlobalDeviceMemory<float> X =
getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
@@ -123,10 +128,10 @@ int main() {
// Run operations on a stream.
se::Stream Stream = getOrDie(Device->createStream());
- Stream.thenCopyH2D<float>(HostX, X)
- .thenCopyH2D<float>(HostY, Y)
+ Stream.thenCopyH2D(RegisteredX, X)
+ .thenCopyH2D(RegisteredY, Y)
.thenLaunch(ArraySize, 1, Kernel, A, X, Y)
- .thenCopyD2H<float>(X, HostX);
+ .thenCopyD2H(X, RegisteredX);
// Wait for the stream to complete.
se::dieIfError(Stream.blockHostUntilDone());