diff options
Diffstat (limited to 'parallel-libs/streamexecutor/examples/CUDASaxpy.cpp')
-rw-r--r-- | parallel-libs/streamexecutor/examples/CUDASaxpy.cpp | 11 |
1 files changed, 8 insertions, 3 deletions
diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp index eab0cbe69d6..5fb3dba26a7 100644 --- a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp +++ b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp @@ -115,6 +115,11 @@ int main() { cg::SaxpyKernel Kernel = getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); + se::RegisteredHostMemory<float> RegisteredX = + getOrDie(Device->registerHostMemory<float>(HostX)); + se::RegisteredHostMemory<float> RegisteredY = + getOrDie(Device->registerHostMemory<float>(HostY)); + // Allocate memory on the device. se::GlobalDeviceMemory<float> X = getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); @@ -123,10 +128,10 @@ int main() { // Run operations on a stream. se::Stream Stream = getOrDie(Device->createStream()); - Stream.thenCopyH2D<float>(HostX, X) - .thenCopyH2D<float>(HostY, Y) + Stream.thenCopyH2D(RegisteredX, X) + .thenCopyH2D(RegisteredY, Y) .thenLaunch(ArraySize, 1, Kernel, A, X, Y) - .thenCopyD2H<float>(X, HostX); + .thenCopyD2H(X, RegisteredX); // Wait for the stream to complete. se::dieIfError(Stream.blockHostUntilDone()); |