parallel-libs/streamexecutor/examples/CUDASaxpy.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains example code demonstrating the usage of the
/// StreamExecutor API. Snippets of this file will be included as code examples
/// in documentation. Taking these examples from a real source file guarantees
/// that the examples will always compile.
///
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <vector>

#include "streamexecutor/StreamExecutor.h"

/// [Example saxpy compiler-generated]
// Code in this namespace is generated by the compiler (e.g. clang).
//
// The name of this namespace may depend on the compiler that generated it, so
// this is just an example name.
namespace __compilergen {

// Kernel type matching the saxpy(float A, float *X, float *Y) signature: one
// scalar float argument followed by two global device memory buffers of float.
using SaxpyKernel =
    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
                           streamexecutor::GlobalDeviceMemory<float>>;

// PTX source for the saxpy kernel as produced by the device compiler. Only the
// declaration lives here; the string itself is defined elsewhere in this file.
extern const char *SaxpyPTX;

// Builds the loader spec that registers the in-memory PTX text in SaxpyPTX
// under the kernel name "saxpy".
//
// NOTE(review): the {2, 0} pair is presumably the compute capability the PTX
// was built for (the PTX text targets sm_20) -- confirm against the
// MultiKernelLoaderSpec::addCUDAPTXInMemory documentation.
static streamexecutor::MultiKernelLoaderSpec makeSaxpyLoaderSpec() {
  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
  LoaderSpec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
  return LoaderSpec;
}

// The single global loader spec instance used to create SaxpyKernel objects.
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec =
    makeSaxpyLoaderSpec();

} // namespace __compilergen
/// [Example saxpy compiler-generated]

/// [Example saxpy host PTX]
// The PTX text for a saxpy kernel.
//
// The kernel entry point is named "saxpy" and takes a float A plus two 64-bit
// addresses X and Y. Each thread:
//   1. reads its index from %tid.x and multiplies it by 4 to get a byte
//      offset (one 32-bit float per thread),
//   2. loads one float from X and one from Y at that offset,
//   3. computes A * X[tid] + Y[tid] with a single fused multiply-add, and
//   4. stores the result back to the X address (Y is only read).
//
// Only %tid.x is used for indexing and the kernel performs no bounds check,
// so it relies on being launched with one thread per array element.
const char *__compilergen::SaxpyPTX = R"(
  .version 4.3
  .target sm_20
  .address_size 64

  .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
    .reg .f32 %AValue;
    .reg .f32 %XValue;
    .reg .f32 %YValue;
    .reg .f32 %Result;

    .reg .b64 %XBaseAddrGeneric;
    .reg .b64 %YBaseAddrGeneric;
    .reg .b64 %XBaseAddrGlobal;
    .reg .b64 %YBaseAddrGlobal;
    .reg .b64 %XAddr;
    .reg .b64 %YAddr;
    .reg .b64 %ThreadByteOffset;

    .reg .b32 %TID;

    ld.param.f32 %AValue, [A];
    ld.param.u64 %XBaseAddrGeneric, [X];
    ld.param.u64 %YBaseAddrGeneric, [Y];
    cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
    cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
    mov.u32 %TID, %tid.x;
    mul.wide.u32 %ThreadByteOffset, %TID, 4;
    add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
    add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
    ld.global.f32 %XValue, [%XAddr];
    ld.global.f32 %YValue, [%YAddr];
    fma.rn.f32 %Result, %AValue, %XValue, %YValue;
    st.global.f32 [%XAddr], %Result;
    ret;
  }
)";
/// [Example saxpy host PTX]

/// Runs the saxpy example end to end on device 0 of the "CUDA" platform.
///
/// Copies two small host vectors to the device, launches the saxpy kernel,
/// copies the result back into HostX and compares it with the expected values.
///
/// \returns EXIT_FAILURE if the platform reports no devices or if the result
/// read back from the device differs from the expected values; EXIT_SUCCESS
/// otherwise.
int main() {
  /// [Example saxpy host main]
  namespace se = ::streamexecutor;
  namespace cg = ::__compilergen;

  // Create some host data.
  float A = 42.0f;
  std::vector<float> HostX = {0, 1, 2, 3};
  std::vector<float> HostY = {4, 5, 6, 7};
  size_t ArraySize = HostX.size();

  // Get a device object.
  // NOTE(review): getOrDie presumably terminates the process when the wrapped
  // result holds an error -- confirm against the StreamExecutor headers.
  se::Platform *Platform =
      getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
  if (Platform->getDeviceCount() == 0) {
    return EXIT_FAILURE;
  }
  se::Device *Device = getOrDie(Platform->getDevice(0));

  // Load the kernel onto the device.
  cg::SaxpyKernel Kernel =
      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));

  // Register the host vectors with the device so they can be used as the
  // source and destination of the stream copies below.
  se::RegisteredHostMemory<float> RegisteredX =
      getOrDie(Device->registerHostMemory<float>(HostX));
  se::RegisteredHostMemory<float> RegisteredY =
      getOrDie(Device->registerHostMemory<float>(HostY));

  // Allocate memory on the device.
  se::GlobalDeviceMemory<float> X =
      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
  se::GlobalDeviceMemory<float> Y =
      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));

  // Run operations on a stream: copy both inputs to the device, launch the
  // kernel, then copy X (which the kernel overwrites with the result) back.
  // NOTE(review): the (ArraySize, 1) launch arguments are presumably the block
  // size and grid size, i.e. one thread per element in a single block --
  // confirm against Stream::thenLaunch.
  se::Stream Stream = getOrDie(Device->createStream());
  Stream.thenCopyH2D(RegisteredX, X)
      .thenCopyH2D(RegisteredY, Y)
      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
      .thenCopyD2H(X, RegisteredX);
  // Wait for the stream to complete.
  se::dieIfError(Stream.blockHostUntilDone());

  // Process output data in HostX.
  //
  // The check is an explicit comparison rather than an assert so that it still
  // runs when NDEBUG is defined (assert expands to nothing in that case).
  // Comparing whole vectors also catches a size mismatch. Exact floating-point
  // equality is intended: every expected value is a small integer that float
  // represents exactly.
  std::vector<float> ExpectedX = {4, 47, 90, 133};
  if (HostX != ExpectedX) {
    return EXIT_FAILURE;
  }
  /// [Example saxpy host main]
  return EXIT_SUCCESS;
}