summaryrefslogtreecommitdiff
path: root/parallel-libs/streamexecutor/include/streamexecutor/DeviceMemory.h
blob: b3b0fd2faf2a7d13c5770c04ed88e34ffc727231 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
//===-- DeviceMemory.h - Types representing device memory -------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines types that represent device memory buffers. Two memory
/// spaces are represented here: global and shared. Host code can have a handle
/// to device global memory, and that handle can be used to copy data to and
/// from the device. Host code cannot have a handle to device shared memory
/// because that memory only exists during the execution of a kernel.
///
/// GlobalDeviceMemoryBase is similar to a pair consisting of a void* pointer
/// and a byte count to tell how much memory is pointed to by that void*.
///
/// GlobalDeviceMemory<T> is a subclass of GlobalDeviceMemoryBase which keeps
/// track of the type of element to be stored in the device array. It is similar
/// to a pair of a T* pointer and an element count to tell how many elements of
/// type T fit in the memory pointed to by that T*.
///
/// SharedDeviceMemoryBase is just the size in bytes of a shared memory buffer.
///
/// SharedDeviceMemory<T> is a subclass of SharedDeviceMemoryBase which knows
/// how many elements of type T it can hold.
///
/// These classes are useful for keeping track of which memory space a buffer
/// lives in, and the typed subclasses are useful for type-checking.
///
/// The typed subclasses will be used by user code, and the untyped base classes
/// will be used for type-unsafe operations inside of StreamExecutor.
///
//===----------------------------------------------------------------------===//

#ifndef STREAMEXECUTOR_DEVICEMEMORY_H
#define STREAMEXECUTOR_DEVICEMEMORY_H

#include <cassert>
#include <cstddef>

namespace streamexecutor {

/// Wrapper around a generic global device memory allocation.
///
/// This class represents a buffer of untyped bytes in the global memory space
/// of a device. See GlobalDeviceMemory<T> for the corresponding type that
/// includes type information for the elements in its buffer.
///
/// This is effectively a pair consisting of an opaque handle and a buffer size
/// in bytes. The opaque handle is a platform-dependent handle to the actual
/// memory that is allocated on the device.
///
/// In some cases, such as in the CUDA platform, the opaque handle may actually
/// be a pointer in the virtual address space and it may be valid to perform
/// arithmetic on it to obtain other device pointers, but this is not the case
/// in general.
///
/// For example, in the OpenCL platform, the handle is a pointer to a _cl_mem
/// handle object which really is completely opaque to the user.
///
/// The only fully platform-generic operations on handles are using them to
/// create new GlobalDeviceMemoryBase objects, and comparing them to each other
/// for equality.
class GlobalDeviceMemoryBase {
  // Data members are listed first; class members are private until the first
  // access specifier.
  const void *OpaqueHandle; // Platform-dependent value naming the allocation.
  std::size_t SizeInBytes;  // Length of the allocation, measured in bytes.

public:
  /// Pairs a platform-dependent handle with the size of its allocation.
  ///
  /// Either argument may be omitted; with no arguments the result holds a null
  /// handle and a byte count of zero.
  ///
  /// \param Handle platform-dependent value identifying the device memory.
  /// \param ByteCount size of that memory in bytes.
  explicit GlobalDeviceMemoryBase(const void *Handle = nullptr,
                                  std::size_t ByteCount = 0)
      : OpaqueHandle{Handle}, SizeInBytes{ByteCount} {}

  /// Copy construction duplicates the handle and the byte count, in the same
  /// way that copying a pointer duplicates the pointer value.
  GlobalDeviceMemoryBase(const GlobalDeviceMemoryBase &) = default;

  /// Copy assignment overwrites the handle and the byte count, in the same way
  /// that assigning a pointer overwrites the pointer value.
  GlobalDeviceMemoryBase &operator=(const GlobalDeviceMemoryBase &) = default;

  /// Returns the stored handle.
  ///
  /// Caution: the value is platform-dependent and is not necessarily an
  /// address in the device's virtual address space.
  const void *getHandle() const { return OpaqueHandle; }

  /// Returns how many bytes the backing memory occupies.
  std::size_t getByteCount() const { return SizeInBytes; }
};

/// Typed wrapper around the "void *"-like GlobalDeviceMemoryBase class.
///
/// For example, GlobalDeviceMemory<int> is a simple wrapper around
/// GlobalDeviceMemoryBase that represents a buffer of integers stored in global
/// device memory.
template <typename ElemT>
class GlobalDeviceMemory : public GlobalDeviceMemoryBase {
public:
  /// Creates a typed area of GlobalDeviceMemory with a given opaque handle and
  /// the given element count.
  ///
  /// \param Handle platform-dependent value identifying the device memory.
  /// \param ElementCount number of ElemT elements in the buffer. The
  ///        corresponding byte count, ElementCount * sizeof(ElemT), must be
  ///        representable as a size_t; this is checked with an assertion.
  static GlobalDeviceMemory<ElemT> makeFromElementCount(const void *Handle,
                                                        size_t ElementCount) {
    return GlobalDeviceMemory<ElemT>(Handle, ElementCount);
  }

  /// Creates a typed device memory region from an untyped device memory region.
  ///
  /// This effectively amounts to a cast from a void* to an ElemT*. The handle
  /// and the byte count of \p Other are carried over unchanged; the element
  /// count is derived from the byte count on demand by getElementCount().
  explicit GlobalDeviceMemory(const GlobalDeviceMemoryBase &Other)
      : GlobalDeviceMemoryBase(Other.getHandle(), Other.getByteCount()) {}

  /// Copyable like a pointer.
  GlobalDeviceMemory(const GlobalDeviceMemory &) = default;

  /// Copy-assignable like a pointer.
  GlobalDeviceMemory &operator=(const GlobalDeviceMemory &) = default;

  /// Returns the number of elements of type ElemT that constitute this
  /// allocation.
  ///
  /// The result is the byte count divided by sizeof(ElemT) using integer
  /// division, so a trailing partial element is not counted.
  size_t getElementCount() const { return getByteCount() / sizeof(ElemT); }

private:
  /// Converts an element count into the equivalent byte count.
  ///
  /// Unsigned multiplication wraps around silently, which would record a byte
  /// count smaller than the buffer the caller asked for. The assertion catches
  /// that case in builds where assertions are enabled.
  static size_t elementCountToByteCount(size_t ElementCount) {
    assert(ElementCount <= static_cast<size_t>(-1) / sizeof(ElemT) &&
           "element count is too large to be expressed as a byte count");
    return ElementCount * sizeof(ElemT);
  }

  /// Constructs a GlobalDeviceMemory instance from an opaque handle and an
  /// element count.
  ///
  /// This constructor is not public because there is a potential for confusion
  /// between the size of the buffer in bytes and the size of the buffer in
  /// elements.
  ///
  /// The static method makeFromElementCount is provided for users of this class
  /// because its name makes the meaning of the size parameter clear.
  GlobalDeviceMemory(const void *Handle, size_t ElementCount)
      : GlobalDeviceMemoryBase(Handle, elementCountToByteCount(ElementCount)) {}
};

/// A class to represent the size of a dynamic shared memory buffer on a device.
///
/// This class maintains no information about the types to be stored in the
/// buffer. For the typed version of this class see SharedDeviceMemory<ElemT>.
///
/// Shared memory buffers exist only on the device and cannot be manipulated
/// from the host, so instances of this class do not have an opaque handle, only
/// a size.
///
/// This type of memory is called "local" memory in OpenCL and "shared" memory
/// in CUDA, and both platforms follow the rule that the host code only knows
/// the size of these buffers and does not have a handle to them.
///
/// The treatment of shared memory in StreamExecutor matches the way it is done
/// in OpenCL, where a kernel takes any number of shared memory sizes as kernel
/// function arguments.
///
/// In CUDA only one shared memory size argument is allowed per kernel call.
/// StreamExecutor handles this by allowing CUDA kernel signatures that take
/// multiple SharedDeviceMemory arguments, and simply adding together all the
/// shared memory sizes to get the final shared memory size that is used to
/// launch the kernel.
class SharedDeviceMemoryBase {
  // Declared first; class members are private until the first access
  // specifier.
  std::size_t SizeInBytes; // Size of the shared memory buffer, in bytes.

public:
  /// Records the size, in bytes, of an untyped shared memory array.
  ///
  /// This constructor is not marked explicit, so a size_t value converts
  /// implicitly to a SharedDeviceMemoryBase.
  ///
  /// \param ByteCount size of the buffer in bytes.
  SharedDeviceMemoryBase(std::size_t ByteCount) : SizeInBytes{ByteCount} {}

  /// Copy construction duplicates the stored size; there is no other state.
  SharedDeviceMemoryBase(const SharedDeviceMemoryBase &) = default;

  /// Copy assignment overwrites the stored size; there is no other state.
  SharedDeviceMemoryBase &operator=(const SharedDeviceMemoryBase &) = default;

  /// Returns the size of the buffer in bytes.
  std::size_t getByteCount() const { return SizeInBytes; }
};

/// Typed wrapper around the untyped SharedDeviceMemoryBase class.
///
/// For example, SharedDeviceMemory<int> is a wrapper around
/// SharedDeviceMemoryBase that represents a buffer of integers stored in shared
/// device memory.
template <typename ElemT>
class SharedDeviceMemory : public SharedDeviceMemoryBase {
public:
  /// Creates a typed area of shared device memory with a given number of
  /// elements.
  ///
  /// \param ElementCount number of ElemT elements in the buffer. The
  ///        corresponding byte count, ElementCount * sizeof(ElemT), must be
  ///        representable as a size_t; this is checked with an assertion.
  static SharedDeviceMemory<ElemT> makeFromElementCount(size_t ElementCount) {
    return SharedDeviceMemory(ElementCount);
  }

  /// Copyable because it is just an array size.
  SharedDeviceMemory(const SharedDeviceMemory &) = default;

  /// Copy-assignable because it is just an array size.
  SharedDeviceMemory &operator=(const SharedDeviceMemory &) = default;

  /// Returns the number of elements of type ElemT that can fit in this memory
  /// buffer, computed as the byte count divided by sizeof(ElemT).
  size_t getElementCount() const { return getByteCount() / sizeof(ElemT); }

  /// Returns whether this is a single-element memory buffer.
  bool isScalar() const { return getElementCount() == 1; }

private:
  /// Converts an element count into the equivalent byte count.
  ///
  /// Unsigned multiplication wraps around silently, which would record a byte
  /// count smaller than the buffer the caller asked for. The assertion catches
  /// that case in builds where assertions are enabled.
  static size_t elementCountToByteCount(size_t ElementCount) {
    assert(ElementCount <= static_cast<size_t>(-1) / sizeof(ElemT) &&
           "element count is too large to be expressed as a byte count");
    return ElementCount * sizeof(ElemT);
  }

  /// Constructs a SharedDeviceMemory instance from an element count.
  ///
  /// This constructor is not public because there is a potential for confusion
  /// between the size of the buffer in bytes and the size of the buffer in
  /// elements.
  ///
  /// The static method makeFromElementCount is provided for users of this class
  /// because its name makes the meaning of the size parameter clear.
  explicit SharedDeviceMemory(size_t ElementCount)
      : SharedDeviceMemoryBase(elementCountToByteCount(ElementCount)) {}
};

} // namespace streamexecutor

#endif // STREAMEXECUTOR_DEVICEMEMORY_H