1 /**
2    CUDA wrapper module
3  */
4 module grain.cuda;
5 version(grain_cuda):
6 
import std.stdio : writeln, writefln;
import std.string : toStringz, fromStringz;
import std.traits : ReturnType, arity;
import std.typecons : RefCounted;

import derelict.cuda;
import derelict.cudnn7;
import grain.cublas;
import grain.utility;
16 
17 
18 
19 
// TODO: support multiple GPU devices (context)
__gshared CUcontext context;           // driver context shared by all threads (created in shared static this)
__gshared cublasHandle_t cublasHandle; // process-wide cuBLAS handle
__gshared cudnnHandle_t cudnnHandle;   // process-wide cuDNN handle
24 
/// global cuda init
/// Runs once per process (shared static ctor): loads the driver, binds the
/// first device to a context, then creates the cuBLAS/cuDNN handles that the
/// rest of this module uses. Order matters: the context must exist before
/// the library handles are created.
shared static this() {
    // Initialize the driver API
    DerelictCUDADriver.load();
    CUdevice device;
    // NOTE(review): cuInit/cuDeviceGet/cuCtxCreate results are not routed
    // through checkCudaErrors here — a failed init surfaces only later.
    cuInit(0);
    // Get a handle to the first compute device
    cuDeviceGet(&device, 0);
    // Create a compute device context
    cuCtxCreate(&context, 0, device);


    // init CUDA libraries
    checkCublasErrors(cublasCreate_v2(&cublasHandle));
    DerelictCuDNN7.load();
    checkCUDNN( cudnnCreate(&cudnnHandle) );
}
42 
/// global cuda exit
/// Tears down in reverse order of creation. GC.collect() runs first so that
/// GC-owned wrappers (e.g. CuModule) are destroyed while the context is
/// still alive; destroying the context before them would make their
/// cuModuleUnload/cuMemFree calls fail.
shared static ~this() {
    import core.memory : GC;
    GC.collect();
    // NOTE(review): the cublasDestroy_v2 status is intentionally unchecked?
    // — the cuDNN and context teardown below are checked. TODO confirm.
    cublasDestroy_v2(cublasHandle);
    checkCUDNN( cudnnDestroy(cudnnHandle) );
    checkCudaErrors(cuCtxDestroy(context));
}
51 
/// cuda module compiled from ptx string
struct CuModule {
    CUmodule cuModule;

    /// JIT-compile and load a null-terminated PTX string into this module
    this(string ptxstr) {
        auto ptxz = cast(void*) ptxstr.toStringz;
        checkCudaErrors(cuModuleLoadData(&this.cuModule, ptxz));
    }

    /// unload the module from the driver when this wrapper dies
    ~this() {
        checkCudaErrors(cuModuleUnload(this.cuModule));
    }

    /// create a kernel function object bound to the device function F
    auto kernel(alias F)() {
        return Kernel!F(this.cuModule);
    }
}
72 
/// global accessor for the cuda module in grain
class Global {
    import K = grain.kernel;
    private this() {}

    // Cache instantiation flag in thread-local bool
    // Thread local
    private static bool instantiated_ = false, cxxInstantiated_ = false;

    // Thread global
    private __gshared CuModule* module_, cxxModule_;

    /// lazily JIT-load the PTX module holding the D kernels (K.ptx).
    /// Fixes: (1) this previously loaded K.cxxptx, so D-mangled kernels
    /// (e.g. saxpy used by Kernel!F via F.mangleof) could never resolve;
    /// (2) the flag is thread-local while module_ is __gshared, so a second
    /// thread could overwrite (leak + reload) an existing module — now
    /// guarded by a null check inside the lock.
    static get() {
        if (!instantiated_) {
            synchronized(Global.classinfo) {
                if (module_ is null) {
                    module_ = new CuModule(K.ptx);
                }
                instantiated_ = true;
            }
        }
        return module_;
    }

    /// lazily JIT-load the PTX module holding the C++ kernels (K.cxxptx)
    static getCxx() {
        if (!cxxInstantiated_) {
            synchronized(Global.classinfo) {
                if (cxxModule_ is null) {
                    cxxModule_ = new CuModule(K.cxxptx);
                }
                cxxInstantiated_ = true;
            }
        }
        return cxxModule_;
    }

    /// resolve a kernel in the C++ PTX module by its exported name and
    /// bind `args` into a Launcher for it
    static cxxKernel(T...)(string name, T args) {
        CUfunction cuFunction;
        writeln("getFunction...");
        // pass the raw CUmodule handle — previously the CuModule* wrapper
        // itself was passed, which is not what cuModuleGetFunction expects
        checkCudaErrors(cuModuleGetFunction(&cuFunction, getCxx().cuModule, name.toStringz));
        writeln("getFunction...");
        return Launcher!T(cuFunction, args);
    }

    /// kernel launcher factory for the D device function F
    static kernel(alias F)() {
        return get().kernel!F;
    }
}
121 
/// ditto
auto global() {
    // free-function convenience wrapper around Global.get()
    auto m = Global.get();
    return m;
}
126 
127 // pthread error ?
128 // auto CUDA_POST_KERNEL_CHECK() {
129 //     checkCudaErrors(cudaPeekAtLastError());
130 // }
131 
/// cuda kernel function launcher with runtime numbers of blocks/threads
struct Launcher(Args...) {
    CUfunction cuFunction;
    Args args;

    /// build a void*[] of addresses of the given call arguments.
    /// CAUTION: the pointers reference this function's own by-value
    /// parameters and dangle as soon as it returns; kept only for backward
    /// compatibility — the launch methods below no longer use it.
    auto kernelParams(T...)(T args) {
        void*[args.length] ret;
        foreach (i, a; args) {
            ret[i] = &a;
        }
        return ret;
    }

    /// detailed launch function
    /// Params:
    ///   grid = grid dimensions (x, y, z) in blocks
    ///   block = block dimensions (x, y, z) in threads
    ///   sharedMemBytes = dynamic shared memory per block
    ///   stream = CUDA stream (null = default stream)
    void launch(uint[3] grid, uint[3] block, uint sharedMemBytes=0, CUstream stream=null) {
        // FIX: take addresses of the member tuple `args` so they stay valid
        // for the whole cuLaunchKernel call — the old kernelParams(args)
        // returned addresses inside an already-popped stack frame.
        void*[Args.length] params;
        foreach (i, Unused; Args) {
            params[i] = cast(void*) &this.args[i];
        }
        checkCudaErrors(cuLaunchKernel(
                            cuFunction,
                            grid[0], grid[1], grid[2],
                            block[0], block[1], block[2],
                            sharedMemBytes, stream,
                            params.ptr, null));
        // CUDA_POST_KERNEL_CHECK();
    }

    // TODO __CUDA_ARCH__ < 200 512
    enum CUDA_NUM_THREADS = 1024;

    /// ceiling division: blocks needed so blocks * CUDA_NUM_THREADS >= n
    static getBlocks(uint n) {
        return (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
    }

    /// convenient launch function: 1-D grid covering n threads
    void launch(uint n=1, uint sharedMemBytes=0, CUstream stream=null) {
        this.launch([getBlocks(n), 1u, 1u],
                    [cast(uint) CUDA_NUM_THREADS, 1u, 1u],
                    sharedMemBytes, stream);
    }
}
175 
176 
/// cuda function object called by mangled name of C++/D device function F
struct Kernel(alias F) if (is(ReturnType!F == void)) {
    // enum name = __traits(identifier, F);
    enum name = F.mangleof;
    CUfunction cuFunction;

    /// look up the device function by its mangled name inside module m
    this(CUmodule m) {
        checkCudaErrors(cuModuleGetFunction(&this.cuFunction, m, name.toStringz));
    }

    // TODO: compile-time type check like d-nv
    // TODO: separate this to struct Launcher
    /// bind the call arguments and hand back a Launcher for them
    auto call(T...)(T args) {
        static assert(args.length == arity!F);
        return Launcher!T(this.cuFunction, args);
    }
}
198 
/// alias to element type of cuda storage (CuPtr!float -> float, etc.)
alias CudaElementType(M : CuPtr!T, T) = T;
/// ditto
alias CudaElementType(M : CuArray!T, T) = T;
/// ditto
alias CudaElementType(M : RefCounted!(CuPtr!T), T) = T;
/// ditto
alias CudaElementType(M : RefCounted!(CuArray!T), T) = T;

/// trait to identify cuda storage: true iff T exposes a CUdeviceptr `ptr`
/// member and a const(size_t) `length` member (duck-typed via typeof block)
enum bool isDeviceMemory(T) = is(typeof({
            static assert(is(typeof(T.init.ptr) == CUdeviceptr));
            static assert(is(typeof(T.init.length) == const(size_t)));
        }));
213 
214 ///
215 unittest {
216     static assert(is(typeof(CuArray!float.init.ptr) == CUdeviceptr));
217     static assert(is(typeof(CuArray!float.init.length) == const(size_t)));
218     static assert(isDeviceMemory!(CuPtr!float));
219     static assert(isDeviceMemory!(CuArray!float));
220     static assert(isDeviceMemory!(RefCounted!(CuPtr!float)));
221     static assert(isDeviceMemory!(RefCounted!(CuArray!float)));
222     static assert(is(CudaElementType!(CuPtr!float) == float));
223     static assert(is(CudaElementType!(CuArray!float) == float));
224     static assert(is(CudaElementType!(RefCounted!(CuPtr!float)) == float));
225     static assert(is(CudaElementType!(RefCounted!(CuArray!float)) == float));
226 }
227 
/// true if the device storage holds zero elements
bool empty(M)(M m) if (isDeviceMemory!M) {
    const n = m.length;
    return n == 0;
}
232 
/// copy device memory to host (maybe reallocate in host)
ref toHost(M, T)(ref M m, scope ref T[] host) if (isDeviceMemory!M) {
    alias E = CudaElementType!M;
    // grow/shrink the host slice to match the device element count
    host.length = m.length;
    checkCudaErrors(cuMemcpyDtoH(host.ptr, m.ptr, E.sizeof * m.length));
    return host;
}
239 
/// copy device memory into a preallocated host buffer (CAUTION: no
/// reallocation — caller must guarantee room for m.length elements)
auto toHost(M, T)(ref M m, T* host) if (isDeviceMemory!M) {
    const nbytes = CudaElementType!M.sizeof * m.length;
    checkCudaErrors(cuMemcpyDtoH(host, m.ptr, nbytes));
    return host;
}
245 
/// allocate a fresh host array and copy the device contents into it
auto toHost(M)(ref M m) if (isDeviceMemory!M) {
    alias T = CudaElementType!M;
    auto host = new T[m.length];
    // delegate to the pointer overload for the actual D->H copy
    m.toHost(host.ptr);
    return host;
}
253 
254 ///
255 unittest {
256     foreach (i; 0 .. 100) {
257         auto d = CuPtr!float([3.0]);
258         assert(d.toHost() == [3.0]);
259     }
260 }
261 
262 
/// fat pointer in CUDA
/// Owning device-memory handle: `ptr` is the raw device address, `length`
/// the element count (const after construction — a CuPtr cannot be resized).
/// Not copyable; ownership moves, and the dtor frees the allocation.
struct CuPtr(T) {
    CUdeviceptr ptr = 0;
    const size_t length = 0;

    /// create copy of host array into device
    this(T[] host) {
        this(host.length);
        // NOTE(review): &host[0] range-errors on an empty slice; presumably
        // callers never pass [] here — TODO confirm
        checkCudaErrors(cuMemcpyHtoD(ptr, &host[0], T.sizeof * length));
    }

    @disable this(this); // not copyable
    @disable new(size_t); // not allocatable on heap

    /// create uninitialized T.sizeof * n array in device
    /// (n == 0 allocates nothing and leaves ptr == 0)
    this(size_t n) {
        this.length = n;
        if (n > 0) {
            checkCudaErrors(cuMemAlloc(&this.ptr, T.sizeof * this.length));
        }
    }

    /// create fat pointer from raw pointer and its length
    /// (takes ownership: the dtor will cuMemFree p)
    this(CUdeviceptr p, size_t l) {
        this.ptr = p;
        this.length = l;
    }

    /// dtor calling cuMemFree; ptr is zeroed so a repeated dtor run is a no-op
    ~this() {
        if (ptr != 0x0) checkCudaErrors(cuMemFree(ptr));
        ptr = 0x0;
    }

    /// reinterpret the device address as a typed pointer
    /// (a device pointer — NOT dereferencable on the host)
    inout T* data() {
        return cast(T*) this.ptr;
    }
}
301 
302 
/// duplicate cuda memory (deep copy)
auto dup(M)(ref M m) if (isDeviceMemory!M) {
    alias T = CudaElementType!M;
    CUdeviceptr d = 0;
    const n = m.length;
    if (n > 0) {
        const nbytes = T.sizeof * n;
        checkCudaErrors(cuMemAlloc(&d, nbytes));
        checkCudaErrors(cuMemcpyDtoD(d, m.ptr, nbytes));
    }
    // hand ownership of the new allocation to the returned CuPtr
    return CuPtr!T(d, n);
}
313 
314 
/// sub-region on CuPtr!T
/// Non-owning view: shares the underlying CuPtr via RefCounted and caches
/// the absolute device address of the view start in `ptr` (alias this).
struct CuArray(T) {
    import std.typecons : RefCounted;
    const RefCounted!(CuPtr!T) storage;
    // absolute device address of the first element of this view
    CUdeviceptr ptr;
    alias ptr this;

    /// take ownership of storage and view it from the beginning
    this(CuPtr!T storage) {
        import std.algorithm : move;
        this.storage = move(storage);
        this.ptr = this.storage.ptr;
    }

    /// take ownership of storage, viewing from an offset
    /// NOTE(review): offset is added to the raw CUdeviceptr, i.e. it is a
    /// BYTE offset, not an element offset — confirm against callers
    this(CuPtr!T storage, size_t offset) {
        import std.algorithm : move;
        this.storage = move(storage);
        this.ptr = this.storage.ptr + offset;
    }

    /// share an already ref-counted storage, viewing from an offset
    this(RefCounted!(CuPtr!T) storage, size_t offset=0) {
        this.storage = storage;
        this.ptr = this.storage.ptr + offset;
    }

    /// allocate device storage holding a copy of the host array
    this(T[] host) {
        this.storage = CuPtr!T(host);
        this.ptr = this.storage.ptr;
    }

    /// not allocatable on heap
    @disable new(size_t);

    /// create uninitialized T.sizeof * n array in device
    this(size_t n) {
        this.storage = CuPtr!T(n);
        this.ptr = this.storage.ptr;
    }

    /// remaining extent of the view (0 for a null view)
    /// NOTE(review): `storage.ptr + storage.length` adds an element count to
    /// a raw address, while `end - this.ptr` is an address difference —
    /// consistent only under one offset convention; TODO confirm units
    @property
    const length() {
        if (this.ptr == 0) return 0;
        auto end = this.storage.ptr + this.storage.length;
        assert(end >= this.ptr);
        return end - this.ptr;
    }
}
366 
367 
/// deep copy inter device memory without allocation
/// (lengths must already match — enforced by the in-contract)
void copy(T)(ref CuPtr!T src, ref CuPtr!T dst)
    in { assert(src.length == dst.length); }
do {
    const nbytes = T.sizeof * src.length;
    checkCudaErrors(cuMemcpyDtoD(dst.ptr, src.ptr, nbytes));
}
374 
/// fill value for N elements from the first position
/// TODO use cudnnSetTensor
ref fill_(S, V)(ref S storage, V v, size_t N) if (isDeviceMemory!S) {
    alias T = CudaElementType!S;
    auto value = T(v);
    import std.traits : isFloatingPoint;
    // static if (isFloatingPoint!T) {
    //     import derelict.cudnn7;
    //     import grain.cudnn : fill, makeCudnnTensor;
    //     auto a = S(N);
    //     checkCUDNN( cudnnSetTensor(cudnnHandle, a.makeCudnnTensor, cast(void*) a.ptr, cast(const void*) &value) );
    // } else {
    import std.conv : to;
    import std.traits : Parameters;
    // pick cuMemsetD8/D16/D32 matching the element width and bit-copy the
    // fill value into the memset's integer parameter type
    mixin("alias _memset = cuMemsetD" ~  to!string(T.sizeof * 8) ~ ";");
    alias Bytes = Parameters!(_memset)[1];
    static assert(Bytes.sizeof == T.sizeof);
    // FIX: the driver call returns a CUresult that was silently discarded
    checkCudaErrors(_memset(storage.ptr, *(cast(Bytes*) &value), N));
    //}
    return storage;
}
396 
/// fill every element of the device array with `value`
ref fill_(S, V)(ref S storage, V value) if (isDeviceMemory!S) {
    return storage.fill_(value, storage.length);
}

/// set every element of the device array to zero
ref zero_(S)(ref S storage) if (isDeviceMemory!S) {
    return storage.fill_(CudaElementType!S(0));
}
406 
407 
/// create zero filled N elements array on the device
auto zeros(S)(size_t N) if (isDeviceMemory!S) {
    import std.algorithm : move;
    auto buf = CuPtr!(CudaElementType!S)(N);
    buf.zero_();
    // CuPtr is non-copyable: move ownership out to the caller
    return move(buf);
}
415 
416 
/// cuda error checker: asserts err == CUDA_SUCCESS with a message carrying
/// the driver's error name/description plus the call site (file/line/func)
void checkCudaErrors(string file = __FILE__, size_t line = __LINE__,
                     string mod = __MODULE__, string func = __FUNCTION__)(CUresult err) {
    import std.format : format;
    const(char)* errName, errStr;
    cuGetErrorName(err, &errName);
    cuGetErrorString(err, &errStr);
    assert(err == CUDA_SUCCESS,
           format!"%s: %s from %s @%s:%s"(
               errName.fromStringz, errStr.fromStringz,
               func, file, line));
}
429 
/// cublas error checker
/// Now reports the call site (file/line) like checkCudaErrors/checkCUDNN;
/// the template parameters default from the caller, so existing call sites
/// `checkCublasErrors(expr)` are unchanged.
void checkCublasErrors(string file = __FILE__, size_t line = __LINE__)(cublasStatus_t err) {
    import std.format : format;
    assert(err == CUBLAS_STATUS_SUCCESS,
           cublasGetErrorEnum(err) ~ format!" at %s (%d)"(file, line));
}
434 
/// cudnn error checker: asserts success, appending the call site to the
/// cuDNN-provided error string
void checkCUDNN(string file = __FILE__, size_t line = __LINE__)(cudnnStatus_t err) {
    import std.format : format;
    assert(err == CUDNN_STATUS_SUCCESS,
           cudnnGetErrorString(err).fromStringz ~ format!" at %s (%d)"(file, line));
}
441 
/// example to launch kernel
/// End-to-end smoke test: host data -> device, launch the D saxpy kernel
/// over n elements, copy back, and verify c[i] == a[i] + b[i].
unittest {
    import grain.kernel; // : saxpy;

    // Populate input
    uint n = 16;
    auto hostA = new float[n];
    auto hostB = new float[n];
    auto hostC = new float[n];
    foreach (i; 0 .. n) {
        hostA[i] = i;
        hostB[i] = 2 * i;
        hostC[i] = 0;
    }

    // Device data
    auto devA = CuPtr!float(hostA);
    auto devB = CuPtr!float(hostB);
    auto devC = CuPtr!float(n);

    // Kernel launch (1-D grid sized by the convenience launch(n))
    Global.kernel!(saxpy).call(devC.ptr, devA.ptr, devB.ptr, n).launch(n);

    // Validation (toHost blocks via a synchronous D2H copy)
    devC.toHost(hostC);
    foreach (i; 0 .. n) {
        // writefln!"%f + %f = %f"(hostA[i], hostB[i], hostC[i]);
        assert(hostA[i] + hostB[i] == hostC[i]);
    }
}
472 
473 
/// naive device sum launched as a single block with a single thread
/// (testing/reference only — not a parallel reduction)
float sumNaive(S)(ref S a) if (isDeviceMemory!S) {
    import grain.kernel : sum;
    // one-element device accumulator initialized to zero
    auto b = CuPtr!float([0]);
    auto N = cast(int) a.length;
    Global.kernel!sum.call(a.ptr, b.ptr, N)
        .launch(cast(uint[3]) [1U,1,1], cast(uint[3]) [1U,1,1], 0U);
    // block until the kernel finishes before reading the result back
    checkCudaErrors(cuCtxSynchronize());
    return b.toHost[0];
}

unittest {
    auto a = CuPtr!float([3, 4, 5]);
    assert(a.sumNaive == 3+4+5);
}
488 
489 
490 
/// C++-side reduction implemented with thrust (linked from the cpp sources)
extern (C++) float sum_thrust(float*, uint n);

/// device-wide sum delegating to the thrust reduction above
float sum(S)(ref S a) if (isDeviceMemory!S) {
    auto p = cast(float*) a.ptr;
    return sum_thrust(p, cast(uint) a.length);
}

unittest {
    auto v = CuPtr!float([2, 4, 5, 6]);
    assert(sum(v) == 2+4+5+6);
}
503 
504 /*
505 // test cxx kernel
506 unittest {
507     auto a = CuPtr!float([3, 4, 5]);
508     auto b = CuPtr!float([0]);
509     auto N = cast(int) a.length;
510     assert(N == 3);
511     Global.cxxKernel("sum_naive", a.ptr, b.ptr, N)
512         .launch(cast(uint[3]) [1U,1,1], cast(uint[3]) [1U,1,1], 0U);
513     // checkCudaErrors(cuCtxSynchronize());
514     writeln(b.toHost());
515     assert(b.toHost()[0] == 3+4+5);
516 }
517 */
518 
/// example to fill value
/// exercises zero_ then fill_ on the same allocation and checks both
/// round-trip through toHost
unittest {
    auto d = CuPtr!float(3);
    d.zero_();
    auto h = d.toHost();
    assert(h == [0, 0, 0]);
    // assert(zeros!(CuPtr!float)(3).toHost() == [0, 0, 0]);
    assert(d.fill_(3).toHost() == [3, 3, 3]);
}
528 
529 
/// high-level axpy (y = alpha * x + y) wrapper for CuArray
/// Params:
///   x = input vector (read-only); its length sets the element count
///   y = accumulated output vector; must hold at least x.length elements
///   alpha = scale applied to x (default 1)
///   incx, incy = strides for x and y (default 1)
void axpy(T)(const ref CuArray!T x, ref CuArray!T y, T alpha=1, int incx=1, int incy=1)  {
    static if (is(T == float)) {
        alias axpy_ = cublasSaxpy_v2;
    } else static if (is(T == double)) {
        alias axpy_ = cublasDaxpy_v2;
    } else {
        static assert(false, "unsupported type: " ~ T.stringof);
    }
    // use the shared checker instead of a raw assert, consistent with the
    // rest of this module's cuBLAS error handling
    checkCublasErrors(axpy_(cublasHandle, cast(int) x.length, &alpha,
                            cast(const T*) x.ptr, incx,
                            cast(T*) y.ptr, incy));
}
544 
/// cublas tests
unittest {
    auto a = CuArray!float([3, 4, 5]);
    auto b = CuArray!float([1, 2, 3]);
    // b = 2 * a + b
    axpy(a, b, 2.0);
    assert(a.toHost() == [3, 4, 5]);
    assert(b.toHost() == [7, 10, 13]);
}