/**
   CUDA wrapper module
 */
module grain.cuda;
version(grain_cuda):

import std.traits : ReturnType, arity;
import std.typecons : RefCounted;
import std.stdio : writeln, writefln;
import std.string : toStringz, fromStringz;

import derelict.cuda;
import derelict.cudnn7;
import grain.cublas;
import grain.utility;


// TODO: support multiple GPU devices (context)
__gshared CUcontext context;
__gshared cublasHandle_t cublasHandle;
__gshared cudnnHandle_t cudnnHandle;

/// global cuda init
shared static this() {
    // Initialize the driver API
    DerelictCUDADriver.load();
    CUdevice device;
    cuInit(0);
    // Get a handle to the first compute device
    cuDeviceGet(&device, 0);
    // Create a compute device context
    cuCtxCreate(&context, 0, device);

    // init CUDA libraries
    checkCublasErrors(cublasCreate_v2(&cublasHandle));
    DerelictCuDNN7.load();
    checkCUDNN( cudnnCreate(&cudnnHandle) );
}

/// global cuda exit
shared static ~this() {
    import core.memory : GC;
    GC.collect();
    cublasDestroy_v2(cublasHandle);
    checkCUDNN( cudnnDestroy(cudnnHandle) );
    checkCudaErrors(cuCtxDestroy(context));
}

/// cuda module compiled from ptx string
struct CuModule {
    CUmodule cuModule;

    ///
    this(string ptxstr) {
        // JIT compile a null-terminated PTX string
        checkCudaErrors(cuModuleLoadData(&cuModule, cast(void*) ptxstr.toStringz));
    }

    ///
    ~this() {
        checkCudaErrors(cuModuleUnload(cuModule));
    }

    ///
    auto kernel(alias F)() {
        return Kernel!F(cuModule);
    }
}

/// global accessor for the cuda module in grain
class Global {
    import K = grain.kernel;
    private this() {}

    // Cache instantiation flags in thread-local bools (low-lock singleton pattern)
    private static bool instantiated_ = false, cxxInstantiated_ = false;

    // Thread global
    private __gshared CuModule* module_, cxxModule_;

    ///
    static get() {
        if (!instantiated_) {
            synchronized(Global.classinfo) {
                module_ = new CuModule(K.ptx);
                instantiated_ = true;
            }
        }
        return module_;
    }

    ///
    static getCxx() {
        if (!cxxInstantiated_) {
            synchronized(Global.classinfo) {
                cxxModule_ = new CuModule(K.cxxptx);
                cxxInstantiated_ = true;
            }
        }
        return cxxModule_;
    }

    ///
    static cxxKernel(T...)(string name, T args) {
        CUfunction cuFunction;
        writeln("getFunction...");
        checkCudaErrors(cuModuleGetFunction(&cuFunction, getCxx().cuModule, name.toStringz));
        writeln("getFunction...");
        return Launcher!T(cuFunction, args);
    }

    ///
    static kernel(alias F)() {
        return get().kernel!F;
    }
}

/// ditto
auto global() {
    return Global.get();
}

// pthread error ?
// auto CUDA_POST_KERNEL_CHECK() {
//     checkCudaErrors(cudaPeekAtLastError());
// }
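
// Minimal usage sketch of the accessor above: assumes a CUDA-capable device
// and the D-compiled `saxpy` kernel from grain.kernel, exactly as the other
// unittests in this module do.
unittest {
    import grain.kernel : saxpy;
    auto m = global();             // lazily JIT-loads the cached PTX module
    auto k = m.kernel!saxpy;       // resolves the kernel by its mangled name
    assert(k.cuFunction !is null); // the CUfunction handle is now usable
}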
/// cuda kernel function launcher with runtime numbers of blocks/threads
struct Launcher(Args...) {
    CUfunction cuFunction;
    Args args;

    /// pack the arguments as a void*[Args.length] array for cuLaunchKernel
    auto kernelParams(T...)(ref T args) {
        void*[args.length] ret;
        // foreach over a parameter tuple is unrolled at compile time and each
        // `a` aliases the referenced argument, so the stored addresses stay
        // valid for the duration of the launch
        foreach (i, a; args) {
            ret[i] = &a;
        }
        return ret;
    }

    /// detailed launch function
    void launch(uint[3] grid, uint[3] block, uint sharedMemBytes=0, CUstream stream=null) {
        checkCudaErrors(cuLaunchKernel(
                            cuFunction,
                            grid[0], grid[1], grid[2],
                            block[0], block[1], block[2],
                            sharedMemBytes, stream,
                            kernelParams(args).ptr, null));
        // CUDA_POST_KERNEL_CHECK();
    }

    // TODO: use 512 threads when __CUDA_ARCH__ < 200
    enum CUDA_NUM_THREADS = 1024;

    /// number of 1-D blocks needed to cover n elements
    static getBlocks(uint n) {
        return (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
    }

    /// convenient 1-D launch function
    void launch(uint n=1, uint sharedMemBytes=0, CUstream stream=null) {
        checkCudaErrors(cuLaunchKernel(
                            cuFunction,
                            getBlocks(n), 1, 1,
                            CUDA_NUM_THREADS, 1, 1,
                            sharedMemBytes, stream,
                            kernelParams(args).ptr, null));
        // CUDA_POST_KERNEL_CHECK();
    }
}


/// cuda function object called by the mangled name of a C++/D device function F
struct Kernel(alias F) if (is(ReturnType!F == void)) {
    // enum name = __traits(identifier, F);
    enum name = F.mangleof;
    CUfunction cuFunction;

    ///
    this(CUmodule m) {
        // writeln("mangled: ", name);
        checkCudaErrors(cuModuleGetFunction(&cuFunction, m, name.toStringz));
    }

    // TODO: compile-time type check like d-nv
    // TODO: separate this to struct Launcher
    auto call(T...)(T args) {
        static assert(args.length == arity!F);
        // Kernel launch
        // checkCudaErrors(cuCtxSynchronize());
        return Launcher!T(cuFunction, args);
    }
}
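
// Sketch of the grid-size arithmetic behind the 1-D launch overload above:
// getBlocks computes ceil(n / CUDA_NUM_THREADS), the fewest blocks covering
// n elements. This is pure host-side integer math, so no device is needed.
unittest {
    alias L = Launcher!();
    static assert(L.CUDA_NUM_THREADS == 1024);
    assert(L.getBlocks(1) == 1);    // one partially filled block
    assert(L.getBlocks(1024) == 1); // exactly one full block
    assert(L.getBlocks(1025) == 2); // one extra element needs a second block
}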
/// alias to element type of cuda storage
alias CudaElementType(M : CuPtr!T, T) = T;
/// ditto
alias CudaElementType(M : CuArray!T, T) = T;
/// ditto
alias CudaElementType(M : RefCounted!(CuPtr!T), T) = T;
/// ditto
alias CudaElementType(M : RefCounted!(CuArray!T), T) = T;

/// trait to identify cuda storage
enum bool isDeviceMemory(T) = is(typeof({
            static assert(is(typeof(T.init.ptr) == CUdeviceptr));
            static assert(is(typeof(T.init.length) == const(size_t)));
        }));

///
unittest {
    static assert(is(typeof(CuArray!float.init.ptr) == CUdeviceptr));
    static assert(is(typeof(CuArray!float.init.length) == const(size_t)));
    static assert(isDeviceMemory!(CuPtr!float));
    static assert(isDeviceMemory!(CuArray!float));
    static assert(isDeviceMemory!(RefCounted!(CuPtr!float)));
    static assert(isDeviceMemory!(RefCounted!(CuArray!float)));
    static assert(is(CudaElementType!(CuPtr!float) == float));
    static assert(is(CudaElementType!(CuArray!float) == float));
    static assert(is(CudaElementType!(RefCounted!(CuPtr!float)) == float));
    static assert(is(CudaElementType!(RefCounted!(CuArray!float)) == float));
}

/// true if length == 0
bool empty(M)(M m) if (isDeviceMemory!M) {
    return m.length == 0;
}

/// copy device memory to host (maybe reallocate in host)
ref toHost(M, T)(ref M m, scope ref T[] host) if (isDeviceMemory!M) {
    host.length = m.length;
    checkCudaErrors(cuMemcpyDtoH(host.ptr, m.ptr, CudaElementType!M.sizeof * m.length));
    return host;
}

/// copy device memory to host (CAUTION: no reallocation here)
auto toHost(M, T)(ref M m, T* host) if (isDeviceMemory!M) {
    checkCudaErrors(cuMemcpyDtoH(host, m.ptr, CudaElementType!M.sizeof * m.length));
    return host;
}

/// allocate host memory and copy device memory content
auto toHost(M)(ref M m) if (isDeviceMemory!M) {
    alias T = CudaElementType!M;
    auto host = new T[m.length];
    checkCudaErrors(cuMemcpyDtoH(host.ptr, m.ptr, T.sizeof * m.length));
    return host;
}

///
unittest {
    foreach (i; 0 .. 100) {
        auto d = CuPtr!float([3.0]);
        assert(d.toHost() == [3.0]);
    }
}


/// fat pointer in CUDA
struct CuPtr(T) {
    CUdeviceptr ptr = 0;
    const size_t length = 0;

    /// create copy of host array into device
    this(T[] host) {
        this(host.length);
        if (host.length > 0) {
            checkCudaErrors(cuMemcpyHtoD(ptr, host.ptr, T.sizeof * length));
        }
    }

    @disable this(this); // not copyable
    @disable new(size_t); // not allocatable on heap

    /// create uninitialized T.sizeof * n array in device
    this(size_t n) {
        this.length = n;
        if (n > 0) {
            checkCudaErrors(cuMemAlloc(&this.ptr, T.sizeof * this.length));
        }
    }

    /// create fat pointer from raw pointer and its length
    this(CUdeviceptr p, size_t l) {
        this.ptr = p;
        this.length = l;
    }

    /// dtor calling cuMemFree
    ~this() {
        if (ptr != 0x0) checkCudaErrors(cuMemFree(ptr));
        ptr = 0x0;
    }

    inout T* data() {
        return cast(T*) this.ptr;
    }
}


/// duplicate cuda memory (deep copy)
auto dup(M)(ref M m) if (isDeviceMemory!M) {
    CUdeviceptr ret;
    alias T = CudaElementType!M;
    if (m.length > 0) {
        checkCudaErrors(cuMemAlloc(&ret, T.sizeof * m.length));
        checkCudaErrors(cuMemcpyDtoD(ret, m.ptr, T.sizeof * m.length));
    }
    return CuPtr!T(ret, m.length);
}


/// sub-region on CuPtr!T
struct CuArray(T) {
    import std.typecons : RefCounted;
    const RefCounted!(CuPtr!T) storage;
    CUdeviceptr ptr;
    alias ptr this;

    ///
    this(CuPtr!T storage) {
        import std.algorithm : move;
        this.storage = move(storage);
        this.ptr = this.storage.ptr;
    }

    ///
    this(CuPtr!T storage, size_t offset) {
        import std.algorithm : move;
        this.storage = move(storage);
        this.ptr = this.storage.ptr + offset;
    }

    ///
    this(RefCounted!(CuPtr!T) storage, size_t offset=0) {
        this.storage = storage;
        this.ptr = this.storage.ptr + offset;
    }

    ///
    this(T[] host) {
        this.storage = CuPtr!T(host);
        this.ptr = this.storage.ptr;
    }

    /// not allocatable on heap
    @disable new(size_t);

    /// create uninitialized T.sizeof * n array in device
    this(size_t n) {
        this.storage = CuPtr!T(n);
        this.ptr = this.storage.ptr;
    }

    ///
    @property
    const length() {
        if (this.ptr == 0) return 0;
        auto end = this.storage.ptr + this.storage.length;
        assert(end >= this.ptr);
        return end - this.ptr;
    }
}


/// deep copy inter device memory without allocation
void copy(T)(ref CuPtr!T src, ref CuPtr!T dst)
in { assert(src.length == dst.length); }
do {
    checkCudaErrors(cuMemcpyDtoD(dst.ptr, src.ptr, T.sizeof * src.length));
}
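
// Usage sketch for dup/copy (assumes a CUDA device, as the unittests here do):
// dup allocates a fresh buffer and deep-copies into it, while copy requires a
// preallocated destination of equal length.
unittest {
    auto src = CuPtr!float([1, 2, 3]);
    auto owned = src.dup;          // independent device allocation
    assert(owned.toHost() == [1, 2, 3]);
    auto dst = CuPtr!float(3);     // uninitialized, same length as src
    copy(src, dst);
    assert(dst.toHost() == [1, 2, 3]);
}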
/// fill value for N elements from the first position
/// TODO use cudnnSetTensor
ref fill_(S, V)(ref S storage, V v, size_t N) if (isDeviceMemory!S) {
    alias T = CudaElementType!S;
    auto value = T(v);
    import std.traits : isFloatingPoint;
    // static if (isFloatingPoint!T) {
    //     import derelict.cudnn7;
    //     import grain.cudnn : fill, makeCudnnTensor;
    //     auto a = S(N);
    //     checkCUDNN( cudnnSetTensor(cudnnHandle, a.makeCudnnTensor, cast(void*) a.ptr, cast(const void*) &value) );
    // } else {
    import std.conv : to;
    import std.traits : Parameters;
    // select cuMemsetD8/D16/D32 to match the element width
    mixin("alias _memset = cuMemsetD" ~ to!string(T.sizeof * 8) ~ ";");
    alias Bytes = Parameters!(_memset)[1];
    static assert(Bytes.sizeof == T.sizeof);
    checkCudaErrors(_memset(storage.ptr, *(cast(Bytes*) &value), N));
    //}
    return storage;
}

/// fill value for all the elements in the device array
ref fill_(S, V)(ref S storage, V value) if (isDeviceMemory!S) {
    return fill_(storage, value, storage.length);
}

/// fill zero for all the elements in the device array
ref zero_(S)(ref S storage) if (isDeviceMemory!S) {
    return fill_(storage, CudaElementType!S(0));
}


/// create zero filled N elements array
auto zeros(S)(size_t N) if (isDeviceMemory!S) {
    import std.algorithm : move;
    auto a = CuPtr!(CudaElementType!S)(N);
    a.zero_();
    return move(a);
}


/// cuda error checker
void checkCudaErrors(string file = __FILE__, size_t line = __LINE__,
                     string mod = __MODULE__, string func = __FUNCTION__)(CUresult err) {
    import std.format : format;
    const(char)* name, content;
    cuGetErrorName(err, &name);
    cuGetErrorString(err, &content);
    assert(err == CUDA_SUCCESS,
           format!"%s: %s from %s @%s:%s"(
               name.fromStringz, content.fromStringz,
               func, file, line));
}

/// cublas error checker
void checkCublasErrors(cublasStatus_t err) {
    assert(err == CUBLAS_STATUS_SUCCESS, cublasGetErrorEnum(err));
}

/// cudnn error checker
void checkCUDNN(string file = __FILE__, size_t line = __LINE__)(cudnnStatus_t err) {
    import std.conv : to;
    import std.format : format;
    assert(err == CUDNN_STATUS_SUCCESS, cudnnGetErrorString(err).fromStringz ~ format!" at %s (%d)"(file, line));
}
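
// Usage sketch (assumes a CUDA device): zeros combines allocation and zero_
// in one call; note the result must be bound to a variable before toHost,
// which takes its argument by ref.
unittest {
    auto z = zeros!(CuPtr!float)(4);
    assert(z.toHost() == [0, 0, 0, 0]);
}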
/// example to launch kernel
unittest {
    import grain.kernel; // : saxpy;

    // Populate input
    uint n = 16;
    auto hostA = new float[n];
    auto hostB = new float[n];
    auto hostC = new float[n];
    foreach (i; 0 .. n) {
        hostA[i] = i;
        hostB[i] = 2 * i;
        hostC[i] = 0;
    }

    // Device data
    auto devA = CuPtr!float(hostA);
    auto devB = CuPtr!float(hostB);
    auto devC = CuPtr!float(n);

    // Kernel launch
    Global.kernel!(saxpy).call(devC.ptr, devA.ptr, devB.ptr, n).launch(n);

    // Validation
    devC.toHost(hostC);
    foreach (i; 0 .. n) {
        // writefln!"%f + %f = %f"(hostA[i], hostB[i], hostC[i]);
        assert(hostA[i] + hostB[i] == hostC[i]);
    }
}


/// sum up all elements naively with a single-thread kernel
float sumNaive(S)(ref S a) if (isDeviceMemory!S) {
    import grain.kernel : sum;
    auto b = CuPtr!float([0]);
    auto N = cast(int) a.length;
    Global.kernel!sum.call(a.ptr, b.ptr, N)
        .launch(cast(uint[3]) [1U,1,1], cast(uint[3]) [1U,1,1], 0U);
    checkCudaErrors(cuCtxSynchronize());
    return b.toHost[0];
}

unittest {
    auto a = CuPtr!float([3, 4, 5]);
    assert(a.sumNaive == 3+4+5);
}


extern (C++) float sum_thrust(float*, uint n);

/// sum up all elements via the C++ thrust helper
float sum(S)(ref S a) if (isDeviceMemory!S) {
    return sum_thrust(cast(float*) a.ptr, cast(uint) a.length);
}

unittest {
    auto a = CuPtr!float([2, 4, 5, 6]);
    auto b = sum(a);
    assert(b == 2+4+5+6);
}

/*
// test cxx kernel
unittest {
    auto a = CuPtr!float([3, 4, 5]);
    auto b = CuPtr!float([0]);
    auto N = cast(int) a.length;
    assert(N == 3);
    Global.cxxKernel("sum_naive", a.ptr, b.ptr, N)
        .launch(cast(uint[3]) [1U,1,1], cast(uint[3]) [1U,1,1], 0U);
    // checkCudaErrors(cuCtxSynchronize());
    writeln(b.toHost());
    assert(b.toHost()[0] == 3+4+5);
}
*/

/// example to fill value
unittest {
    auto d = CuPtr!float(3);
    d.zero_();
    auto h = d.toHost();
    assert(h == [0, 0, 0]);
    // assert(zeros!(CuPtr!float)(3).toHost() == [0, 0, 0]);
    assert(d.fill_(3).toHost() == [3, 3, 3]);
}


/// high-level axpy (y = alpha * x + y) wrapper for CuArray
void axpy(T)(const ref CuArray!T x, ref CuArray!T y, T alpha=1, int incx=1, int incy=1) {
    static if (is(T == float)) {
        alias axpy_ = cublasSaxpy_v2;
    } else static if (is(T == double)) {
        alias axpy_ = cublasDaxpy_v2;
    } else {
        static assert(false, "unsupported type: " ~ T.stringof);
    }
    auto status = axpy_(cublasHandle, cast(int) x.length, &alpha,
                        cast(const T*) x.ptr, incx,
                        cast(T*) y.ptr, incy);
    assert(status == CUBLAS_STATUS_SUCCESS, cublasGetErrorEnum(status));
}

/// cublas tests
unittest {
    auto a = CuArray!float([3, 4, 5]);
    auto b = CuArray!float([1, 2, 3]);
    axpy(a, b, 2.0);
    assert(a.toHost() == [3, 4, 5]);
    assert(b.toHost() == [7, 10, 13]);
}
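
// Companion sketch to the float test above: the same wrapper dispatches to
// cublasDaxpy_v2 for double elements (assumes the cuBLAS symbols loaded in
// the module constructor, like the float path).
unittest {
    auto x = CuArray!double([1, 2, 3]);
    auto y = CuArray!double([4, 5, 6]);
    axpy(x, y); // alpha defaults to 1
    assert(y.toHost() == [5, 7, 9]);
}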