/++
Chains are autograd operators in grain, equivalent to
- pytorch: torch.nn.Module
- chainer: chainer.Chain or chainer.Link

grain.functions cannot be applied to a Variable without `new` and `applyForward`.
Instead, grain.chains can be applied to a Variable directly via `opCall`.

TODO: test chains as functions
+/
module grain.chain;

import std.traits : isFloatingPoint;
import numir : normal;

import grain.autograd; // : Variable, variable, to;
version (grain_cuda) {
    import grain.cuda : zero_;
}

// enum isChain(T) = {
//     import std.traits;
//     import std.meta;
//     alias R = ReturnType!(T.init);
//     if (isVariable!R) return true;
//     return isTuple!R && AllSatisfy!(isVariable, R.Types);
// }();

//////// Unary functions

/// rectified linear unit nonlinearity
auto relu(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions : ReLU;

    auto func = new ReLU!(T, dim);
    return func.applyForward(x);
}
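/// usage sketch (host storage): relu clamps negative entries to zero and
/// passes non-negative entries through unchanged.
unittest {
    import mir.ndslice;

    auto x = [-1f, 2f, 0f, 3f].sliced(2, 2).variable;
    auto y = relu(x);
    // exact comparison is fine here since no rounding is involved
    assert(y.sliced == [0f, 2f, 0f, 3f].sliced(2, 2));
}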
/// sigmoid nonlinearity
auto sigmoid(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions : Sigmoid;

    auto func = new Sigmoid!(T, dim);
    return func.applyForward(x);
}

/// tanh nonlinearity
auto tanh(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions : Tanh;

    auto func = new Tanh!(T, dim);
    return func.applyForward(x);
}

/// 1 / x
auto reciprocal(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions : Reciprocal;

    auto func = new Reciprocal!(T, dim);
    return func.applyForward(x);
}

/// -x
auto neg(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions : Neg;

    auto func = new Neg!(T, dim);
    return func.applyForward(x);
}

/// exp x
auto exp(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Exp;

    auto func = new Exp!(T, dim);
    return func.applyForward(x);
}

/// log x
auto log(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Log;

    auto func = new Log!(T, dim);
    return func.applyForward(x);
}

/// sin x
auto sin(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Sin;

    auto func = new Sin!(T, dim);
    return func.applyForward(x);
}

/// cos x
auto cos(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Cos;

    auto func = new Cos!(T, dim);
    return func.applyForward(x);
}

/// tan x
auto tan(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Tan;

    auto func = new Tan!(T, dim);
    return func.applyForward(x);
}

/// abs x
auto abs(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : Abs;

    auto func = new Abs!(T, dim);
    return func.applyForward(x);
}

/// x ^^ power
auto pow(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x, T power) {
    import grain.functions.unary : Pow;

    auto func = new Pow!(T, dim)(power);
    return func.applyForward(x);
}

/// log(exp(x_i) / sum_j(exp(x_j)))
auto logSoftmax(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.unary : LogSoftmax;

    auto func = new LogSoftmax!(T, dim);
    return func.applyForward(x);
}

/// test fast math functions
unittest {
    import grain.testing;
    import numir;
    import mir.ndslice;
    import std.meta;

    foreach (f; AliasSeq!(sigmoid, tanh, reciprocal, neg, exp, log, sin, cos,
            tan, x => pow(x, 2.0f), logSoftmax)) {
        auto hx = uniform!float(2, 3).slice.variable(true);
        auto hgy = uniform!float(2, 3).slice.variable;
        gradCheckChain!f(hx, hgy, 1e-3, 5e-2, 5e-2);
    }
}

/////// Loss

/// negative log-likelihood, i.e., -log p(x). note that p(x) should be normalized
auto negativeLogLikelihood(alias Storage)(Variable!(float, 2, Storage) x,
        Variable!(int, 1, Storage) t, int ignoreIndex = -100) {
    import grain.functions : NegativeLogLikelihood;

    auto nll = new NegativeLogLikelihood!(float, int);
    nll.ignoreIndex = ignoreIndex;
    return nll.applyForward(x, t);
}

/// cross-entropy loss (logSoftmax followed by the negative log-likelihood function)
auto crossEntropy(alias Storage)(Variable!(float, 2, Storage) x,
        Variable!(int, 1, Storage) t, int ignoreIndex = -100) {
    import grain.functions : NegativeLogLikelihood;

    auto y = logSoftmax(x);
    return negativeLogLikelihood(y, t, ignoreIndex);
}

/// test variable.backward
unittest {
    /* pytorch equivalent
       >>> x = torch.tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], requires_grad=True)
       >>> t = torch.tensor([1, 0, -100], dtype=torch.long)
       >>> l = torch.nn.functional.cross_entropy(x, t)
       >>> l
       tensor(0.6944)
       >>> l.backward()
       >>> x.grad
       tensor([[ 0.2375, -0.2375],
               [-0.2625,  0.2625],
               [ 0.0000,  0.0000]])
     */
    import std.stdio;
    import std.typecons;
    import mir.ndslice;
    import grain.autograd;
    import numir;
    static import grain.config;

    grain.config.backprop = true;

    auto hx = [[0.1f, 0.2f], [0.3f, 0.4f], [0.5f, 0.6f]].variable(true);
    auto ht = [1, 0, -100].variable;
    auto hl = crossEntropy(hx, ht);
    hl.backward();
    assert(approxEqual(hx.gradSliced, [[0.2375, -0.2375], [-0.2625, 0.2625],
            [0.0000, 0.0000]].nparray));

    version (grain_cuda) {
        auto dx = hx.to!DeviceStorage;
        dx.grad.zero_();
        auto dt = ht.to!DeviceStorage;
        auto dl = crossEntropy(dx, dt);
        assert(approxEqual(hl.sliced, dl.to!HostStorage.sliced));
        dl.backward();
        assert(approxEqual(dx.to!HostStorage.gradSliced, hx.gradSliced));
    }
}

////// Binary functions

/**
   op(alpha1 * a, alpha2 * b)

   this function is tested under grain.autograd.Variable.opBinary
 */
auto opBinaryFunc(string op, T, size_t dim, alias Storage)(Variable!(T, dim,
        Storage) a, Variable!(T, dim, Storage) b, T alpha1 = 1, T alpha2 = 1) {
    import grain.functions : OpBinary;

    auto func = new OpBinary!(T, dim, op)(alpha1, alpha2);
    return func.applyForward(a, b);
}

/// matrix x matrix multiplication
auto matMul(T, alias Storage)(Variable!(T, 2, Storage) a, Variable!(T, 2, Storage) b) {
    import grain.functions : MatMul;

    auto func = new MatMul!T;
    return func.applyForward(a, b);
}
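/// usage sketch (host storage): multiplying by the identity matrix leaves the input unchanged.
unittest {
    import mir.ndslice;

    auto a = [1f, 2f, 3f, 4f].sliced(2, 2).variable;
    auto eye = [1f, 0f, 0f, 1f].sliced(2, 2).variable;
    // exact comparison is fine here since the products involve only 0 and small integers
    assert(matMul(a, eye).sliced == a.sliced);
}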
/// matrix + vector row-wise addition. TODO: replace this with broadcasted addition
auto addVec(T, alias Storage)(Variable!(T, 2, Storage) a, Variable!(T, 1, Storage) b) {
    import grain.functions : AddBias;

    auto func = new AddBias!T;
    return func.applyForward(a, b);
}
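/// usage sketch (host storage): the bias vector is added to every row of the matrix.
unittest {
    import mir.ndslice;

    auto a = [1f, 2f, 3f, 4f].sliced(2, 2).variable;
    auto b = [10f, 20f].variable;
    assert(addVec(a, b).sliced == [11f, 22f, 13f, 24f].sliced(2, 2));
}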
/// tensor convolution (performs cross-correlation by default)
auto convolution(bool isConv = false, bool isNchw = true, T, size_t dim, size_t imDim,
        alias Storage)(Variable!(T, dim, Storage) x, Variable!(T, dim,
        Storage) w, int[imDim] stride, int[imDim] pad, int[imDim] dilation) {
    static assert(dim == imDim + 2);
    import grain.functions : Convolution;

    static assert(dim > 2); // TODO support 1d, 2d inputs
    auto func = new Convolution!(T, dim - 2, isConv, isNchw);
    func.stride = stride;
    func.pad = pad;
    func.dilation = dilation;
    return func.applyForward(x, w);
}

////// Parametric chains

/// convolution operator
struct Convolution(T, size_t dim, alias Storage) {
    import grain.utility : castArray;
    import mir.ndslice : slice;

    Variable!(T, dim + 2, Storage) weight;
    Variable!(T, dim + 2, Storage) bias; // TODO implement unsqueeze
    int nInput, nOutput;
    bool useBias = true;
    int[dim] kernel, stride, pad, dilation;

    auto outShape(uint[dim + 2] inShape) {
        import grain.functions;

        auto func = grain.functions.Convolution!(T, dim)(this.stride, this.pad, this.dilation);
        return func.outShape(inShape, this.weight.shape);
    }

    ///
    this(int nInput, int nOutput, int[dim] kernel, bool useBias = true) {
        this.nInput = nInput;
        this.nOutput = nOutput;
        this.kernel = kernel;
        this.stride[] = 1;
        this.pad[] = 0;
        this.dilation[] = 1;
        this.useBias = useBias;
        this.resetParameters();
    }

    ///
    this(int nInput, int nOutput, int[dim] kernel, int[dim] stride, int[dim] pad,
            int[dim] dilation, bool useBias = true) {
        this.nInput = nInput;
        this.nOutput = nOutput;
        this.kernel = kernel;
        this.stride = stride;
        this.pad = pad;
        this.dilation = dilation;
        this.useBias = useBias;
        this.resetParameters();
    }

    /// pytorch-style init (LeCun uniform init)
    void resetParameters() {
        import std.algorithm : reduce;
        import numir : generate;
        import mir.random.variable : UniformVariable;

        // TODO create initializer
        const receptiveSize = this.kernel.reduce!"a * b";
        const fanIn = this.nInput * receptiveSize;
        const fanOut = this.nOutput * receptiveSize;
        auto stdv = 1.0 / (cast(T) fanIn ^^ 0.5);
        int[dim + 2] wshape;
        wshape[0] = this.nOutput;
        wshape[1] = this.nInput;
        wshape[2 .. $] = this.kernel;
        this.weight = UniformVariable!T(-stdv, stdv)
            .generate(wshape.castArray!size_t).slice.variable(true).to!Storage;
        if (this.useBias) {
            int[dim + 2] bshape;
            bshape[] = 1;
            bshape[1] = this.nOutput;
            this.bias = UniformVariable!T(-stdv, stdv).generate(
                    bshape.castArray!size_t).slice.variable(true).to!Storage;
        }
    }

    ///
    auto opCall(Variable!(T, dim + 2, Storage) x) {
        auto wx = convolution(x, this.weight, this.stride, this.pad, this.dilation);
        if (this.useBias) {
            return wx + this.bias;
        }
        else {
            return wx;
        }
    }
}

///
unittest {
    import grain.testing;
    import grain.utility;
    import numir;
    import mir.ndslice;

    auto conv = Convolution!(float, 2, HostStorage)(3, 4, [3, 3]);
    auto x = uniform!float(2, 3, 4, 4).slice.variable(true);
    auto y = conv(x);
    auto gy = uniform!float(y.shape.castArray!size_t).slice.variable;
    gradCheckChain!conv(x, gy, 1e-3, 5e-2, 5e-2);
}

/// linear operator
struct Linear(T, alias Storage) if (isFloatingPoint!T) {
    import mir.ndslice : slice;

    Variable!(T, 2, Storage) weight;
    Variable!(T, 1, Storage) bias;
    int nInput, nOutput;
    bool useBias = true;

    this(int nInput, int nOutput, bool useBias = true) {
        this.nInput = nInput;
        this.nOutput = nOutput;
        this.useBias = useBias;
        this.resetParameters();
    }

    /// pytorch-style init (LeCun uniform init)
    void resetParameters() {
        import numir : generate;
        import mir.random.variable : UniformVariable;

        auto stdv = 1.0 / (cast(T) this.nOutput ^^ 0.5);
        this.weight = UniformVariable!T(-stdv, stdv).generate(this.nInput,
                this.nOutput).slice.variable(true).to!Storage;
        if (this.useBias) {
            this.bias = UniformVariable!T(-stdv, stdv).generate(this.nOutput)
                .slice.variable(true).to!Storage;
        }
    }

    auto opCall(Variable!(T, 2, Storage) x) {
        auto wx = matMul(x, this.weight);
        if (this.useBias) {
            return addVec(wx, this.bias);
        }
        else {
            return wx;
        }
    }
}

///
unittest {
    import grain.testing;
    import grain.utility;
    import numir;
    import mir.ndslice;

    auto f = Linear!(float, HostStorage)(2, 3);
    auto x = uniform!float(2, 2).slice.variable(true);
    auto y = f(x);
    auto gy = uniform!float(y.shape.castArray!size_t).slice.variable;
    gradCheckChain!f(x, gy, 1e-3, 5e-2, 5e-2);
}

/// embedding of integer IDs into learned vectors
struct Embedding(T, alias Storage) if (isFloatingPoint!T) {
    import mir.ndslice : slice;

    Variable!(T, 2, Storage) weight;
    uint nVocab, nEmbed;

    this(uint nVocab, uint nEmbed) {
        this.nVocab = nVocab;
        this.nEmbed = nEmbed;
        this.resetParameters();
    }

    void resetParameters() {
        import numir : normal;

        this.weight = normal!T(this.nVocab, this.nEmbed).slice.variable(true).to!Storage;
    }

    auto opCall(Variable!(int, 1, Storage) ids) {
        import grain.functions;

        auto func = new grain.functions.Embedding!T;
        return func.applyForward(this.weight, ids);
    }
}
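/// usage sketch: look up two IDs from a randomly initialized 4 x 3 embedding table.
/// This assumes grain.functions.Embedding performs a plain row gather from `weight`.
unittest {
    auto embed = Embedding!(float, HostStorage)(4, 3);
    auto ids = [0, 2].variable;
    auto vs = embed(ids);
    assert(vs.shape == [2, 3]);
    // row 0 of the output should be the weight row selected by id 0
    assert(vs.sliced[0] == embed.weight.sliced[0]);
}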
//// Topology functions

/// reorganize the shape while keeping the total number of elements, a.k.a. reshape.
/// At most one dimension of the new shape can be -1;
/// in that case, its value is inferred from the size of the tensor and the remaining dimensions.
auto view(T, size_t sdim, size_t tdim, alias Storage)(Variable!(T, sdim,
        Storage) x, ptrdiff_t[tdim] shape...) {
    import grain.functions;

    auto func = new grain.functions.View!(T, sdim, tdim, Storage)(shape);
    return func.applyForward(x);
}

///
unittest {
    import numir;
    import mir.ndslice;
    import grain.testing;

    auto hx = uniform!float(6, 4).slice.variable(true);
    auto hgy = uniform!float(2, 3, 4).slice.variable;
    auto ugy = UntypedVariable(hgy);
    auto hy = hx.view([2, 3, -1]);
    assert(hy.sliced == numir.view(hx.sliced, [2, 3, -1]));
    hy.backward(&ugy);
    assert(hx.gradSliced == numir.view(hgy.sliced, [6, 4]));
    // gradCheckChain!(x => x.view([2, 3, -1]))(hx, hgy, 1e-3, 5e-2, 5e-2);
}

/// squeeze: remove a redundant size-1 dimension (axis) d
auto squeeze(size_t d, T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.utility : castArray;

    static assert(dim >= 1);
    assert(x.shape[d] == 1);
    ptrdiff_t[dim - 1] s;
    s[0 .. d] = x.shape[0 .. d].castArray!ptrdiff_t;
    s[d .. $] = x.shape[d + 1 .. $].castArray!ptrdiff_t;
    return x.view(s);
}

///
unittest {
    import mir.ndslice;

    auto x = iota(3, 4, 1, 5).as!double.slice.variable;
    assert(x.squeeze!2.shape == [3, 4, 5]);
}

/// unsqueeze: insert a redundant size-1 dimension (axis) at d
auto unsqueeze(size_t d, T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.utility : castArray;

    static assert(dim >= d);
    ptrdiff_t[dim + 1] s;
    s[0 .. d] = x.shape[0 .. d].castArray!ptrdiff_t;
    s[d] = 1;
    s[d + 1 .. $] = x.shape[d .. $].castArray!ptrdiff_t;
    return x.view(s);
}

///
unittest {
    import mir.ndslice;

    auto x = iota(3, 4, 5).as!double.slice.variable;
    assert(x.unsqueeze!2.shape == [3, 4, 1, 5]);
}

/// summation over all elements
auto sum(string mode = "fast", T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
    import grain.functions.reduction : Sum;

    auto f = new Sum!(mode, T, dim);
    return f.applyForward(x);
}

///
unittest {
    import grain.testing;
    import numir;
    import mir.ndslice;
    import std.meta;

    auto hx = [1f, 2f, 3f, 4f].sliced(2, 2).variable(true);
    auto loss = sum(hx);
    assert(loss.data[0] == 10f);
    loss.backward();
    assert(hx.gradSliced == [1f, 1f, 1f, 1f].sliced(2, 2));
}

/// dropout: multiply by a random zero/scale mask during training
auto dropout(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x, float ratio = 0.5, bool isTrain = true) {
    import grain.functions.random : Dropout;

    if (!isTrain || ratio == 0.0)
        return x;

    auto f = new Dropout!(T, dim)(ratio);
    return f.applyForward(x);
}

///
unittest {
    import grain.testing;
    import numir;
    import mir.ndslice;
    import std.meta;
    import std.stdio;
    import std.math;

    auto hx = [1f, 2f, 3f, 4f].sliced(2, 2).variable(true);
    auto hy = dropout(hx);
    hy.sum.backward();
    auto ghx = hx.gradSliced;
    foreach (i; 0 .. hx.shape[0]) {
        foreach (j; 0 .. hx.shape[1]) {
            if (approxEqual(hy.sliced[i, j], 2.0 * hx.sliced[i, j])) {
                assert(ghx[i, j] == 2.0f);
            }
            else {
                assert(hy.sliced[i, j] == 0.0f);
                assert(ghx[i, j] == 0.0f);
            }
        }
    }
}
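/// usage sketch: with isTrain=false the input is returned unchanged, so dropout
/// can be called unconditionally in a chain and disabled at evaluation time.
unittest {
    import mir.ndslice;

    auto x = [1f, 2f, 3f, 4f].sliced(2, 2).variable;
    auto y = dropout(x, 0.5, false);
    assert(y.sliced == x.sliced);
}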