1 /++
A chain is an autograd operator in grain, equivalent to
3 - pytorch: torch.nn.Module
4 - chainer: chainer.Chain or chainer.Link
5 
Users cannot apply grain.functions to a Variable without constructing them with new and calling applyForward.
Instead, grain.chain lets you apply them to a Variable with a plain function call or opCall.
8 
9 TODO test chains as functions
10  +/
11 module grain.chain;
12 
13 import std.traits : isFloatingPoint;
14 import numir : normal;
15 
16 import grain.autograd; // : Variable, variable, to;
17 version (grain_cuda) {
18     import grain.cuda : zero_;
19 }
20 
21 // enum isChain(T) = {
22 //     import std.traits;
23 //     import std.meta;
24 //     alias R = ReturnType!(T.init);
25 //     if (isVariable!R) return true;
26 //     if (isTuple!() AllSatisfy!(isVariable, ReturnType!(T.init));
27 // }();
28 
29 //////// Unary functions
30 
31 /// rectified linear unit nonlinearity
32 auto relu(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
33     import grain.functions : ReLU;
34 
35     auto func = new ReLU!(T, dim);
36     return func.applyForward(x);
37 }
38 
39 /// sigmoid nonlinearity
40 auto sigmoid(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
41     import grain.functions : Sigmoid;
42 
43     auto func = new Sigmoid!(T, dim);
44     return func.applyForward(x);
45 }
46 
47 /// tanh nonlinearity
48 auto tanh(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
49     import grain.functions : Tanh;
50 
51     auto func = new Tanh!(T, dim);
52     return func.applyForward(x);
53 }
54 
55 /// 1 / x
56 auto reciprocal(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
57     import grain.functions : Reciprocal;
58 
59     auto func = new Reciprocal!(T, dim);
60     return func.applyForward(x);
61 }
62 
63 /// -x
64 auto neg(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
65     import grain.functions : Neg;
66 
67     auto func = new Neg!(T, dim);
68     return func.applyForward(x);
69 }
70 
71 /// exp x
72 auto exp(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
73     import grain.functions.unary : Exp;
74 
75     auto func = new Exp!(T, dim);
76     return func.applyForward(x);
77 }
78 
79 /// log x
80 auto log(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
81     import grain.functions.unary : Log;
82 
83     auto func = new Log!(T, dim);
84     return func.applyForward(x);
85 }
86 
87 /// sin x
88 auto sin(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
89     import grain.functions.unary : Sin;
90 
91     auto func = new Sin!(T, dim);
92     return func.applyForward(x);
93 }
94 
95 /// cos x
96 auto cos(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
97     import grain.functions.unary : Cos;
98 
99     auto func = new Cos!(T, dim);
100     return func.applyForward(x);
101 }
102 
103 /// tan x
104 auto tan(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
105     import grain.functions.unary : Tan;
106 
107     auto func = new Tan!(T, dim);
108     return func.applyForward(x);
109 }
110 
111 /// abs
auto abs(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
113     import grain.functions.unary : Abs;
114 
115     auto func = new Abs!(T, dim);
116     return func.applyForward(x);
117 }
118 
119 /// pow
120 auto pow(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x, T power) {
121     import grain.functions.unary : Pow;
122 
123     auto func = new Pow!(T, dim)(power);
124     return func.applyForward(x);
125 }
126 
/// log(exp(x_i) / sum_j exp(x_j))
128 auto logSoftmax(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
129     import grain.functions.unary : LogSoftmax;
130 
131     auto func = new LogSoftmax!(T, dim);
132     return func.applyForward(x);
133 }
134 
135 /// test fast math functions
136 unittest {
137     import grain.testing;
138     import numir;
139     import mir.ndslice;
140     import std.meta;
141 
142     foreach (f; AliasSeq!(sigmoid, tanh, reciprocal, neg, exp, log, sin, cos,
143             tan, x => pow(x, 2.0f), logSoftmax)) {
144         auto hx = uniform!float(2, 3).slice.variable(true);
145         auto hgy = uniform!float(2, 3).slice.variable;
146         gradCheckChain!f(hx, hgy, 1e-3, 5e-2, 5e-2);
147     }
148 }
149 
150 /////// Loss
151 
/// negative log-likelihood, i.e., -log p(x). Note that p(x) should be normalized
153 auto negativeLogLikelihood(alias Storage)(Variable!(float, 2, Storage) x,
154         Variable!(int, 1, Storage) t, int ignoreIndex = -100) {
155     import grain.functions : NegativeLogLikelihood;
156 
157     auto nll = new NegativeLogLikelihood!(float, int);
158     nll.ignoreIndex = ignoreIndex;
159     return nll.applyForward(x, t);
160 }
161 
/// cross entropy loss (logSoftmax followed by negative log-likelihood)
163 auto crossEntropy(alias Storage)(Variable!(float, 2, Storage) x,
164         Variable!(int, 1, Storage) t, int ignoreIndex = -100) {
167     auto y = logSoftmax(x);
168     return negativeLogLikelihood(y, t, ignoreIndex);
169 }
170 
171 /// test variable.backward
172 unittest {
173     /* pytorch equivalent
174        >>> x = torch.tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]], requires_grad=True)
175        >>> t = torch.tensor([1, 0, -100], dtype=torch.long)
176        >>> l = torch.nn.functional.cross_entropy(x, t)
177        >>> l
178        tensor(0.6944)
179        >>> l.backward()
180        >>> x.grad
181        tensor([[ 0.2375, -0.2375],
182                [-0.2625,  0.2625],
183                [ 0.0000,  0.0000]])
184      */
185     import std.stdio;
186     import std.typecons;
187     import mir.ndslice;
188     import grain.autograd;
189     import numir;
190     static import grain.config;
191     grain.config.backprop = true;
192 
193     auto hx = [[0.1f, 0.2f], [0.3f, 0.4f], [0.5f, 0.6f]].variable(true);
194     auto ht = [1, 0, -100].variable;
195     auto hl = crossEntropy(hx, ht);
196     hl.backward();
197     assert(approxEqual(hx.gradSliced, [[0.2375, -0.2375], [-0.2625, 0.2625],
198             [0.0000, 0.0000]].nparray));
199 
200     version (grain_cuda) {
201         auto dx = hx.to!DeviceStorage;
202         dx.grad.zero_();
203         auto dt = ht.to!DeviceStorage;
204         auto dl = crossEntropy(dx, dt);
205         assert(approxEqual(hl.sliced, dl.to!HostStorage.sliced));
206         dl.backward();
207         assert(approxEqual(dx.to!HostStorage.gradSliced, hx.gradSliced));
208     }
209 }
210 
////// Binary functions
212 
213 /**
214   op(alpha1 * a, alpha2 * b)
215 
216   this function is tested under grain.autograd.Variable.opBinary
217 */
218 auto opBinaryFunc(string op, T, size_t dim, alias Storage)(Variable!(T, dim,
219         Storage) a, Variable!(T, dim, Storage) b, T alpha1 = 1, T alpha2 = 1) {
220     import grain.functions : OpBinary;
221 
222     auto func = new OpBinary!(T, dim, op)(alpha1, alpha2);
223     return func.applyForward(a, b);
224 }
225 
226 /// matrix x matrix multiplication
227 auto matMul(T, alias Storage)(Variable!(T, 2, Storage) a, Variable!(T, 2, Storage) b) {
228     import grain.functions : MatMul;
229 
230     auto func = new MatMul!T;
231     return func.applyForward(a, b);
232 }
233 
234 /// matrix + vector row-wise addition. TODO replace this with broadcasted addition
235 auto addVec(T, alias Storage)(Variable!(T, 2, Storage) a, Variable!(T, 1, Storage) b) {
236     import grain.functions : AddBias;
237 
238     auto func = new AddBias!T;
239     return func.applyForward(a, b);
240 }
241 
/// tensor convolution (computes cross-correlation by default)
243 auto convolution(bool isConv = false, bool isNchw = true, T, size_t dim, size_t imDim,
244         alias Storage)(Variable!(T, dim, Storage) x, Variable!(T, dim,
245         Storage) w, int[imDim] stride, int[imDim] pad, int[imDim] dilation) {
246     static assert(dim == imDim + 2);
247     import grain.functions : Convolution;
248 
249     static assert(dim > 2); // TODO support 1d, 2d inputs
250     auto func = new Convolution!(T, dim - 2, isConv, isNchw);
251     func.stride = stride;
252     func.pad = pad;
253     func.dilation = dilation;
254     return func.applyForward(x, w);
255 }
256 
257 ////// Parametric chains
258 
259 /// convolution operator
260 struct Convolution(T, size_t dim, alias Storage) {
261     import grain.utility : castArray;
262     import mir.ndslice : slice;
263 
264     Variable!(T, dim + 2, Storage) weight;
265     Variable!(T, dim + 2, Storage) bias; // TODO implement unsqueeze
266     int nInput, nOutput;
267     bool useBias = true;
268     int[dim] kernel, stride, pad, dilation;
269 
270     auto outShape(uint[dim + 2] inShape) {
271         import grain.functions;
272 
273         auto func = grain.functions.Convolution!(T, dim)(this.stride, this.pad, this
274                 .dilation);
275         return func.outShape(inShape, this.weight.shape);
276     }
277 
278     ///
279     this(int nInput, int nOutput, int[dim] kernel, bool useBias = true) {
280         this.nInput = nInput;
281         this.nOutput = nOutput;
282         this.kernel = kernel;
283         this.stride[] = 1;
284         this.pad[] = 0;
285         this.dilation[] = 1;
286         this.useBias = useBias;
287         this.resetParameters();
288     }
289 
290     ///
291     this(int nInput, int nOutput, int[dim] kernel, int[dim] stride, int[dim] pad,
292             int[dim] dilation, bool useBias = true) {
293         this.nInput = nInput;
294         this.nOutput = nOutput;
295         this.kernel = kernel;
296         this.stride = stride;
297         this.pad = pad;
298         this.dilation = dilation;
299         this.useBias = useBias;
300         this.resetParameters();
301     }
302 
303     /// pytorch style init (LeCun uniform init)
304     void resetParameters() {
305         import std.algorithm : reduce;
306         import numir : generate;
307         import mir.random.variable : UniformVariable;
308 
309         // TODO create initializer
310         const receptiveSize = this.kernel.reduce!"a * b";
311         const fanIn = this.nInput * receptiveSize;
312         const fanOut = this.nOutput * receptiveSize;
313         auto stdv = 1.0 / (cast(T) fanIn ^^ 0.5);
314         int[dim + 2] wshape;
315         wshape[0] = this.nOutput;
316         wshape[1] = this.nInput;
317         wshape[2 .. $] = this.kernel;
318         this.weight = UniformVariable!T(-stdv, stdv)
319             .generate(wshape.castArray!size_t).slice.variable(true).to!Storage;
320         if (this.useBias) {
321             int[dim + 2] bshape;
322             bshape[] = 1;
323             bshape[1] = this.nOutput;
324             this.bias = UniformVariable!T(-stdv, stdv).generate(
325                     bshape.castArray!size_t).slice.variable(true).to!Storage;
326         }
327     }
328 
329     ///
330     auto opCall(Variable!(T, dim + 2, Storage) x) {
331         auto wx = convolution(x, this.weight, this.stride, this.pad, this
332                 .dilation);
333         if (this.useBias) {
334             return wx + this.bias;
335         }
336         else {
337             return wx;
338         }
339     }
340 }
341 
342 ///
343 unittest {
344     import grain.testing;
345     import grain.utility;
346     import numir;
347     import mir.ndslice;
348 
349     auto conv = Convolution!(float, 2, HostStorage)(3, 4, [3, 3]);
350     auto x = uniform!float(2, 3, 4, 4).slice.variable(true);
351     auto y = conv(x);
352     auto gy = uniform!float(y.shape.castArray!size_t).slice.variable;
353     gradCheckChain!conv(x, gy, 1e-3, 5e-2, 5e-2);
354 }
355 
356 /// linear operator
357 struct Linear(T, alias Storage) if (isFloatingPoint!T) {
358     import mir.ndslice : slice;
359 
360     Variable!(T, 2, Storage) weight;
361     Variable!(T, 1, Storage) bias;
362     int nInput, nOutput;
363     bool useBias = true;
364 
365     this(int nInput, int nOutput, bool useBias = true) {
366         this.nInput = nInput;
367         this.nOutput = nOutput;
368         this.useBias = useBias;
369         this.resetParameters();
370     }
371 
372     // pytorch style init (LeCun uniform init)
373     void resetParameters() {
374         import numir : generate;
375         import mir.random.variable : UniformVariable;
376 
377         auto stdv = 1.0 / (cast(T) this.nOutput ^^ 0.5);
378         this.weight = UniformVariable!T(-stdv, stdv).generate(this.nInput,
379                 this.nOutput).slice.variable(true).to!Storage;
380         if (this.useBias) {
381             this.bias = UniformVariable!T(-stdv, stdv).generate(this.nOutput)
382                 .slice.variable(true).to!Storage;
383         }
384     }
385 
386     auto opCall(Variable!(T, 2, Storage) x) {
387         auto wx = matMul(x, this.weight);
388         if (this.useBias) {
389             return addVec(wx, this.bias);
390         }
391         else {
392             return wx;
393         }
394     }
395 }
396 
397 ///
398 unittest {
399     import grain.testing;
400     import grain.utility;
401     import numir;
402     import mir.ndslice;
403 
404     auto f = Linear!(float, HostStorage)(2, 3);
405     auto x = uniform!float(2, 2).slice.variable(true);
406     auto y = f(x);
407     auto gy = uniform!float(y.shape.castArray!size_t).slice.variable;
408     gradCheckChain!f(x, gy, 1e-3, 5e-2, 5e-2);
409 }
410 
/// Embedding of an ID into a vector
412 struct Embedding(T, alias Storage) if (isFloatingPoint!T) {
413     import mir.ndslice : slice;
414 
415     Variable!(T, 2, Storage) weight;
416     uint nVocab, nEmbed;
417 
418     this(uint nVocab, uint nEmbed) {
419         this.nVocab = nVocab;
420         this.nEmbed = nEmbed;
421         this.resetParameters();
422     }
423 
424     void resetParameters() {
425         import numir : normal;
426         this.weight = normal!T(this.nVocab, this.nEmbed).slice.variable(true).to!Storage;
427     }
428 
429     auto opCall(Variable!(int, 1, Storage) ids) {
430         import grain.functions;
431 
432         auto func = new grain.functions.Embedding!T;
433         return func.applyForward(this.weight, ids);
434     }
435 }
436 
437 //// Topology functions
438 
/// reorganize the shape while keeping the total number of elements, a.k.a. reshape.
440 /// At most one dimension of the new shape can be -1.
441 /// In this case, the value is inferred from the size of the tensor and the remaining dimensions.
442 auto view(T, size_t sdim, size_t tdim, alias Storage)(Variable!(T, sdim,
443         Storage) x, ptrdiff_t[tdim] shape...) {
444     import grain.functions;
445 
446     auto func = new grain.functions.View!(T, sdim, tdim, Storage)(shape);
447     return func.applyForward(x);
448 }
449 
450 ///
451 unittest {
452     import numir;
453     import mir.ndslice;
454     import grain.testing;
455 
456     auto hx = uniform!float(6, 4).slice.variable(true);
457     auto hgy = uniform!float(2, 3, 4).slice.variable;
458     auto ugy = UntypedVariable(hgy);
459     auto hy = hx.view([2, 3, -1]);
460     assert(hy.sliced == numir.view(hx.sliced, [2, 3, -1]));
461     hy.backward(&ugy);
462     assert(hx.gradSliced == numir.view(hgy.sliced, [6, 4]));
463     // gradCheckChain!(x => x.view([2, 3, -1]))(hx, hgy, 1e-3, 5e-2, 5e-2);
464 }
465 
466 
467 /// squeeze/remove redundant size-1 dimension (axis) d
468 auto squeeze(size_t d, T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
469     import grain.utility : castArray;
470     static assert(dim >= 1);
471     assert(x.shape[d] == 1);
472     ptrdiff_t[dim-1] s;
473     s[0..d] = x.shape[0..d].castArray!ptrdiff_t;
474     s[d..$] = x.shape[d+1..$].castArray!ptrdiff_t;
475     return x.view(s);
476 }
477 
478 ///
479 unittest {
480     import mir.ndslice;
481     auto x = iota(3, 4, 1, 5).as!double.slice.variable;
482     assert(x.squeeze!2.shape == [3, 4, 5]);
483 }
484 
485 
486 /// unsqueeze/add redundant size-1 dimension (axis) d
487 auto unsqueeze(size_t d, T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
488     import grain.utility : castArray;
489     static assert(dim >= d);
490     ptrdiff_t[dim+1] s;
491     s[0..d] = x.shape[0..d].castArray!ptrdiff_t;
492     s[d] = 1;
493     s[d+1..$] = x.shape[d..$].castArray!ptrdiff_t;
494     return x.view(s);
495 }
496 
497 ///
498 unittest {
499     import mir.ndslice;
500     auto x = iota(3, 4, 5).as!double.slice.variable;
501     assert(x.unsqueeze!2.shape == [3, 4, 1, 5]);
502 }
503 
/// summation of all the elements
505 auto sum(string mode = "fast", T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x) {
506     import grain.functions.reduction : Sum;
507     auto f = new Sum!(mode, T, dim);
508     return f.applyForward(x);
509 }
510 
511 ///
512 unittest {
513     import grain.testing;
514     import numir;
515     import mir.ndslice;
516     import std.meta;
517 
518     auto hx = [1f, 2f, 3f, 4f].sliced(2, 2).variable(true);
519     auto loss = sum(hx);
520     assert(loss.data[0] == 10f);
521     loss.backward();
    assert(hx.gradSliced == [1f, 1f, 1f, 1f].sliced(2, 2));
523 }
524 
525 
/// dropout: randomly zero elements with probability ratio and scale the kept ones by 1 / (1 - ratio) during training; identity otherwise
527 auto dropout(T, size_t dim, alias Storage)(Variable!(T, dim, Storage) x, float ratio=0.5, bool isTrain=true) {
528     import grain.functions.random : Dropout;
529 
530     if (!isTrain || ratio == 0.0) return x;
531 
532     auto f = new Dropout!(T, dim)(ratio);
533     return f.applyForward(x);
534 }
535 
536 ///
537 unittest {
538     import grain.testing;
539     import numir;
540     import mir.ndslice;
541     import std.meta;
542     import std.stdio;
543     import std.math;
544 
545     auto hx = [1f, 2f, 3f, 4f].sliced(2, 2).variable(true);
546     auto hy = dropout(hx);
547     hy.sum.backward();
548     auto ghx = hx.gradSliced;
549     foreach (i; 0 .. hx.shape[0]) {
550         foreach (j; 0 .. hx.shape[1]) {
551             if (approxEqual(hy.sliced[i, j], 2.0 * hx.sliced[i, j])) {
552                 assert(ghx[i, j] == 2.0f);
553             }
554             else {
555                 assert(hy.sliced[i, j] == 0.0f);
556                 assert(ghx[i, j] == 0.0f);
557             }
558         }
559     }
560 }