1 /**
2    A module for loss functions that always output scalar values to be minimized.
3    Loss function is the end of forwardprop and also is the start point of backprop.
4  */
5 module grain.functions.loss;
6 
7 import grain.autograd;
8 import grain.cuda;
9 import grain.functions.common;
10 import grain.utility : toTuple, fromTuple, castArray;
11 
struct NegativeLogLikelihood(F, I = long) {
    /++
    Compute negative log-likelihood: -logP(y=t)

    Params:
        logP = log softmax output as prediction. shape: (nBatch, nClass)
        targetId = target integer id of class. shape: (nBatch)
    +/

    mixin FunctionCommon;

    /// if true, divide the summed loss by the number of non-ignored targets
    bool sizeAverage = true;
    /// targets equal to this id contribute neither to the loss nor to the gradient
    int ignoreIndex = -100;
    // TODO: bool reduce = true;

    // cache for backward
    Variable!(I, 1, HostStorage) _htargetId; // target ids seen in the last host forward
    F _normalize; // 1/count when averaging over count > 0 targets, otherwise 1
    int _nClass; // number of classes, i.e. logP.shape[1]

    /// host forward: sum of -logP at each non-ignored target id, optionally averaged
    auto forward(Variable!(F, 2, HostStorage) logP, Variable!(I, 1, HostStorage) targetId) {
        import mir.math;
        import mir.ndslice;

        F result = 0.0;
        size_t count = 0; // number of targets not equal to ignoreIndex
        foreach (i; 0 .. targetId.sliced.length) {
            auto t = targetId.sliced[i];
            if (t != this.ignoreIndex) {
                result -= logP.sliced[i, t];
                ++count;
            }
        }
        if (this.sizeAverage && count > 0) {
            result /= count;
        }
        // TODO if train
        // cache what backward needs
        this._nClass = logP.shape[1];
        this._htargetId = targetId;
        this._normalize = this.sizeAverage && count > 0 ? 1.0 / count : 1.0;
        return result.variable;
    }

    /// host backward: d(loss)/d(logP) is -gy*_normalize at each target id, 0 elsewhere;
    /// no gradient flows to the integer targets (second tuple element is undefined)
    auto backward(Variable!(F, 0, HostStorage) gy) {
        import std.typecons;
        import mir.math;
        import mir.ndslice;
        import numir;

        auto nBatch = this._htargetId.shape[0];
        auto glogP = zeros!F(nBatch, this._nClass);
        auto coeff = gy.data[0] * this._normalize;
        foreach (i; 0 .. nBatch) {
            auto t = this._htargetId.sliced[i];
            if (t != this.ignoreIndex) {
                glogP[i][t] = -coeff;
            }
        }
        return tuple(glogP.variable, typeof(this._htargetId)());
    }

    version (grain_cuda) {
        Variable!(I, 1, DeviceStorage) _dtargetId; // target ids seen in the last device forward

        /// device forward: the nll kernel accumulates the summed loss and the
        /// non-ignored target count; averaging is then applied on the host
        auto forward(Variable!(F, 2, DeviceStorage) logP, Variable!(I, 1, DeviceStorage) targetId) {
            static assert(is(F == float), "only float is supported now");
            static assert(is(I == int), "only int is supported now");

            import grain.kernel : nll;

            auto dresult = CuArray!F([0]); // [result].variable.to!DeviceStorage; <- FIXME
            auto dcount = CuArray!int([0]); // [count].variable.to!DeviceStorage;

            auto batchSize = targetId.shape[0];
            Global.kernel!nll.call(dresult.ptr, dcount.ptr, logP.data.ptr,
                    targetId.data.ptr, this.ignoreIndex, batchSize, logP.strides[
                    0]).launch(batchSize);

            F result = 0.0;
            int count = 0;
            dresult.toHost(&result);
            dcount.toHost(&count);

            if (this.sizeAverage && count > 0) {
                result /= count;
            }
            // TODO if train
            // cache what backward needs (assign _nClass once; a duplicate
            // assignment before the kernel call was removed)
            this._nClass = logP.shape[1];
            this._dtargetId = targetId;
            this._normalize = this.sizeAverage && count > 0 ? 1.0 / count : 1.0;
            return result.variable.to!DeviceStorage;
        }

        /// device backward: the nllGrad kernel scatters -gy*_normalize into the
        /// rows of glogP at the cached target ids, leaving ignored rows zero
        auto backward(Variable!(F, 0, DeviceStorage) gy) {
            static assert(is(F == float), "only float is supported now");
            static assert(is(I == int), "only int is supported now");

            import grain.kernel;
            import std.typecons : tuple;

            auto nBatch = this._dtargetId.shape[0];
            auto glogP = CuArray!F(nBatch * this._nClass);
            glogP.zero_();
            auto coeff = gy.to!HostStorage.data[0] * this._normalize;
            Global.kernel!nllGrad.call(glogP.ptr, -coeff,
                    this._dtargetId.data.ptr,
                    this.ignoreIndex, nBatch, this._nClass).launch(nBatch);
            // row-major (nBatch, nClass) view over the flat device buffer
            auto v = Variable!(F, 2, DeviceStorage)(false, [nBatch,
                    this._nClass], [this._nClass, 1], glogP);
            return tuple(v, typeof(this._dtargetId)());
        }

    }
}
125 
126 /// test nll simple case, gradcheck and cpu/cuda equality
unittest {
    /++ equivalent torch v0.4 code
     >>> x = torch.FloatTensor([[0.2, 0.4, 0.4], [0.1,0.5,0.4]])
     >>> x.requires_grad = True
     >>> t = torch.LongTensor([1, 0])
     >>> l = torch.nn.functional.nll_loss(x, t)
     >>> print(l)
     tensor(-0.2500)

     >>> l.backward()
     >>> print(x.grad)
     tensor([[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0]])
     +/
    import std.typecons;
    import grain.testing;

    NegativeLogLikelihood!(float, int) nll;
    // three rows, the last one targeted at ignoreIndex so it must not count
    auto input = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]]
        .variable;
    auto target = [1, 0, nll.ignoreIndex].variable;

    // forward averages over the two non-ignored rows only
    auto loss = nll.forward(input, target);
    assert(nll._normalize == 0.5);
    assert(loss.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);

    // backward scatters -0.5 at the target ids; the ignored row stays zero
    auto grads = nll.backward(1.0f.variable);
    assert(grads[0].sliced == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0, 0.0]]);
    assert(!grads[1].defined);
    gradCheck(nll, tuple(input, target), 1.0f.variable);

    version (grain_cuda) {
        // the CUDA path must agree with the host path above
        auto deviceInput = input.to!DeviceStorage;
        auto deviceTarget = target.to!DeviceStorage;
        auto deviceLoss = nll.forward(deviceInput, deviceTarget);
        assert(nll._normalize == 0.5);
        assert(deviceLoss.to!HostStorage.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
        auto deviceGrads = nll.backward(1.0f.variable.to!DeviceStorage);
        assert(deviceGrads[0].to!HostStorage.sliced ==
               [[0.0, -0.5, 0.0],
                [-0.5, 0.0, 0.0],
                [0.0, 0.0, 0.0]]);
        assert(!deviceGrads[1].defined);
    }
}
169 
170 /// test variable.backward
unittest {
    import std.typecons;
    import grain.testing;
    import mir.ndslice;
    static import grain.config;

    // enable graph recording so applyForward/backward propagate gradients
    grain.config.backprop = true;

    NegativeLogLikelihood!(float, int) nll;
    auto input = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]]
        .variable;
    input.requiresGrad = true;
    auto target = [1, 0, nll.ignoreIndex].variable;

    // same expected values as the direct forward/backward test above
    auto loss = nll.applyForward(input, target);
    assert(nll._normalize == 0.5);
    assert(loss.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);

    auto seed = UntypedVariable(1.0f.variable);
    loss.backward(&seed);

    assert(input.grad[].sliced(3, 3) == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0,
            0.0]]);
    // TODO assert(!target.grad.defined);
}
195 
/// Huber (smooth L1) loss — unimplemented stub.
/// NOTE(review): `forward` takes no arguments and has an empty body; the
/// disabled unittest below suggests it should eventually mirror the
/// NegativeLogLikelihood interface (forward/backward with cached state).
struct HuberLoss(T) {
    // TODO: implement; currently returns void and computes nothing
    auto forward() {

    }
}
201 
202 /+
203 /**
204    PyTorch equality check
205  */
206 unittest {
207     import std.typecons;
208     import grain.testing;
209     import mir.ndslice;
210     static import grain.config;
211 
212     grain.config.backprop = true;
213 
214     HuberLoss!float func;
215     auto hx = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]]
216         .variable;
217     hx.requiresGrad = true;
218     auto ht = [1, 0, func.ignoreIndex].variable;
219     auto hl = func.applyForward(hx, ht);
220 
221     assert(func._normalize == 0.5);
222     assert(hl.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
223     auto u = UntypedVariable(1.0f.variable);
224     hl.backward(&u);
225 
226     assert(hx.grad[].sliced(3, 3) == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0,
227             0.0]]);
228 }
229 +/