/**
A module for loss functions that always output scalar values to be minimized.
A loss function is the end point of forward propagation and the starting point of backpropagation.
*/
module grain.functions.loss;

import grain.autograd;
import grain.cuda;
import grain.functions.common;
import grain.utility : toTuple, fromTuple, castArray;

struct NegativeLogLikelihood(F, I = long) {
    /++
    Compute the negative log-likelihood: -logP(y=t)
    Params:
        logP: log-softmax output used as the prediction. shape: (nBatch, nClass)
        targetId: target integer class id. shape: (nBatch)
    +/
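    /+
    More precisely, as a summary of the forward implementations below: with
    N = #{ i | targetId[i] != ignoreIndex },

        loss = -(1/N) * sum over i with targetId[i] != ignoreIndex of logP[i, targetId[i]]   (sizeAverage = true)
        loss = -        sum over i with targetId[i] != ignoreIndex of logP[i, targetId[i]]   (sizeAverage = false)

    Entries whose target equals ignoreIndex contribute neither to the loss nor to N.
    +/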

    mixin FunctionCommon;

    bool sizeAverage = true;
    int ignoreIndex = -100;
    // TODO: bool reduce = true;

    // cache for backward
    Variable!(I, 1, HostStorage) _htargetId;
    F _normalize;
    int _nClass;

    auto forward(Variable!(F, 2, HostStorage) logP, Variable!(I, 1, HostStorage) targetId) {
        import mir.math;
        import mir.ndslice;

        F result = 0.0;
        size_t count = 0;
        foreach (i; 0 .. targetId.sliced.length) {
            auto t = targetId.sliced[i];
            if (t != this.ignoreIndex) {
                result -= logP.sliced[i, t];
                ++count;
            }
        }
        if (this.sizeAverage && count > 0) {
            result /= count;
        }
        // TODO if train
        this._nClass = logP.shape[1];
        this._htargetId = targetId;
        this._normalize = this.sizeAverage && count > 0 ? 1.0 / count : 1.0;
        return result.variable;
    }

    auto backward(Variable!(F, 0, HostStorage) gy) {
        import std.typecons;
        import mir.math;
        import mir.ndslice;
        import numir;

        auto nBatch = this._htargetId.shape[0];
        auto glogP = zeros!F(nBatch, this._nClass);
        auto coeff = gy.data[0] * this._normalize;
        foreach (i; 0 .. nBatch) {
            auto t = this._htargetId.sliced[i];
            if (t != this.ignoreIndex) {
                glogP[i][t] = -coeff;
            }
        }
        return tuple(glogP.variable, typeof(this._htargetId)());
    }

    version (grain_cuda) {
        Variable!(I, 1, DeviceStorage) _dtargetId;
        auto forward(Variable!(F, 2, DeviceStorage) logP, Variable!(I, 1, DeviceStorage) targetId) {
            static assert(is(F == float), "only float is supported now");
            static assert(is(I == int), "only int is supported now");

            import grain.kernel : nll;

            this._nClass = logP.shape[1];
            auto dresult = CuArray!F([0]); // [result].variable.to!DeviceStorage; <- FIXME
            auto dcount = CuArray!int([0]); // [count].variable.to!DeviceStorage;

            auto batchSize = targetId.shape[0];
            Global.kernel!nll.call(dresult.ptr, dcount.ptr, logP.data.ptr,
                    targetId.data.ptr, this.ignoreIndex, batchSize,
                    logP.strides[0]).launch(batchSize);

            F result = 0.0;
            int count = 0;
            dresult.toHost(&result);
            dcount.toHost(&count);

            if (this.sizeAverage && count > 0) {
                result /= count;
            }
            // TODO if train
            this._nClass = logP.shape[1];
            this._dtargetId = targetId;
            this._normalize = this.sizeAverage && count > 0 ? 1.0 / count : 1.0;
            return result.variable.to!DeviceStorage;
        }

        auto backward(Variable!(F, 0, DeviceStorage) gy) {
            static assert(is(F == float), "only float is supported now");
            static assert(is(I == int), "only int is supported now");

            import grain.kernel;
            import std.typecons : tuple;

            auto nBatch = this._dtargetId.shape[0];
            auto glogP = CuArray!F(nBatch * this._nClass);
            glogP.zero_();
            auto coeff = gy.to!HostStorage.data[0] * this._normalize;
            Global.kernel!nllGrad.call(glogP.ptr, -coeff, this._dtargetId.data.ptr,
                    this.ignoreIndex, nBatch, this._nClass).launch(nBatch);
            auto v = Variable!(F, 2, DeviceStorage)(false,
                    [nBatch, this._nClass], [this._nClass, 1], glogP);
            return tuple(v, typeof(this._dtargetId)());
        }
    }
}

/// test nll simple case, gradcheck and cpu/cuda equality
unittest {
    /++ equivalent torch v0.4 code
    >>> x = torch.FloatTensor([[0.2, 0.4, 0.4], [0.1, 0.5, 0.4]])
    >>> x.requires_grad = True
    >>> t = torch.LongTensor([1, 0])
    >>> l = torch.nn.functional.nll_loss(x, t)
    >>> print(l)
    tensor(-0.2500)

    >>> l.backward()
    >>> print(x.grad)
    tensor([[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0]])
    +/
    import std.typecons;
    import grain.testing;

    NegativeLogLikelihood!(float, int) func;
    auto hx = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]].variable;
    auto ht = [1, 0, func.ignoreIndex].variable;
    auto hl = func.forward(hx, ht);
    assert(func._normalize == 0.5);
    assert(hl.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
    auto hgx = func.backward(1.0f.variable);
    assert(hgx[0].sliced == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0, 0.0]]);
    assert(!hgx[1].defined);
    gradCheck(func, tuple(hx, ht), 1.0f.variable);

    version (grain_cuda) {
        auto dx = hx.to!DeviceStorage;
        auto dt = ht.to!DeviceStorage;
        auto dl = func.forward(dx, dt);
        assert(func._normalize == 0.5);
        assert(dl.to!HostStorage.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
        auto dgx = func.backward(1.0f.variable.to!DeviceStorage);
        assert(dgx[0].to!HostStorage.sliced ==
               [[0.0, -0.5, 0.0],
                [-0.5, 0.0, 0.0],
                [0.0, 0.0, 0.0]]);
        assert(!dgx[1].defined);
    }
}

/// test variable.backward
unittest {
    import std.typecons;
    import grain.testing;
    import mir.ndslice;
    static import grain.config;

    grain.config.backprop = true;

    NegativeLogLikelihood!(float, int) func;
    auto hx = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]].variable;
    hx.requiresGrad = true;
    auto ht = [1, 0, func.ignoreIndex].variable;
    auto hl = func.applyForward(hx, ht);

    assert(func._normalize == 0.5);
    assert(hl.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
    auto u = UntypedVariable(1.0f.variable);
    hl.backward(&u);

    assert(hx.grad[].sliced(3, 3) == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0, 0.0]]);
    // TODO assert(!ht.grad.defined);
}

struct HuberLoss(T) {
    auto forward() {

    }
}
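
/+
HuberLoss above is still an empty stub. As a rough sketch only (not part of the
implemented grain API; delta = 1 and mean reduction over all elements are
assumptions here), a host-side forward pass for the standard Huber loss,
0.5 * d^2 for |d| < 1 and |d| - 0.5 otherwise with d = x - t, could look like:

    auto forward(Variable!(T, 2, HostStorage) x, Variable!(T, 2, HostStorage) t) {
        import std.math : abs;

        T result = 0;
        foreach (i; 0 .. x.shape[0]) {
            foreach (j; 0 .. x.shape[1]) {
                auto d = x.sliced[i, j] - t.sliced[i, j];
                // quadratic near zero, linear in the tails
                result += abs(d) < 1 ? T(0.5) * d * d : abs(d) - T(0.5);
            }
        }
        // mean reduction (assumed), analogous to sizeAverage in NegativeLogLikelihood
        result /= x.shape[0] * x.shape[1];
        return result.variable;
    }
+/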

/+
/**
PyTorch equality check
*/
unittest {
    import std.typecons;
    import grain.testing;
    import mir.ndslice;
    static import grain.config;

    grain.config.backprop = true;

    HuberLoss!float func;
    auto hx = [[0.2f, 0.4f, 0.4f], [0.1f, 0.5f, 0.4f], [0.1f, 0.5f, 0.4f]].variable;
    hx.requiresGrad = true;
    auto ht = [1, 0, func.ignoreIndex].variable;
    auto hl = func.applyForward(hx, ht);

    assert(func._normalize == 0.5);
    assert(hl.sliced == [-(0.4f + 0.1f + 0.0f) / 2]);
    auto u = UntypedVariable(1.0f.variable);
    hl.backward(&u);

    assert(hx.grad[].sliced(3, 3) == [[0.0, -0.5, 0.0], [-0.5, 0.0, 0.0], [0.0, 0.0, 0.0]]);
}
+/