module grain.warpctc;

extern (C):

// forward declaration of the CUDA stream type to avoid pulling in CUDA headers
struct CUstream_st;
alias CUstream = CUstream_st*;

alias ctcStatus_t = int;
enum : ctcStatus_t
{
    CTC_STATUS_SUCCESS = 0,
    CTC_STATUS_MEMOPS_FAILED = 1,
    CTC_STATUS_INVALID_VALUE = 2,
    CTC_STATUS_EXECUTION_FAILED = 3,
    CTC_STATUS_UNKNOWN_ERROR = 4
}

/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();

/**
Returns a string containing a description of the status that was passed in

Params:
    status = identifies which string should be returned

Returns: C style string containing the text description
*/
const(char)* ctcGetStatusString(ctcStatus_t status);

alias ctcComputeLocation = int;
enum : ctcComputeLocation
{
    CTC_CPU = 0,
    CTC_GPU = 1
}

/**
Structure used for options to the CTC computation.

Applications should zero out the structure using memset and sizeof(struct
ctcOptions) in C or default initialization (e.g. 'ctcOptions
options{};' or 'auto options = ctcOptions{}') in C++ to ensure
forward compatibility with added options. In D, default initialization
(`ctcOptions options;`) already zero-initializes every field. */
struct ctcOptions
{
    /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
    ctcComputeLocation loc;
    union
    {
        /// used when loc == CTC_CPU, the maximum number of threads that can be used
        uint num_threads;

        /// used when loc == CTC_GPU, which stream the kernels should be launched in
        CUstream stream;
    }

    /// the label value/index that the CTC calculation should use as the blank label
    int blank_label;
}

/** Compute the connectionist temporal classification loss between a sequence
 * of probabilities and a ground truth labeling. Optionally compute the
 * gradient with respect to the inputs.
 * \param [in] activations pointer to the activations in either CPU or GPU
 *             addressable memory, depending on options. We assume a fixed
 *             memory layout for this 3 dimensional tensor, which has dimension
 *             (t, n, p), where t is the time index, n is the minibatch index,
 *             and p indexes over probabilities of each symbol in the alphabet.
 *             The memory layout is (t, n, p) in C order (slowest to fastest changing
 *             index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
 *             changing index, aka column-major). We also assume strides are equal to
 *             dimensions - there is no padding between dimensions.
 *             More precisely, element (t, n, p), for a problem with minibatch examples
 *             in the mini batch, and alphabet_size symbols in the alphabet, is located at:
 *             activations[(t * minibatch + n) * alphabet_size + p]
 * \param [out] gradients if not NULL, then gradients are computed. Should be
 *              allocated in the same memory space as activations and the memory
 *              ordering is identical.
 * \param [in] flat_labels Always in CPU memory. A concatenation
 *             of all the labels for the minibatch.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size The number of possible output symbols. There
 *             should be this many probabilities for each time step.
 * \param [in] minibatch How many examples in a minibatch.
 * \param [out] costs Always in CPU memory. The cost of each example in the
 *              minibatch.
 * \param [in,out] workspace In same memory space as activations. Should be of
 *                 size requested by get_workspace_size.
 * \param [in] options see struct ctcOptions
 *
 * \return Status information
 */
ctcStatus_t compute_ctc_loss(const(float)* activations,
                             float* gradients,
                             const(int)* flat_labels,
                             const(int)* label_lengths,
                             const(int)* input_lengths,
                             int alphabet_size,
                             int minibatch,
                             float* costs,
                             void* workspace,
                             ctcOptions options);
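// Editor's illustration, not part of the warp-ctc API: a tiny helper that
// spells out the row-major indexing formula documented above. The name
// `flatIndex` is hypothetical and exists only for this example.
private size_t flatIndex(size_t t, size_t n, size_t p,
                         size_t minibatch, size_t alphabet_size)
{
    // element (t, n, p) of the (T, minibatch, alphabet_size) activation tensor
    return (t * minibatch + n) * alphabet_size + p;
}

unittest
{
    // with minibatch = 1 and alphabet_size = 5, element (t=1, n=0, p=2)
    // lands at index 7 -- the same probs[7] used by the tests below
    assert(flatIndex(1, 0, 2, 1, 5) == 7);
}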
/** For a given set of labels and minibatch size return the required workspace
 * size. This will need to be allocated in the same memory space as your
 * probabilities.
 * \param [in] label_lengths Always in CPU memory. The length of each label
 *             for each example in the minibatch.
 * \param [in] input_lengths Always in CPU memory. The number of time steps
 *             for each sequence in the minibatch.
 * \param [in] alphabet_size How many symbols in the alphabet or, equivalently,
 *             the number of probabilities at each time step
 * \param [in] minibatch How many examples in a minibatch.
 * \param [in] info see struct ctcOptions
 * \param [out] size_bytes pointer to a scalar where the memory
 *              requirement in bytes will be placed. This memory should be allocated
 *              at the same place, CPU or GPU, that the probs are in
 *
 * \return Status information
 */
ctcStatus_t get_workspace_size(const(int)* label_lengths,
                               const(int)* input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions info,
                               size_t* size_bytes);

/// Numerically stable softmax for a minibatch of 1
private void softmax(const(float)* acts, int alphabet_size, int T, float* probs)
{
    import std.math : fmax, exp;

    for (int t = 0; t < T; ++t)
    {
        // subtract the per-timestep max before exponentiating for stability
        float max_activation = -float.infinity;
        for (int a = 0; a < alphabet_size; ++a)
            max_activation = fmax(max_activation, acts[t * alphabet_size + a]);

        float denom = 0;
        for (int a = 0; a < alphabet_size; ++a)
            denom += exp(acts[t * alphabet_size + a] - max_activation);

        for (int a = 0; a < alphabet_size; ++a)
            probs[t * alphabet_size + a] = exp(acts[t * alphabet_size + a] - max_activation) / denom;
    }
}

/// Asserts (rather than throws) when `status` is not CTC_STATUS_SUCCESS; a unittest helper.
void throw_on_error(ctcStatus_t status, string message)
{
    import std.format : format;
    import std.string : fromStringz;

    assert(status == CTC_STATUS_SUCCESS,
           format!"%s, stat = %s"(message, ctcGetStatusString(status).fromStringz));
}

unittest
{
    import std.math : exp;
    import core.stdc.stdlib : malloc, free;

    assert(get_warpctc_version() == 2);
    // https://github.com/baidu-research/warp-ctc/blob/master/tests/test_cpu.cpp

    const int alphabet_size = 5;
    const int T = 2;

    float[] activations = [0.1, 0.6, 0.1, 0.1, 0.1,
                           0.1, 0.1, 0.6, 0.1, 0.1];

    // Calculate the score analytically
    float expected_score;
    {
        auto probs = new float[activations.length];
        softmax(activations.ptr, alphabet_size, T, probs.ptr);

        // Score calculation is specific to the given activations above
        expected_score = probs[1] * probs[7];
    }

    int[] labels = [1, 2];
    int[] label_lengths = [2];
    int[] lengths = [T];

    float score;
    ctcOptions options;
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t cpu_alloc_bytes;
    throw_on_error(get_workspace_size(label_lengths.ptr, lengths.ptr,
                                      alphabet_size, cast(int) lengths.length, options,
                                      &cpu_alloc_bytes),
                   "Error: get_workspace_size in small_test");

    void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);

    throw_on_error(compute_ctc_loss(activations.ptr, null,
                                    labels.ptr, label_lengths.ptr,
                                    lengths.ptr,
                                    alphabet_size,
                                    cast(int) lengths.length,
                                    &score,
                                    ctc_cpu_workspace,
                                    options),
                   "Error: compute_ctc_loss in small_test");

    free(ctc_cpu_workspace);
    score = exp(-score);
    const float eps = 1e-6;

    const float lb = expected_score - eps;
    const float ub = expected_score + eps;
    assert(score > lb && score < ub);
}
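// Editor's sketch, not one of the original warp-ctc tests: the same small
// problem, but passing a non-null `gradients` buffer so that compute_ctc_loss
// also writes d(loss)/d(activations). Per the documentation above, the buffer
// has the same size and layout as `activations`.
unittest
{
    import std.math : isNaN;
    import core.stdc.stdlib : malloc, free;

    const int alphabet_size = 5;
    const int T = 2;

    float[] activations = [0.1, 0.6, 0.1, 0.1, 0.1,
                           0.1, 0.1, 0.6, 0.1, 0.1];
    auto gradients = new float[activations.length];

    int[] labels = [1, 2];
    int[] label_lengths = [2];
    int[] lengths = [T];

    ctcOptions options;
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t bytes;
    throw_on_error(get_workspace_size(label_lengths.ptr, lengths.ptr,
                                      alphabet_size, cast(int) lengths.length,
                                      options, &bytes),
                   "Error: get_workspace_size in gradient sketch");

    void* workspace = malloc(bytes);
    scope (exit) free(workspace);

    float cost;
    throw_on_error(compute_ctc_loss(activations.ptr, gradients.ptr,
                                    labels.ptr, label_lengths.ptr,
                                    lengths.ptr,
                                    alphabet_size,
                                    cast(int) lengths.length,
                                    &cost,
                                    workspace,
                                    options),
                   "Error: compute_ctc_loss in gradient sketch");

    // every gradient entry should be a finite number
    foreach (g; gradients)
        assert(!g.isNaN);
}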
version (grain_cuda) unittest
{
    import derelict.cuda;
    import grain.cuda;
    import std.math : exp;

    const int alphabet_size = 5;
    const int T = 2;

    float[] activations = [0.1, 0.6, 0.1, 0.1, 0.1,
                           0.1, 0.1, 0.6, 0.1, 0.1];

    // Calculate the score analytically
    float expected_score;
    {
        auto probs = new float[activations.length];
        softmax(activations.ptr, alphabet_size, T, probs.ptr);

        // Score calculation is specific to the given activations above
        expected_score = probs[1] * probs[7];
    }

    CUstream stream;
    cuStreamCreate(cast(void**) &stream, CU_STREAM_DEFAULT);
    scope (exit) cuStreamDestroy(stream);

    // device copy of the activations; CuPtr handles the cudaMalloc/cudaMemcpy
    // that the original C++ test performed by hand
    auto activations_gpu = CuPtr!float(activations);

    int[] labels = [1, 2];
    int[] label_lengths = [2];
    int[] lengths = [T];

    float score;

    ctcOptions options;
    options.loc = CTC_GPU;
    options.stream = cast(CUstream) stream;

    size_t gpu_alloc_bytes;
    throw_on_error(get_workspace_size(label_lengths.ptr, lengths.ptr,
                                      alphabet_size, cast(int) lengths.length, options,
                                      &gpu_alloc_bytes),
                   "Error: get_workspace_size in small_test");

    // the workspace must live in GPU memory because options.loc == CTC_GPU
    auto ctc_gpu_workspace = CuPtr!char(gpu_alloc_bytes);

    throw_on_error(compute_ctc_loss(activations_gpu.data, null,
                                    labels.ptr, label_lengths.ptr,
                                    lengths.ptr,
                                    alphabet_size,
                                    cast(int) lengths.length,
                                    &score,
                                    ctc_gpu_workspace.data,
                                    options),
                   "Error: compute_ctc_loss in small_test");

    score = exp(-score);
    const float eps = 1e-6;

    const float lb = expected_score - eps;
    const float ub = expected_score + eps;
    assert(score > lb && score < ub);
}
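// Editor's addition, not part of the original module: a sanity check for the
// private softmax helper defined above. It only uses code in this file: every
// time step should yield a proper probability distribution.
unittest
{
    import std.math : abs;

    const int alphabet_size = 5;
    const int T = 2;

    float[] acts = [0.1, 0.6, 0.1, 0.1, 0.1,
                    0.1, 0.1, 0.6, 0.1, 0.1];
    auto probs = new float[acts.length];
    softmax(acts.ptr, alphabet_size, T, probs.ptr);

    foreach (t; 0 .. T)
    {
        float sum = 0;
        foreach (a; 0 .. alphabet_size)
        {
            // probabilities are strictly positive after exponentiation
            assert(probs[t * alphabet_size + a] > 0);
            sum += probs[t * alphabet_size + a];
        }
        // and they sum to 1 at every time step
        assert(abs(sum - 1.0f) < 1e-6);
    }
}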