1 module grain.warpctc;
2 
3 extern (C):
4 
//forward declare of CUDA typedef to avoid needing to pull in CUDA headers
// Opaque struct: never defined here, only ever used through the pointer alias
// below, mirroring the CUDA driver API's `typedef struct CUstream_st* CUstream`.
struct CUstream_st;
alias CUstream = CUstream_st*;
8 
/// Status codes returned by every warp-ctc entry point (mirrors `ctcStatus_t`
/// in warp-ctc's ctc.h; values must stay in sync with the C library).
alias ctcStatus_t = int;
enum : ctcStatus_t
{
    CTC_STATUS_SUCCESS = 0,
    CTC_STATUS_MEMOPS_FAILED = 1,
    CTC_STATUS_INVALID_VALUE = 2,
    CTC_STATUS_EXECUTION_FAILED = 3,
    CTC_STATUS_UNKNOWN_ERROR = 4
}
18 
/** Returns a single integer which specifies the API version of the warpctc library */
int get_warpctc_version();
21 
22 /**
23 Returns a string containing a description of status that was passed in
24 
25 Params:
26     status = identifies which string should be returned
27 
28 Returns: C style string containing the text description
29  */
30 const(char)* ctcGetStatusString(ctcStatus_t status);
31 
/// Selects where the CTC computation runs; stored in `ctcOptions.loc`.
alias ctcComputeLocation = int;
enum : ctcComputeLocation
{
    CTC_CPU = 0,
    CTC_GPU = 1
}
38 
39 /**
40 Structure used for options to the CTC compution.
41 
42 Applications should zero out the array using memset and sizeof(struct
43  ctcOptions) in C or default initialization (e.g. 'ctcOptions
44 options{};' or 'auto options = ctcOptions{}') in C++ to ensure
45 forward compatibility with added options. */
46 struct ctcOptions
47 {
48     /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
49     ctcComputeLocation loc;
50     union
51     {
52         /// used when loc == CTC_CPU, the maximum number of threads that can be used
53         uint num_threads;
54 
55         /// used when loc == CTC_GPU, which stream the kernels should be launched in
56         CUstream stream;
57     };
58 
59     /// the label value/index that the CTC calculation should use as the blank label
60     int blank_label;
61 };
62 
63 
64 
65 /** Compute the connectionist temporal classification loss between a sequence
66  *  of probabilities and a ground truth labeling.  Optionally compute the
67  *  gradient with respect to the inputs.
68  * \param [in] activations pointer to the activations in either CPU or GPU
69  *             addressable memory, depending on info.  We assume a fixed
70  *             memory layout for this 3 dimensional tensor, which has dimension
71  *             (t, n, p), where t is the time index, n is the minibatch index,
72  *             and p indexes over probabilities of each symbol in the alphabet.
73  *             The memory layout is (t, n, p) in C order (slowest to fastest changing
74  *             index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
75  *             changing index, aka column-major). We also assume strides are equal to
76  *             dimensions - there is no padding between dimensions.
77  *             More precisely, element (t, n, p), for a problem with mini_batch examples
78  *             in the mini batch, and alphabet_size symbols in the alphabet, is located at:
79  *             activations[(t * mini_batch + n) * alphabet_size + p]
80  * \param [out] gradients if not NULL, then gradients are computed.  Should be
81  *              allocated in the same memory space as probs and memory
82  *              ordering is identical.
83  * \param [in]  flat_labels Always in CPU memory.  A concatenation
84  *              of all the labels for the minibatch.
85  * \param [in]  label_lengths Always in CPU memory. The length of each label
86  *              for each example in the minibatch.
87  * \param [in]  input_lengths Always in CPU memory.  The number of time steps
88  *              for each sequence in the minibatch.
89  * \param [in]  alphabet_size The number of possible output symbols.  There
90  *              should be this many probabilities for each time step.
91  * \param [in]  mini_batch How many examples in a minibatch.
92  * \param [out] costs Always in CPU memory.  The cost of each example in the
93  *              minibatch.
94  * \param [in,out] workspace In same memory space as probs. Should be of
95  *                 size requested by get_workspace_size.
96  * \param [in]  options see struct ctcOptions
97  *
98  *  \return Status information
99  *
100  * */
101 ctcStatus_t compute_ctc_loss(const(float)* activations,
102                              float* gradients,
103                              const(int)* flat_labels,
104                              const(int)* label_lengths,
105                              const(int)* input_lengths,
106                              int alphabet_size,
107                              int minibatch,
108                              float *costs,
109                              void *workspace,
110                              ctcOptions options);
111 
112 
/** For a given set of labels and minibatch size return the required workspace
 *  size.  This will need to be allocated in the same memory space as your
 *  probabilities.
 * \param [in]  label_lengths Always in CPU memory. The length of each label
 *              for each example in the minibatch.
 * \param [in]  input_lengths Always in CPU memory.  The number of time steps
 *              for each sequence in the minibatch.
 * \param [in]  alphabet_size How many symbols in the alphabet or, equivalently,
 *              the number of probabilities at each time step
 * \param [in]  minibatch How many examples in a minibatch.
 * \param [in]  info see struct ctcOptions
 * \param [out] size_bytes is pointer to a scalar where the memory
 *              requirement in bytes will be placed. This memory should be allocated
 *              at the same place, CPU or GPU, that the probs are in
 *
 *  \return Status information
 **/
ctcStatus_t get_workspace_size(const(int)* label_lengths,
                               const(int)* input_lengths,
                               int alphabet_size, int minibatch,
                               ctcOptions info,
                               size_t* size_bytes);
135 
136 
/// Numerically stable softmax applied row-by-row for a minibatch of 1.
///
/// Treats `acts` as a (T, alphabet_size) row-major matrix and writes the
/// per-timestep probability distribution into the same layout in `probs`.
/// Subtracting the row maximum before exponentiating avoids overflow.
private void softmax(const(float)* acts, int alphabet_size, int T, float *probs)
{
    import std.math;
    foreach (t; 0 .. T)
    {
        const size_t row = cast(size_t) t * alphabet_size;

        // Row maximum, used to keep exp() arguments <= 0.
        float rowMax = -float.infinity;
        foreach (a; 0 .. alphabet_size)
            rowMax = fmax(rowMax, acts[row + a]);

        // Normalizing constant of the shifted exponentials.
        float z = 0;
        foreach (a; 0 .. alphabet_size)
            z += exp(acts[row + a] - rowMax);

        foreach (a; 0 .. alphabet_size)
            probs[row + a] = exp(acts[row + a] - rowMax) / z;
    }
}
155 
156 
/**
Abort (via `assert`) unless `status` reports success, attaching the library's
textual description of the failure to the assertion message.

Params:
    status  = return code from a warp-ctc API call
    message = caller-supplied context prepended to the failure description

Note: implemented with `assert`, so the check (and message formatting, which
is only evaluated on failure) disappears in `-release` builds.
 */
void throw_on_error(ctcStatus_t status, string message)
{
    import std.format : format;
    // Fixed: the import was written `std..string`, which is a D syntax error.
    import std.string : fromStringz;

    assert(status == CTC_STATUS_SUCCESS,
           format!"%s, stat = %s"(message, ctcGetStatusString(status).fromStringz));
}
165 
166 
// CPU smoke test: one example (T=2, alphabet of 5, labels [1, 2]) whose CTC
// likelihood can be computed by hand, since the only valid alignment for a
// length-2 label over 2 timesteps is the label itself (no blanks fit).
unittest
{
    import std.stdio;
    import std.math;
    import core.stdc.stdlib;
    assert(get_warpctc_version() == 2);
    // https://github.com/baidu-research/warp-ctc/blob/master/tests/test_cpu.cpp

    const int alphabet_size = 5;
    const int T = 2;

    float[] activations = [0.1, 0.6, 0.1, 0.1, 0.1,
                           0.1, 0.1, 0.6, 0.1, 0.1];

    // Calculate the score analytically
    float expected_score;
    {
        auto probs = new float[activations.length];
        softmax(activations.ptr, alphabet_size, T, probs.ptr);

        // Score calculation is specific to the given activations above:
        // P(label 1 at t=0) * P(label 2 at t=1)
        expected_score = probs[1] * probs[7];
    }

    int[] labels = [1, 2];
    int[] label_lengths = [2];
    int[] lengths = [T];

    float score;
    ctcOptions options;
    options.loc = CTC_CPU;
    options.num_threads = 1;

    size_t cpu_alloc_bytes;
    throw_on_error(get_workspace_size(label_lengths.ptr, lengths.ptr,
                                      alphabet_size, cast(int) lengths.length, options,
                                      &cpu_alloc_bytes),
                   "Error: get_workspace_size in small_test");

    void* ctc_cpu_workspace = malloc(cpu_alloc_bytes);
    assert(ctc_cpu_workspace !is null, "malloc failed for CTC workspace");
    // Fixed: free unconditionally; the workspace used to leak whenever an
    // assertion below fired before the success-path free was reached.
    scope (exit) free(ctc_cpu_workspace);

    throw_on_error(compute_ctc_loss(activations.ptr, null,
                                    labels.ptr, label_lengths.ptr,
                                    lengths.ptr,
                                    alphabet_size,
                                    cast(int) lengths.length,
                                    &score,
                                    ctc_cpu_workspace,
                                    options),
                   "Error: compute_ctc_loss in small_test");

    // compute_ctc_loss returns the negative log likelihood; undo the -log.
    score = exp(-score);
    const float eps = 1e-6;

    const float lb = expected_score - eps;
    const float ub = expected_score + eps;
    assert(score > lb && score < ub);
}
226 
// GPU variant of the CPU smoke test above: identical activations and labels,
// but activations and workspace live in device memory and the CTC kernels are
// launched on an explicitly created driver-API stream. Only compiled when the
// build defines version `grain_cuda`.
version (grain_cuda) unittest
{
    import derelict.cuda;
    import grain.cuda;
    import std.math;

    const int alphabet_size = 5;
    const int T = 2;

    float[] activations = [0.1, 0.6, 0.1, 0.1, 0.1,
                           0.1, 0.1, 0.6, 0.1, 0.1];

    // Calculate the score analytically
    float expected_score;
    {
        auto probs = new float[activations.length];
        softmax(activations.ptr, alphabet_size, T, probs.ptr);

        // Score calculation is specific to the given activations above
        expected_score = probs[1] * probs[7];
    }

    // Stream is destroyed on scope exit, after the CuPtr buffers below are
    // cleaned up (scope guards run in reverse declaration order relative to
    // later destructors declared in this scope).
    CUstream stream;
    cuStreamCreate(cast(void**) &stream, CU_STREAM_DEFAULT);
    scope (exit) cuStreamDestroy(stream);
    // throw_on_error(cudaStreamCreate(&stream),
    //                "cudaStreamCreate");

    // Device copy of the activations (RAII wrapper replaces the raw
    // cudaMalloc/cudaMemcpyAsync calls kept below for reference).
    // float *activations_gpu;
    auto activations_gpu = CuPtr!float(activations);
    // throw_on_error(cudaMalloc(cast(void**) &activations_gpu,
    //                activations.length * float.sizeof),
    //                "cudaMalloc");
    // throw_on_error(cudaMemcpyAsync(activations_gpu, activations.ptr,
    //                                activations.length * float.sizeof,
    //                                cudaMemcpyHostToDevice, stream),
    //                "cudaMemcpyAsync");

    // Labels and lengths stay in CPU memory even for the GPU path (see the
    // compute_ctc_loss documentation).
    int[] labels = [1, 2];
    int[] label_lengths = [2];
    int[] lengths = [T];

    float score;

    ctcOptions options;
    options.loc = CTC_GPU;
    options.stream = cast(CUstream) stream;

    size_t gpu_alloc_bytes;
    throw_on_error(get_workspace_size(label_lengths.ptr, lengths.ptr,
                                      alphabet_size, cast(int) lengths.length, options,
                                      &gpu_alloc_bytes),
                   "Error: get_workspace_size in small_test");

    // Workspace must live in the same memory space as the activations (GPU).
    // char *ctc_gpu_workspace;
    auto ctc_gpu_workspace = CuPtr!char(gpu_alloc_bytes);
    // throw_on_error(cudaMalloc(cast(void**) &ctc_gpu_workspace, gpu_alloc_bytes),
    //                "cudaMalloc");

    throw_on_error(compute_ctc_loss(activations_gpu.data, null,
                                    labels.ptr, label_lengths.ptr,
                                    lengths.ptr,
                                    alphabet_size,
                                    cast(int) lengths.length,
                                    &score,
                                    ctc_gpu_workspace.data,
                                    options),
                   "Error: compute_ctc_loss in small_test");

    // NOTE(review): `score` (CPU memory) is read here without an explicit
    // stream sync — presumably compute_ctc_loss synchronizes internally before
    // writing costs; confirm against the warp-ctc implementation.
    score = exp(-score);
    const float eps = 1e-6;

    const float lb = expected_score - eps;
    const float ub = expected_score + eps;

    // throw_on_error(cudaFree(activations_gpu),
    //                "cudaFree");
    // throw_on_error(cudaFree(ctc_gpu_workspace),
    //                "cudaFree");
    // throw_on_error(cudaStreamDestroy(stream),
    //                "cudaStreamDestroy");

    assert(score > lb && score < ub);
}