1
#if defined(_WIN32) || defined(_WIN64)
4
typedef UINT16 uint16_t;
5
typedef UINT32 uint32_t;
17
#include <cuda_runtime.h>
7
#include "normxcorr_hw.h"
26
//#include "normxcorr_hw_pack.h"
27
10
#include "normxcorr_hw_msg.h"
29
11
#include "normxcorr_hw_kernel.cu"
35
ACTION_COMPUTE_BASE = 10,
36
ACTION_COMPUTE_FRAGMENT = 11,
41
ERROR_CUDA_MALLOC = 2,
45
struct STProcessingState {
46
cufftComplex *cuda_base_buffer; // Stored FFT's of the template image
47
cufftComplex *cuda_data_buffer; // Main computational buffer
48
cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs
49
cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs
50
float *cuda_final_buffer; // Ultimate output
51
uint8_t *cuda_input_buffer; // Input buffer
53
float *cuda_lsum_buffer;
54
float *cuda_denom_buffer;
57
uint16_t *cuda_nonzero_items;
58
uint16_t *cuda_nonzero_buffer;
60
int ncp; // Number of control points
61
int fft_size; // Matrix Size for FFT (base_size + input_size - 1)
62
int fft_size2; // size * size
63
int fft_alloc_size; // cuda optimized size2
64
int fft_inner_size; // size * (size/2 + 1), R2C/C2R
66
int fft_initialized; // Flag indicating if CUFFT plan is initialized
67
cufftHandle cufft_plan;
68
cufftHandle cufft_r2c_plan;
69
cufftHandle cufft_c2r_plan;
72
typedef struct STProcessingState TProcessingState;
73
14
static TProcessingState *pstate = NULL;
76
16
static void fftFree(TProcessingState *ps) {
77
if (ps->fft_initialized) {
78
cufftDestroy(ps->cufft_r2c_plan);
79
cufftDestroy(ps->cufft_c2r_plan);
80
// cufftDestroy(ps->cufft_plan);
82
ps->fft_initialized = false;
85
17
if (ps->cuda_base_buffer) {
86
19
free(ps->grid_size);
87
20
free(ps->cuda_nonzero_items);
23
cudaFree(ps->cuda_lsum_temp);
89
25
cudaFree(ps->cuda_lsum_buffer);
90
26
cudaFree(ps->cuda_denom_buffer);
91
28
cudaFree(ps->cuda_nonzero_buffer);
93
31
cudaFree(ps->cuda_temp_buffer);
94
32
cudaFree(ps->cuda_final_buffer);
100
38
ps->cuda_base_buffer = NULL;
41
// DS: Source of bug, that occasionaly can corrupt something ...
42
if (ps->cudpp_initialized) {
43
cudppDestroyPlan(ps->cudpp_plan);
44
ps->cudpp_initialized = false;
47
if (ps->fft_initialized) {
48
cufftDestroy(ps->cufft_r2c_plan);
49
cufftDestroy(ps->cufft_c2r_plan);
50
// cufftDestroy(ps->cufft_plan);
52
ps->fft_initialized = false;
104
57
#include <unistd.h>
105
58
static int fftInit(TProcessingState *ps) {
59
CUDPPConfiguration cudpp_config;
61
CUDPPResult cudpp_err;
106
62
cufftResult cufft_err;
107
63
cudaError cuda_err;
65
int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
127
85
ps->fft_initialized = true;
87
cudpp_config.algorithm = CUDPP_SCAN;
88
cudpp_config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;
89
cudpp_config.op = CUDPP_ADD;
90
cudpp_config.datatype = CUDPP_FLOAT;
92
cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
93
if (cudpp_err != CUDPP_SUCCESS) {
94
reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
99
ps->cudpp_initialized = true;
129
101
cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));
131
103
reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_base_buffer is failed", ps->ncp, ps->fft_alloc_size);
171
143
cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));
173
cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));
145
cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
175
147
reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_buffer is failed", ps->ncp, ps->fft_alloc_size);
177
149
return ERROR_CUDA_MALLOC;
180
cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));
152
cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
182
154
reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_buffer is failed", ps->ncp, ps->fft_alloc_size);
184
156
return ERROR_CUDA_MALLOC;
187
160
cuda_err = cudaMalloc((void**)&ps->cuda_nonzero_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));
189
162
reportError("Device memory allocation of %u*%u*uint16 bytes for cuda_nonzero_buffer is failed", ps->ncp, ps->fft_alloc_size);
191
164
return ERROR_CUDA_MALLOC;
193
166
cudaMemset((void*)ps->cuda_nonzero_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));
195
168
ps->cuda_nonzero_items = (uint16_t*)malloc(ps->ncp * sizeof(uint16_t));
196
169
if (!ps->cuda_nonzero_items) {
197
170
reportError("Host memory allocation of %u*uint16 bytes for cuda_nonzero_items is failed", ps->ncp);
206
179
return ERROR_MALLOC;
183
cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2 * sizeof(float));
185
reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);
189
// We need to zero temporary buffers as well, since we are not computing
190
// cumsum of complete matrix, but non-zero part of it
191
cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));
234
219
ps->cuda_base_buffer = NULL;
235
220
ps->fft_initialized = false;
221
ps->cudpp_initialized = false;
240
static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data, const mxArray *lsum, const mxArray *denom, const mxArray *nonzero) {
228
static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
241
229
uint8_t *dataPtr;
243
231
if (!ps->fft_initialized) {
248
236
int size = ps->fft_size;
237
int alloc_size = ps->fft_alloc_size;
239
int N = mxGetM(data);
242
dim3 input_block_dim(N, 1, 1);
243
dim3 input_grid_dim(N, 1, 1);
245
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
246
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
247
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
249
dataPtr = (uint8_t*)mxGetData(data);
250
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
249
253
int size2 = size*size;
250
int alloc_size = ps->fft_alloc_size;
252
int N = mxGetM(data);
255
dim3 input_block_dim(N, 1, 1);
256
dim3 input_grid_dim(N, 1, 1);
258
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
259
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
260
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
262
dataPtr = (uint8_t*)mxGetData(data);
263
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
255
This is a memory copy based code to fill the lsum and denom buffers
264
256
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);
266
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
268
258
// Loading various stuff
269
259
cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);
270
260
cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);
274
264
ps->cuda_nonzero_items[icp] = N;
276
if (N%BLOCK_SIZE) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE);
277
else ps->grid_size[icp] = N / BLOCK_SIZE;
266
if (N%BLOCK_SIZE_1D) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE_1D);
267
else ps->grid_size[icp] = N / BLOCK_SIZE_1D;
279
269
cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);
273
float *lsum_temp = ps->cuda_lsum_temp;
274
int step = ps->lsum_alloc_size * ps->lsum_alloc_size;
276
cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
277
cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));
279
vecPackBase<<<input_grid_dim, input_block_dim>>>(
282
lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
285
/* In general we should expect non-zero denominals, therefore the Nonzero array is not computed */
287
ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,
288
lsum_temp + (2 * step), lsum_temp + (3 * step),
289
lsum_temp, lsum_temp + step);
291
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
299
311
dim3 block_dim(size / 2 + 1, 1, 1);
300
312
dim3 grid_dim(size, 1, 1);
314
dim3 output_block_dim(size, 1, 1);
315
dim3 output_grid_dim(size, 1, 1);
302
317
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
303
318
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
304
319
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
316
331
cudaRealPtr = ps->cuda_result_buffer;
317
332
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);
319
uint16_t nz_items = ps->cuda_nonzero_items[icp];
320
uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;
322
334
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
323
335
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
327
// printf("%i (%i %i)\n", nz_items, icp, ps->ncp);
330
grids = ps->grid_size[icp];
339
int grids = ps->grid_size[icp];
340
uint16_t nz_items = ps->cuda_nonzero_items[icp];
341
uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;
332
dim3 output_block_dim(BLOCK_SIZE, 1, 1);
343
dim3 output_block_dim(BLOCK_SIZE_1D, 1, 1);
333
344
dim3 output_grid_dim(grids, 1, 1);
335
347
vecCompute<<<output_grid_dim, output_block_dim>>>(
336
348
nz, cudaResultPtr,
337
349
cudaRealPtr, 1./(size2 * (N2 - 1)),
356
vecCompute<<<output_grid_dim, output_block_dim>>>(
358
cudaRealPtr, 1./(size2 * (N2 - 1)),
359
cudaLSum, sum / (N2 * (N2 - 1)),
343
364
res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
344
365
ar = mxGetPr(res);
410
433
// Initialising, for now a single client is supported only
411
435
idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);
413
437
reportError("Initialization is failed");
417
442
// Detecting cuda devices
418
444
cudaGetDeviceCount(&deviceCount);
419
445
if (deviceCount) {
420
446
cudaGetDeviceProperties(&deviceProp, 0);
527
556
plhs[0] = fftCompute(ps, icp, input, input_sum, input_denom);
529
558
case ACTION_COMPUTE_BASE:
531
reportError("This action expects 5 arguments, but %i is passed", nrhs - 2);
562
#endif /* VALIDATE_LSUM */
564
reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);
557
586
iprop = ps->fft_size;
559
(mxGetNumberOfDimensions(lsum) != 2)||
560
(mxGetNumberOfDimensions(denom) != 2)||
561
(mxGetClassID(lsum) != mxSINGLE_CLASS)||
562
(mxGetClassID(denom) != mxSINGLE_CLASS)||
563
(mxGetClassID(nonzero) != mxUINT16_CLASS)||
564
(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
565
(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)
594
(mxGetNumberOfDimensions(lsum) != 2)||
595
(mxGetNumberOfDimensions(denom) != 2)||
596
(mxGetClassID(lsum) != mxSINGLE_CLASS)||
597
(mxGetClassID(denom) != mxSINGLE_CLASS)||
598
(mxGetClassID(nonzero) != mxUINT16_CLASS)||
599
(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
600
(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)
568
reportError("Invalid properties for base initialization are specified");
603
reportError("Invalid properties for base initialization are specified");
572
fftUploadBaseData(ps, icp, base, lsum, denom, nonzero);
611
#endif /* VALIDATE_LSUM */
613
fftUploadBaseData(ps, icp, base);
616
local_sum_validate(ps, icp, lsum, denom);
617
#endif /* VALIDATE_LSUM */
574
620
case ACTION_SETUP:
581
627
ps->ncp = iprop + 1;
583
629
iprop = (int)mxGetScalar(prhs[3]);
630
ps->corr_size = iprop;
584
631
ps->fft_size = 6 * iprop + 1;
585
632
ps->fft_size2 = ps->fft_size * ps->fft_size;
586
633
ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);
588
if (ps->fft_size2 % BLOCK_SIZE) {
589
ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE) + 1) * BLOCK_SIZE;
635
if (ps->fft_size2 % BLOCK_SIZE_1D) {
636
ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE_1D) + 1) * BLOCK_SIZE_1D;
591
638
ps->fft_alloc_size = ps->fft_size2;
641
ps->subimage_size = ps->corr_size * 4 + 1;
642
ps->lsum_size = ps->corr_size * 2 + 1;
644
ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
646
if (ps->fft_size % BLOCK_SIZE_2D) {
647
ps->lsum_short_aligned_size = ((ps->fft_size / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
649
ps->lsum_short_aligned_size = ps->fft_size;
652
if ((ps->lsum_temp_size) % BLOCK_SIZE_2D) {
653
ps->lsum_aligned_size = (((ps->lsum_temp_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
655
ps->lsum_aligned_size = ps->lsum_temp_size;
658
if ((ps->lsum_temp_size + ps->lsum_size) % BLOCK_SIZE_2D) {
659
ps->lsum_alloc_size = (((ps->lsum_temp_size + ps->lsum_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
661
ps->lsum_alloc_size = ps->lsum_temp_size + ps->lsum_size;
594
664
err = fftInit(ps);