5
#include <cuda_runtime.h>
14
//#include "normxcorr_hw_pack.h"
15
#include "normxcorr_hw_msg.h"
17
#include "normxcorr_hw_kernel.cu"
23
ACTION_COMPUTE_BASE = 10,
24
ACTION_COMPUTE_FRAGMENT = 11,
28
struct STProcessingState {
29
cufftComplex *cuda_base_buffer; // Stored FFT's of the template image
30
cufftComplex *cuda_data_buffer; // Main computational buffer
31
cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs
32
cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs
33
float *cuda_final_buffer; // Ultimate output
34
uint8_t *cuda_input_buffer; // Input buffer
36
float *cuda_lsum_buffer;
37
float *cuda_denom_buffer;
40
uint16_t *cuda_nonzero_items;
41
uint16_t *cuda_nonzero_buffer;
43
int ncp; // Number of control points
44
int fft_size; // Matrix Size for FFT (base_size + input_size - 1)
45
int fft_size2; // size * size
46
int fft_alloc_size; // cuda optimized size2
47
int fft_inner_size; // size * (size/2 + 1), R2C/C2R
49
int fft_initialized; // Flag indicating if CUFFT plan is initialized
50
cufftHandle cufft_plan;
51
cufftHandle cufft_r2c_plan;
52
cufftHandle cufft_c2r_plan;
55
typedef struct STProcessingState TProcessingState;
56
static TProcessingState *pstate = NULL;
59
static void fftFree(TProcessingState *ps) {
60
if (ps->fft_initialized) {
61
cufftDestroy(ps->cufft_r2c_plan);
62
cufftDestroy(ps->cufft_c2r_plan);
63
// cufftDestroy(ps->cufft_plan);
65
ps->fft_initialized = false;
68
if (ps->cuda_base_buffer) {
70
free(ps->cuda_nonzero_items);
72
cudaFree(ps->cuda_lsum_buffer);
73
cudaFree(ps->cuda_denom_buffer);
74
cudaFree(ps->cuda_nonzero_buffer);
76
cudaFree(ps->cuda_temp_buffer);
77
cudaFree(ps->cuda_final_buffer);
78
cudaFree(ps->cuda_result_buffer);
79
cudaFree(ps->cuda_data_buffer);
80
cudaFree(ps->cuda_base_buffer);
81
cudaFree(ps->cuda_input_buffer);
83
ps->cuda_base_buffer = NULL;
87
static void fftInit(TProcessingState *ps) {
91
// cufftPlan2d(&ps->cufft_plan, ps->fft_size, ps->fft_size, CUFFT_C2C);
93
cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);
94
cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_size, ps->fft_size, CUFFT_C2R);
96
ps->fft_initialized = true;
98
cudaMalloc((void**)&ps->cuda_base_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));
100
cudaMalloc((void**)&ps->cuda_data_buffer, ps->fft_alloc_size * sizeof(cufftComplex));
101
cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t));
102
cudaMalloc((void**)&ps->cuda_final_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));
103
cudaMemset((void*)ps->cuda_final_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(float));
104
cudaMalloc((void**)&ps->cuda_result_buffer, ps->fft_alloc_size * sizeof(cufftReal));
105
cudaMalloc((void**)&ps->cuda_temp_buffer, ps->fft_alloc_size * sizeof(cufftReal));
106
cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));
108
cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));
109
cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));
110
cudaMalloc((void**)&ps->cuda_nonzero_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));
111
cudaMemset((void*)ps->cuda_nonzero_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));
113
ps->cuda_nonzero_items = (uint16_t*)malloc(ps->ncp * sizeof(uint16_t));
115
ps->grid_size = (int*)malloc(ps->ncp*sizeof(int));
118
static void fftPrepare(TProcessingState *ps) {
119
if (ps->fft_initialized) {
120
// Since template and current image have different neighbourhoud sizes
121
cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));
127
void pstateFree(TProcessingState *ps) {
134
TProcessingState *pstateInit() {
135
TProcessingState *ps;
137
ps = (TProcessingState*)malloc(sizeof(TProcessingState));
140
ps->cuda_base_buffer = NULL;
141
ps->fft_initialized = false;
146
static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data, const mxArray *lsum, const mxArray *denom, const mxArray *nonzero) {
149
if (!ps->fft_initialized) {
150
reportError("cuFFT engine is not initialized yet");
154
int size = ps->fft_size;
155
int size2 = size*size;
156
int alloc_size = ps->fft_alloc_size;
158
int N = mxGetM(data);
161
dim3 input_block_dim(N, 1, 1);
162
dim3 input_grid_dim(N, 1, 1);
164
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
165
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
166
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
168
dataPtr = (uint8_t*)mxGetData(data);
169
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
170
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);
171
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
173
// Loading various stuff
174
cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);
175
cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);
179
ps->cuda_nonzero_items[icp] = N;
181
if (N%BLOCK_SIZE) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE);
182
else ps->grid_size[icp] = N / BLOCK_SIZE;
184
cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);
189
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *data, float sum, float denom) {
194
int size = ps->fft_size;
195
int size2 = size * size;
196
int alloc_size = ps->fft_alloc_size;
198
int N = mxGetM(data);
201
dim3 input_block_dim(N, 1, 1);
202
dim3 input_grid_dim(N, 1, 1);
204
dim3 block_dim(size / 2 + 1, 1, 1);
205
dim3 grid_dim(size, 1, 1);
207
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
208
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
209
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
210
cufftComplex *cudaDataPtr = ps->cuda_data_buffer;
211
float *cudaResultPtr = ps->cuda_final_buffer;
213
dataPtr = (uint8_t*)mxGetData(data);
214
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
215
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);
216
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);
218
vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaPtr, size/2+1);
220
cudaRealPtr = ps->cuda_result_buffer;
221
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);
223
uint16_t nz_items = ps->cuda_nonzero_items[icp];
224
uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;
226
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
227
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
231
// printf("%i (%i %i)\n", nz_items, icp, ps->ncp);
234
grids = ps->grid_size[icp];
236
dim3 output_block_dim(BLOCK_SIZE, 1, 1);
237
dim3 output_grid_dim(grids, 1, 1);
239
vecCompute<<<output_grid_dim, output_block_dim>>>(
241
cudaRealPtr, 1./(size2 * (N2 - 1)),
242
cudaLSum, sum / (N2 * (N2 - 1)),
247
res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
250
cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
257
static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {
258
cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);
263
static void selfClean() {
264
reportMessage("Unloading normxcorr_hw");
273
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
279
TProcessingState *ps;
285
const mxArray *input;
288
const mxArray *denom;
289
const mxArray *nonzero;
297
reportMessage("Initializing normxcorr_hw instance");
300
reportError("You should accept a single result from initialization call");
305
reportError("Only a single calculation process is supported at the moment");
310
// Initialising, for now a single client is supported only
311
idMatrix = mxCreateNumericMatrix(1, 1, mxUINT32_CLASS, mxREAL);
313
reportError("Initialization is failed");
317
pstate = pstateInit();
319
mxDestroyArray(idMatrix);
320
reportError("State structure initialization is failed");
326
idPtr = (uint32_t*)mxGetData(idMatrix);
331
mexAtExit(selfClean);
335
/* idMatrix = (mxArray*)prhs[0];
336
if ((mxGetClassID(idMatrix) != mxUINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {
337
reportError("Invalid parameter is supplied in place of process identificator");
341
idPtr = (uint32_t*)mxGetData(idMatrix);
343
reportError("Mex is not able to obtain process identificator");
349
reportError("Invalid process identificator is supplied");
354
reportError("The interface is not initialized");
361
reportMessage("Cleaning normxcorr_hw instance");
370
action = (TAction)mxGetScalar((mxArray*)prhs[1]);
372
// reportMessage("Executing normxcorr_hw action: %u", action);
375
case ACTION_COMPUTE_FRAGMENT:
378
reportError("This action expects 4 arguments, but %i is passed", nrhs - 2);
383
reportError("This action expects single ouput, but %i is passed", nlhs);
387
icp = (unsigned int)mxGetScalar(prhs[2]);
388
/* if (icp >= ps->ncp) {
389
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
395
if (mxGetNumberOfDimensions(input) != 2) {
396
reportError("Invalid dimensionality of input matrix, 2D matrix is expected");
400
if (mxGetClassID(input) != mxUINT8_CLASS) {
401
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(input));
406
input_sum = mxGetScalar(prhs[4]);
407
input_denom = mxGetScalar(prhs[5]);
409
plhs[0] = fftCompute(ps, icp, input, input_sum, input_denom);
411
case ACTION_COMPUTE_BASE:
413
reportError("This action expects 5 arguments, but %i is passed", nrhs - 2);
417
icp = (unsigned int)mxGetScalar(prhs[2]);
418
if (icp >= ps->ncp) {
419
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
425
if (mxGetNumberOfDimensions(base) != 2) {
426
reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
430
if (mxGetClassID(base) != mxUINT8_CLASS) {
431
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
439
iprop = ps->fft_size;
441
(mxGetNumberOfDimensions(lsum) != 2)||
442
(mxGetNumberOfDimensions(denom) != 2)||
443
(mxGetClassID(lsum) != mxSINGLE_CLASS)||
444
(mxGetClassID(denom) != mxSINGLE_CLASS)||
445
(mxGetClassID(nonzero) != mxUINT16_CLASS)||
446
(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
447
(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)
450
reportError("Invalid properties for base initialization are specified");
454
fftUploadBaseData(ps, icp, base, lsum, denom, nonzero);
458
reportError("SETUP action expects 'ncp' and 'corrsize' parameters");
462
iprop = (int)mxGetScalar(prhs[2]);
465
iprop = (int)mxGetScalar(prhs[3]);
466
ps->fft_size = 6 * iprop + 1;
467
ps->fft_size2 = ps->fft_size * ps->fft_size;
468
ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);
470
if (ps->fft_size2 % BLOCK_SIZE) {
471
ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE) + 1) * BLOCK_SIZE;
473
ps->fft_alloc_size = ps->fft_size2;
482
reportError("Unknown request %i", action);