19
17
static TProcessingState *pstate = NULL;
21
19
static void fftFree(TProcessingState *ps) {
22
if (ps->cuda_base_buffer) {
20
if (ps->cuda_fft_buffer) {
21
if (ps->coords) mxDestroyArray(ps->coords);
23
if (ps->banlist) free(ps->banlist);
23
25
cudaFree(ps->cuda_lsum_temp);
25
27
cudaFree(ps->cuda_lsum_buffer);
26
28
cudaFree(ps->cuda_denom_buffer);
29
cudaFree(ps->cuda_fft_buffer);
28
cudaFree(ps->cuda_temp_buffer);
31
cudaFree(ps->cuda_data_buffer);
32
cudaFree(ps->cuda_base_buffer);
29
34
cudaFree(ps->cuda_final_buffer);
30
35
cudaFree(ps->cuda_result_buffer);
31
cudaFree(ps->cuda_data_buffer);
32
cudaFree(ps->cuda_base_buffer);
36
cudaFree(ps->cuda_temp_buffer);
33
37
cudaFree(ps->cuda_input_buffer);
34
38
cudaFreeHost(ps->input_buffer);
36
40
cudaFree(ps->cuda_cp);
38
cudaFreeHost(ps->data_x);
39
cudaFreeHost(ps->data_y);
41
ps->cuda_base_buffer = NULL;
42
cudaFreeHost(ps->points);
44
45
// DS: Source of bug, that occasionaly can corrupt something ...
45
46
if (ps->cudpp_initialized) {
46
47
cudppDestroyPlan(ps->cudpp_plan);
47
ps->cudpp_initialized = false;
50
50
if (ps->fft_initialized) {
51
51
cufftDestroy(ps->cufft_r2c_plan);
52
52
cufftDestroy(ps->cufft_c2r_plan);
53
// cufftDestroy(ps->cufft_plan);
55
ps->fft_initialized = false;
55
memset(ps, 0, sizeof(TProcessingState));
224
247
TProcessingState *ps;
226
249
ps = (TProcessingState*)malloc(sizeof(TProcessingState));
229
ps->cuda_base_buffer = NULL;
230
ps->fft_initialized = false;
231
ps->cudpp_initialized = false;
250
if (ps) memset(ps, 0, sizeof(TProcessingState));
238
static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
241
if (!ps->fft_initialized) {
242
reportError("cuFFT engine is not initialized yet");
246
int size = ps->fft_size;
254
static inline int fftCalibrate(TProcessingState *ps, const mxArray *image) {
255
int width = mxGetN(image);
256
int height = mxGetM(image);
258
int size = 2 * ps->corr_size + 1;
259
int size2 = size * size;
261
int base_size = 4 * ps->corr_size + 1;
262
int base_size2 = base_size * base_size;
264
// printf("%u %u %u\n", width*height, ps->ncp*size2, ps->ncp*base_size2);
266
if (width * height > ps->ncp * size2) {
272
// if not enoguh space for caching enable anyway ?
273
if (width * height > ps->ncp * base_size2) {
279
ps->maxx = width - 1;
281
ps->maxy = height - 1;
289
static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
290
int width = mxGetN(image);
291
int height = mxGetM(image);
293
int check_mode = ((ps->base_mode)&&(!ps->mode));
294
float minx, miny, maxx, maxy;
296
int precision = ps->precision;
298
int half_size = 2 * ps->corr_size;
299
int size = 2 * half_size + 1;
301
int fft_size = ps->fft_size;
303
int ncp_alloc = ps->ncp_alloc_size;
247
304
int alloc_size = ps->fft_alloc_size;
249
int N = mxGetM(data);
252
305
int side_alloc = ps->side_alloc_size;
253
306
int side_alloc2 = side_alloc * side_alloc;
255
dim3 input_block_dim(N, 1, 1);
256
dim3 input_grid_dim(N, 1, 1);
258
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;
259
cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
260
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
262
dataPtr = (uint8_t*)mxGetData(data);
263
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
308
uint8_t *banlist = ps->banlist + icp;
310
float *data_x = ps->points + icp;
311
float *data_y = data_x + ncp_alloc;
313
float *frac_x = ps->points + 4 * ncp_alloc + icp;
314
float *frac_y = frac_x + ncp_alloc;
316
uint8_t *fullimg = ((uint8_t*)mxGetData(image));
317
uint8_t *img = ps->input_buffer;
265
319
float *lsum_temp = ps->cuda_lsum_temp;
266
int step = ps->lsum_alloc_size * ps->lsum_alloc_size;
268
cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
269
cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));
271
vecPackBase<<<input_grid_dim, input_block_dim>>>(
274
lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
320
int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
322
dim3 input_block_dim(size, 1, 1);
323
dim3 input_grid_dim(size, 1, 1);
325
cufftReal *cudaRealPtr = ps->cuda_base_buffer;
334
for (int i = 0;i < ncp;i++) {
335
float x = data_x[i] - 1;
336
float y = data_y[i] - 1;
338
frac_x[i] = x - round(x * precision) / precision;
339
frac_y[i] = y - round(y * precision) / precision;
341
int xstart = roundf(x) - half_size;
342
int ystart = roundf(y) - half_size;
344
int xend = xstart + size;
345
int yend = xstart + size;
347
if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
352
if (xstart < minx) minx = xstart;
353
if (ystart < miny) miny = ystart;
354
if (xend > maxx) maxx = xend;
355
if (yend > maxy) maxy = yend;
360
size * sizeof(uint8_t),
361
fullimg + (xstart * height + ystart),
362
height * sizeof(uint8_t),
363
size * sizeof(uint8_t),
368
// Somehow check for constancy
369
// sum(sub_base(:)) == sub_base(1)*numel(sub_base)
370
// The values of TEMPLATE cannot all be the same
372
uint8_t *cudaInputPtr = ps->cuda_input_buffer + (icp + i) * side_alloc2;
373
cufftComplex *cudaPtr = ps->cuda_fft_buffer + (icp + i) * alloc_size;
376
cudaInputPtr, side_alloc * sizeof(uint8_t),
377
img, size * sizeof(uint8_t),
378
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
381
cudaMemset((void*)(ps->cuda_lsum_temp + 2*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));
382
cudaMemset((void*)(ps->cuda_lsum_temp + 3*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));
384
vecPackBase<<<input_grid_dim, input_block_dim>>>(
385
cudaInputPtr, side_alloc,
386
cudaRealPtr, fft_size,
387
lsum_temp, lsum_temp + lsum_step, ps->lsum_alloc_size, ps->lsum_size
277
390
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
279
ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,
280
lsum_temp + (2 * step), lsum_temp + (3 * step),
281
lsum_temp, lsum_temp + step);
283
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
289
static inline void fftGetPoints(TProcessingState *ps, mxArray *result) {
291
mxGetData(result), ps->ncp * sizeof(float),
292
ps->cuda_cp, ps->ncp_alloc_size * sizeof(float),
293
ps->ncp * sizeof(float), 2,
294
cudaMemcpyDeviceToHost
299
static inline mxArray *fftSetPoints(TProcessingState *ps, mxArray *result) {
303
static int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
392
ps->cuda_lsum_buffer + (icp + i) * alloc_size, ps->cuda_denom_buffer + (icp + i) * alloc_size,
393
lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
394
lsum_temp, lsum_temp + lsum_step);
396
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
412
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
304
413
int width = mxGetN(image);
305
414
int height = mxGetM(image);
427
549
find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);
428
550
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
555
static inline mxArray *fftGetPoints(TProcessingState *ps) {
559
int ncp_alloc = ps->ncp_alloc_size;
560
int precision = ps->precision;
562
uint8_t *banlist = ps->banlist;
564
float *res_x = (float*)mxGetData(ps->coords);
565
float *res_y = res_x + ncp;
567
float *data_x, *data_y;
572
data_x = ps->points + 2 * ncp_alloc;
573
data_y = data_x + ncp_alloc;
577
float *frac_x = ps->points + 4 * ncp_alloc;
578
float *frac_y = frac_x + ncp_alloc;
580
float *move_x = ps->points + 6 * ncp_alloc;
581
float *move_y = move_x + ncp_alloc;
584
move_x, ncp_alloc * sizeof(float),
585
ps->cuda_cp, ncp_alloc * sizeof(float),
586
ps->ncp * sizeof(float), 2,
587
cudaMemcpyDeviceToHost
590
for (int i = 0;i < ncp;i++) {
592
res_x[i] = data_x[i];
593
res_y[i] = data_y[i];
597
frac = data_x[i] - round(data_x[i]*precision)/precision;
598
res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);
600
frac = data_y[i] - round(data_y[i]*precision)/precision;
601
res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
606
#ifdef USE_UNDOCUMENTED
607
return mxCreateSharedDataCopy(ps->coords);
608
// mxArray *mxCreateSharedDataCopy(const mxArray *pr);
609
// bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy); // true if not successful
610
// mxArray *mxUnreference(const mxArray *pr);
611
#else /* USE_UNDOCUMENTED */
612
return mxDuplicateArray(ps->coords);
613
#endif /* USE_UNDOCUMENTED */
618
static inline int fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
621
if (!ps->fft_initialized) {
622
reportError("cuFFT engine is not initialized yet");
626
int size = ps->fft_size;
627
int alloc_size = ps->fft_alloc_size;
629
int N = mxGetM(data);
632
int side_alloc = ps->side_alloc_size;
633
int side_alloc2 = side_alloc * side_alloc;
635
dim3 input_block_dim(N, 1, 1);
636
dim3 input_grid_dim(N, 1, 1);
638
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;
639
cufftReal *cudaRealPtr = ps->cuda_base_buffer;
641
dataPtr = (uint8_t*)mxGetData(data);
642
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
644
float *lsum_temp = ps->cuda_lsum_temp;
645
int step = ps->lsum_alloc_size * ps->lsum_alloc_size;
647
cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
648
cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));
650
vecPackBase<<<input_grid_dim, input_block_dim>>>(
653
lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
656
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
658
ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,
659
lsum_temp + (2 * step), lsum_temp + (3 * step),
660
lsum_temp, lsum_temp + step);
663
We don't really want to compute here
664
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_buffer + icp * alloc_size);
669
#endif /* VALIDATE_LSUM */
434
671
#ifdef VALIDATE_PEAK
435
672
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {
436
673
int size = ps->fft_size;
437
674
int size2 = size * size;
438
675
int alloc_size = ps->fft_alloc_size;
439
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
441
677
fftLoadFragment(ps, icp, 1, image);
443
679
mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
444
681
float *ar = (float*)mxGetPr(res);
446
cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
683
cudaMemcpy(ar, ps->cuda_final_buffer + icp * alloc_size, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
688
static inline mxArray *fftGetCorrections(TProcessingState *ps) {
690
int ncp_alloc = ps->ncp_alloc_size;
692
float *move_x = ps->points + 6 * ncp_alloc;
693
float *move_y = move_x + ncp_alloc;
696
move_x, ncp_alloc * sizeof(float),
697
ps->cuda_cp, ncp_alloc * sizeof(float),
698
ps->ncp * sizeof(float), 2,
699
cudaMemcpyDeviceToHost
702
mxArray *res = mxCreateNumericMatrix(ncp, 2, mxSINGLE_CLASS, mxREAL);
703
float *res_x = (float*)mxGetData(res);
704
float *res_y = res_x + ncp;
706
memcpy(res_x, move_x, ncp * sizeof(float));
707
memcpy(res_y, move_y, ncp * sizeof(float));
450
711
#endif /* VALIDATE_PEAK */
453
static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {
454
cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);
459
714
static void selfClean() {
602
866
case ACTION_COMPUTE_FRAGMENT:
603
867
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
604
868
plhs[0] = fftCompute(ps, icp, prhs[3]);
869
//fftGetCorrections(TProcessingState *ps, mxArray *result)
871
case ACTION_GET_CORRECTIONS:
872
plhs[0] = fftGetCorrections(ps);
606
874
#endif /* VALIDATE_PEAK */
609
reportError("This action expects 1 argument, but %i is passed", nrhs - 2);
615
if (mxGetClassID(input) != mxUINT8_CLASS) {
616
reportError("Invalid type of image data, should be 8bit integers");
620
for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
621
err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);
625
case ACTION_COMPUTE_BASE:
627
875
#ifdef VALIDATE_LSUM
629
#endif /* VALIDATE_LSUM */
631
reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);
876
case ACTION_COMPUTE_BASE_FRAGMENT:
877
if ((nrhs != 4)&&(nrhs != 7)) {
878
reportError("ComputeBaseFragment action expects 2 arguments, but %i is passed", nrhs - 2);
678
#endif /* VALIDATE_LSUM */
680
925
fftUploadBaseData(ps, icp, base);
683
926
local_sum_validate(ps, icp, lsum, denom);
684
928
#endif /* VALIDATE_LSUM */
931
reportError("Compute action expects 1 argument, but %i is passed", nrhs - 2);
937
if (mxGetClassID(input) != mxUINT8_CLASS) {
938
reportError("Invalid type of image data, should be 8bit integers");
942
for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
943
err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);
948
case ACTION_COMPUTE_BASE:
950
reportError("ComputeBase action expects 1 argument, but %i is passed", nrhs - 2);
954
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
955
if (icp >= ps->ncp) {
956
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
962
if (mxGetNumberOfDimensions(base) != 2) {
963
reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
967
if (mxGetClassID(base) != mxUINT8_CLASS) {
968
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
972
width = mxGetN(base);
973
height = mxGetM(base);
975
size = 2 * ps->corr_size + 1;
978
base_size = 4 * ps->corr_size + 1;
979
base_size2 = base_size * base_size;
981
if (width * height > ps->ncp * size2) {
987
// if not enoguh space for caching enable anyway ?
988
if (width * height > ps->ncp * base_size2) {
994
ps->maxx = width - 1;
996
ps->maxy = height - 1;
1000
for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
1001
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), base);
1005
if ((ps->base_mode)&&(!ps->mode)) {
1006
// printf("%ux%u\n", width, height);
1008
// Correcting difference of area size between base and data images
1009
ps->minx += ps->corr_size;
1010
ps->miny += ps->corr_size;
1011
ps->maxx -= ps->corr_size;
1012
ps->maxy -= ps->corr_size;
1014
width = ceil(ps->maxx) - floor(ps->minx);
1015
height = ceil(ps->maxy) - floor(ps->miny);
1017
// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);
1018
if (width * height < ps->ncp * size2) {
1024
reportMessage("Running in the image mode");
1026
reportMessage("Running in the fragment mode");
1029
case ACTION_SET_BASE_POINTS:
1031
reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
1038
if ( (mxGetClassID(x) != mxSINGLE_CLASS)||
1039
(mxGetClassID(y) != mxSINGLE_CLASS)||
1040
(mxGetN(x)*mxGetM(x) != ps->ncp)||
1041
(mxGetN(y)*mxGetM(y) != ps->ncp)
1043
reportError("Invalid control points are specified");
1047
memcpy(ps->points, mxGetData(x), ps->ncp * sizeof(float));
1048
memcpy(ps->points + ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));
687
1050
case ACTION_SET_POINTS:
688
1051
if (nrhs != 4) {