287
static int fftComputeFragment(TProcessingState *ps, int icp, const mxArray *data, float sum, float denom) {
292
int size = ps->fft_size;
293
int size2 = size * size;
294
int alloc_size = ps->fft_alloc_size;
296
int half_size = ps->corr_size;
297
int N = 2 * half_size + 1;
299
// int N = mxGetM(data);
303
dim3 input_block_dim(N, 1, 1);
304
dim3 input_grid_dim(N, 1, 1);
306
dim3 block_dim(size / 2 + 1, 1, 1);
307
dim3 grid_dim(size, 1, 1);
309
dim3 output_block_dim(size, 1, 1);
310
dim3 output_grid_dim(size, 1, 1);
312
// uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;
313
// cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
314
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
315
// cufftComplex *cudaDataPtr = ps->cuda_data_buffer;
316
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
319
dataPtr = (uint8_t*)mxGetData(data);
320
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
322
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);
324
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);
326
vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaPtr, size/2+1);
328
cudaRealPtr = ps->cuda_result_buffer;
329
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);
332
cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;
334
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
335
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
337
vecCompute<<<output_grid_dim, output_block_dim>>>(
339
cudaRealPtr, 1./(size2 * (N2 - 1)),
340
cudaLSum, sum / (N2 * (N2 - 1)),
346
int size = ps->fft_size;
347
int size2 = size * size;
348
int alloc_size = ps->fft_alloc_size;
350
int half_size = ps->corr_size;
351
int N = 2 * half_size + 1;
354
dim3 output_block_dim(size, 1, 1);
355
dim3 output_grid_dim(size, 1, 1);
357
cufftReal *cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;
358
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
359
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
360
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
362
vecCompute<<<output_grid_dim, output_block_dim>>>(
364
cudaRealPtr, 1./(size2 * (N2 - 1)),
365
cudaLSum, sum / (N2 * (N2 - 1)),
373
static inline mxArray *fftCompute(TProcessingState *ps, int icp) {
374
int size = ps->fft_size;
375
int size2 = size * size;
376
int alloc_size = ps->fft_alloc_size;
377
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
379
// fftComputeFragment(ps, icp, data, sum, denom);
381
mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
382
float *ar = (float*)mxGetPr(res);
384
cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
389
int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
289
static inline void fftGetPoints(TProcessingState *ps, mxArray *result) {
291
mxGetData(result), ps->ncp * sizeof(float),
292
ps->cuda_cp, ps->ncp_alloc_size * sizeof(float),
293
ps->ncp * sizeof(float), 2,
294
cudaMemcpyDeviceToHost
299
static inline mxArray *fftSetPoints(TProcessingState *ps, mxArray *result) {
303
static int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
390
304
int width = mxGetN(image);
391
305
int height = mxGetM(image);
461
int cp_blocks, side_blocks;
375
int cp_blocks1, cp_blocks, side_blocks;
462
376
if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;
463
377
else cp_blocks = ncp / CP_BLOCK_SIZE;
465
379
if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;
466
380
else side_blocks = size / SIDE_BLOCK_SIZE;
469
int *stat_buf = (int*)ps->cuda_data_buffer;
471
float *sumbuf = ps->cuda_cp;// + 2*ps->ncp + icp;
472
float *stdbuf = ps->cuda_cp + CP_BLOCK;
382
int32_t *stat_buf = (int*)ps->cuda_data_buffer;
384
float *sumbuf = ps->cuda_cp + icp;
385
float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;
474
387
dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
475
388
dim3 joint_grid_dim(side_blocks, cp_blocks, 1);
477
390
stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
479
if (ncp%BLOCK_SIZE_1D) cp_blocks = (ncp / BLOCK_SIZE_1D) + 1;
480
else cp_blocks = ncp / BLOCK_SIZE_1D;
482
stat2<<<cp_blocks, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
485
float *cp = (float*)malloc(2*CP_BLOCK*sizeof(float));
486
cudaMemcpy(cp, ps->cuda_cp, 2*CP_BLOCK*sizeof(float), cudaMemcpyDeviceToHost);
489
float *stdbuf1 = cp + CP_BLOCK;
392
if (ncp%BLOCK_SIZE_1D) cp_blocks1 = (ncp / BLOCK_SIZE_1D) + 1;
393
else cp_blocks1 = ncp / BLOCK_SIZE_1D;
395
stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
493
398
dim3 output_block_dim(fft_size, 1, 1);
494
399
dim3 output_grid_dim(fft_size, 1, 1);
497
401
for (int i = 0;i < ncp;i++) {
498
402
float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;
499
403
float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;
417
float *xbuf = sumbuf;
418
float *ybuf = stdbuf;
420
int32_t *posbuf = (int*)ps->cuda_data_buffer;
421
float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);
423
int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);
425
dim3 result_block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
426
dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
427
find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);
428
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
435
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {
436
int size = ps->fft_size;
437
int size2 = size * size;
438
int alloc_size = ps->fft_alloc_size;
439
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
441
fftLoadFragment(ps, icp, 1, image);
443
mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
444
float *ar = (float*)mxGetPr(res);
446
cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
450
#endif /* VALIDATE_PEAK */
518
453
static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {
665
598
// reportMessage("Executing normxcorr_hw action: %u", action);
667
600
switch (action) {
668
602
case ACTION_COMPUTE_FRAGMENT:
671
reportError("This action expects 4 arguments, but %i is passed", nrhs - 2);
676
reportError("This action expects single ouput, but %i is passed", nlhs);
680
603
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
681
/* if (icp >= ps->ncp) {
682
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
688
if (mxGetNumberOfDimensions(input) != 2) {
689
reportError("Invalid dimensionality of input matrix, 2D matrix is expected");
693
if (mxGetClassID(input) != mxUINT8_CLASS) {
694
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(input));
698
input_sum = mxGetScalar(prhs[4]);
699
input_denom = mxGetScalar(prhs[5]);
702
plhs[0] = fftCompute(ps, icp);
604
plhs[0] = fftCompute(ps, icp, prhs[3]);
606
#endif /* VALIDATE_PEAK */
704
607
case ACTION_COMPUTE:
706
609
reportError("This action expects 1 argument, but %i is passed", nrhs - 2);
813
730
iprop = (int)mxGetScalar(prhs[3]);
814
731
ps->corr_size = iprop;
815
732
ps->fft_size = 6 * iprop + 1;
816
ps->fft_size2 = ps->fft_size * ps->fft_size;
817
ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);
819
if (ps->fft_size % SIDE_BLOCK_SIZE) ps->side_alloc_size = (ps->fft_size / SIDE_BLOCK_SIZE + 1) * SIDE_BLOCK_SIZE;
820
else ps->side_alloc_size = ps->fft_size;
823
if (ps->fft_size2 % BLOCK_SIZE_1D) {
824
ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE_1D) + 1) * BLOCK_SIZE_1D;
826
ps->fft_alloc_size = ps->fft_size2;
829
733
ps->subimage_size = ps->corr_size * 4 + 1;
735
ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
736
ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
737
ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);
830
739
ps->lsum_size = ps->corr_size * 2 + 1;
832
740
ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
834
if (ps->fft_size % BLOCK_SIZE_2D) {
835
ps->lsum_short_aligned_size = ((ps->fft_size / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
837
ps->lsum_short_aligned_size = ps->fft_size;
840
if ((ps->lsum_temp_size) % BLOCK_SIZE_2D) {
841
ps->lsum_aligned_size = (((ps->lsum_temp_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
843
ps->lsum_aligned_size = ps->lsum_temp_size;
846
if ((ps->lsum_temp_size + ps->lsum_size) % BLOCK_SIZE_2D) {
847
ps->lsum_alloc_size = (((ps->lsum_temp_size + ps->lsum_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;
849
ps->lsum_alloc_size = ps->lsum_temp_size + ps->lsum_size;
742
ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
743
ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
744
ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);
852
746
err = fftInit(ps);