477
425
copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
478
426
copy_params.kind = cudaMemcpyHostToDevice;
479
cudaMemcpy3D(©_params);
482
dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
483
dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
428
cudaMemcpy3DAsync(©_params, stream0);
433
static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
434
static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
436
static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
437
int half_size = ps->corr_size;
438
int size = 2 * half_size + 1;
440
int fft_real_size = ps->fft_real_size;
442
int ncp_alloc = ps->ncp_alloc_size;
443
int alloc_size = ps->fft_alloc_size;
444
int side_alloc = ps->side_alloc_size;
445
int side_alloc2 = side_alloc * side_alloc;
447
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
448
float *cuda_data_buffer = ps->cuda_data_buffer;
485
450
int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
486
451
int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
487
452
int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
488
453
int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
489
int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
491
// Computing sum and std
455
float *sumbuf = ps->cuda_points + icp;
456
float *stdbuf = ps->cuda_points + ncp_alloc + icp;
492
458
int32_t *stat_buf = (int*)ps->cuda_temp_buffer;
494
float *sumbuf = ps->cuda_points + icp;
495
float *stdbuf = ps->cuda_points + ps->ncp_alloc_size + icp;
497
460
dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
498
stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
499
stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
461
stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);
462
stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
501
464
// Packing input data for FFT
502
465
dim3 input_grid_dim(input_blocks, cp_blocks, 1);
504
467
if (ps->side_blocks_power < 0) {
505
vecPack<<<input_grid_dim, block_side_cp>>>(
468
vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(
506
469
cuda_input_buffer, side_alloc2, side_alloc,
507
470
cuda_data_buffer, alloc_size, fft_real_size,
508
471
size, side_blocks
511
vecPackFast<<<input_grid_dim, block_side_cp>>>(
474
vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(
512
475
cuda_input_buffer, side_alloc2, side_alloc,
513
476
cuda_data_buffer, alloc_size, fft_real_size,
514
477
size, ps->side_blocks_power
519
cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;
521
for (int i = 0;i < ncp;i++) {
522
if (banlist[i]) continue;
523
cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
526
int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
527
dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
528
vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);
530
// First in-place transform for some reason is failing, therefore we
531
// have one alloc_size spacing between starts
484
static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
485
int half_size = ps->corr_size;
486
int size = 2 * half_size + 1;
487
int size2 = size * size;
489
int fft_size = ps->fft_size;
490
int fft_real_size = ps->fft_real_size;
492
int ncp_alloc = ps->ncp_alloc_size;
493
int alloc_size = ps->fft_alloc_size;
494
int side_alloc = ps->side_alloc_size;
496
int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
497
int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
498
int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
532
500
cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
533
for (int i = 0;i < ncp;i++) {
534
if (banlist[i]) continue;
535
cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);
538
501
float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;
503
float *sumbuf = ps->cuda_points + icp;
504
float *stdbuf = ps->cuda_points + ncp_alloc + icp;
540
506
// Use real size everthere
541
507
// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);
542
// vecCompute<<<compute_grid_dim, block_side_cp>>>(
508
// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(
543
509
// cuda_final_buffer,
544
510
// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),
545
511
// ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
569
538
// Use real size everthere
570
539
// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);
571
540
// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
572
find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
573
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
578
static inline mxArray *fftGetPoints(TProcessingState *ps) {
542
find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
543
find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
548
static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
549
int fft_real_size = ps->fft_real_size;
551
int alloc_size = ps->fft_alloc_size;
553
uint8_t *banlist = ps->banlist + icp;
554
float *cuda_data_buffer = ps->cuda_data_buffer;
556
int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
559
cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;
561
cufftSetStream(ps->cufft_r2c_plan, stream0);
562
cufftSetStream(ps->cufft_c2r_plan, stream0);
564
for (int i = 0;i < ncp;i++) {
565
if (banlist[i]) continue;
566
cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
569
int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
570
dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
571
vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);
573
// First in-place transform for some reason is failing, therefore we
574
// have one alloc_size spacing between starts (see cuda_fft_buffer set above)
575
cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
576
for (int i = 0;i < ncp;i++) {
577
if (banlist[i]) continue;
578
cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);
585
static inline int fftGetCurrentPoints(DICTContext ps) {
587
int ncp_alloc = ps->ncp_alloc_size;
588
int precision = ps->precision;
590
float *move_x = ps->points + 6 * ncp_alloc;
591
float *move_y = move_x + ncp_alloc;
594
move_x, ncp_alloc * sizeof(float),
595
ps->cuda_points, ncp_alloc * sizeof(float),
596
ps->ncp * sizeof(float), 2,
597
cudaMemcpyDeviceToHost
600
float *data_x, *data_y;
605
data_x = ps->points + 2 * ncp_alloc;
606
data_y = data_x + ncp_alloc;
609
float *res_x, *res_y;
610
if ((ps->res_x)&&(ps->res_y)) {
582
int ncp_alloc = ps->ncp_alloc_size;
583
int precision = ps->precision;
621
float *frac_x = ps->points + 4 * ncp_alloc;
622
float *frac_y = frac_x + ncp_alloc;
585
623
uint8_t *banlist = ps->banlist;
587
float *res_x = (float*)mxGetData(ps->coords);
588
float *res_y = res_x + ncp;
590
float *data_x, *data_y;
595
data_x = ps->points + 2 * ncp_alloc;
596
data_y = data_x + ncp_alloc;
600
float *frac_x = ps->points + 4 * ncp_alloc;
601
float *frac_y = frac_x + ncp_alloc;
603
float *move_x = ps->points + 6 * ncp_alloc;
604
float *move_y = move_x + ncp_alloc;
607
move_x, ncp_alloc * sizeof(float),
608
ps->cuda_points, ncp_alloc * sizeof(float),
609
ps->ncp * sizeof(float), 2,
610
cudaMemcpyDeviceToHost
613
625
for (int i = 0;i < ncp;i++) {
615
res_x[i] = data_x[i];
616
res_y[i] = data_y[i];
620
frac = data_x[i] - round(data_x[i]*precision)/precision;
621
res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);
623
frac = data_y[i] - round(data_y[i]*precision)/precision;
624
res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
629
#ifdef USE_UNDOCUMENTED
630
return mxCreateSharedDataCopy(ps->coords);
631
// mxArray *mxCreateSharedDataCopy(const mxArray *pr);
632
// bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy); // true if not successful
633
// mxArray *mxUnreference(const mxArray *pr);
634
#else /* USE_UNDOCUMENTED */
635
return mxDuplicateArray(ps->coords);
636
#endif /* USE_UNDOCUMENTED */
641
static inline int fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
644
if (!ps->fft_initialized) {
645
reportError("cuFFT engine is not initialized yet");
649
int size = ps->fft_size;
650
int alloc_size = ps->fft_alloc_size;
652
int N = mxGetM(data);
655
int side_alloc = ps->side_alloc_size;
656
int side_alloc2 = side_alloc * side_alloc;
658
dim3 input_block_dim(N, 1, 1);
659
dim3 input_grid_dim(N, 1, 1);
661
uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;
662
cufftReal *cudaRealPtr = ps->cuda_base_buffer;
664
dataPtr = (uint8_t*)mxGetData(data);
665
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
667
float *lsum_temp = ps->cuda_lsum_temp;
668
int step = ps->lsum_alloc_size * ps->lsum_alloc_size;
670
cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
671
cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));
673
vecPackBase<<<input_grid_dim, input_block_dim>>>(
676
lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
679
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
681
ps->cuda_lsum_cache + icp * alloc_size, ps->cuda_denom_cache + icp * alloc_size,
682
lsum_temp + (2 * step), lsum_temp + (3 * step),
683
lsum_temp, lsum_temp + step);
686
We don't really want to compute here
687
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_cache + icp * alloc_size);
627
res_x[i] = data_x[i];
628
res_y[i] = data_y[i];
632
frac = data_x[i] - round(data_x[i]*precision)/precision;
633
res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);
635
frac = data_y[i] - round(data_y[i]*precision)/precision;
636
res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
692
#endif /* VALIDATE_LSUM */
695
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {
696
int size = ps->fft_size;
697
int size2 = size * size;
698
int alloc_size = ps->fft_alloc_size;
700
fftLoadFragment(ps, icp, 1, image);
702
mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
704
float *ar = (float*)mxGetPr(res);
706
cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
707
float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;
708
cudaMemcpy(ar, cuda_final_buffer, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
713
static inline mxArray *fftGetCorrections(TProcessingState *ps) {
715
int ncp_alloc = ps->ncp_alloc_size;
717
float *move_x = ps->points + 6 * ncp_alloc;
718
float *move_y = move_x + ncp_alloc;
721
move_x, ncp_alloc * sizeof(float),
722
ps->cuda_points, ncp_alloc * sizeof(float),
723
ps->ncp * sizeof(float), 2,
724
cudaMemcpyDeviceToHost
727
mxArray *res = mxCreateNumericMatrix(ncp, 2, mxSINGLE_CLASS, mxREAL);
728
float *res_x = (float*)mxGetData(res);
729
float *res_y = res_x + ncp;
731
memcpy(res_x, move_x, ncp * sizeof(float));
732
memcpy(res_y, move_y, ncp * sizeof(float));
736
#endif /* VALIDATE_PEAK */
739
static void selfClean() {
741
reportMessage("Self-cleaning normxcorr_hw instance");
749
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
754
cudaDeviceProp deviceProp;
759
TProcessingState *ps;
765
const mxArray *input;
770
const mxArray *denom;
771
const mxArray *nonzero;
772
#endif /* VALIDATE_LSUM */
774
const mxArray *x, *y;
781
int base_size, base_size2;
782
int base_blocks, side_blocks;
785
reportMessage("Initializing normxcorr_hw instance");
788
reportError("You should accept a single result from initialization call");
793
reportError("Only a single calculation process is supported at the moment");
797
// Initialising, for now a single client is supported only
799
idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);
801
reportError("Initialization is failed");
806
// Detecting cuda devices
808
cudaGetDeviceCount(&deviceCount);
810
cudaGetDeviceProperties(&deviceProp, 0);
811
if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {
813
} else { // Hardware capabilities are bellow 1.3
816
} else { // No cuda device, using software
822
pstate = pstateInit();
824
mxDestroyArray(idMatrix);
825
reportError("State structure initialization is failed");
833
idPtr = (int32_t*)mxGetData(idMatrix);
838
mexAtExit(selfClean);
843
reportError("Normxcorr_hw should be initialized first");
848
idMatrix = (mxArray*)prhs[0];
849
if ((mxGetClassID(idMatrix) != mxINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {
850
reportError("Invalid parameter is supplied in place of process identificator");
854
idPtr = (int32_t*)mxGetData(idMatrix);
856
reportError("Mex is not able to obtain process identificator");
862
reportError("Invalid process identificator is supplied");
867
reportError("The interface is not initialized");
876
reportMessage("Cleaning normxcorr_hw instance");
887
action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));
889
// reportMessage("Executing normxcorr_hw action: %u", action);
893
case ACTION_COMPUTE_FRAGMENT:
894
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
895
plhs[0] = fftCompute(ps, icp, prhs[3]);
896
//fftGetCorrections(TProcessingState *ps, mxArray *result)
898
case ACTION_GET_CORRECTIONS:
899
plhs[0] = fftGetCorrections(ps);
901
#endif /* VALIDATE_PEAK */
903
case ACTION_COMPUTE_BASE_FRAGMENT:
904
if ((nrhs != 4)&&(nrhs != 7)) {
905
reportError("ComputeBaseFragment action expects 2 arguments, but %i is passed", nrhs - 2);
909
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
910
if (icp >= ps->ncp) {
911
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
917
if (mxGetNumberOfDimensions(base) != 2) {
918
reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
922
if (mxGetClassID(base) != mxUINT8_CLASS) {
923
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
928
iprop = ps->fft_size;
934
(mxGetNumberOfDimensions(lsum) != 2)||
935
(mxGetNumberOfDimensions(denom) != 2)||
936
(mxGetClassID(lsum) != mxSINGLE_CLASS)||
937
(mxGetClassID(denom) != mxSINGLE_CLASS)||
938
(mxGetClassID(nonzero) != mxUINT16_CLASS)||
939
(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
940
(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)
943
reportError("Invalid properties for base initialization are specified");
952
fftUploadBaseData(ps, icp, base);
953
local_sum_validate(ps, icp, lsum, denom);
955
#endif /* VALIDATE_LSUM */
958
reportError("Compute action expects 1 argument, but %i is passed", nrhs - 2);
964
if (mxGetClassID(input) != mxUINT8_CLASS) {
965
reportError("Invalid type of image data, should be 8bit integers");
969
for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
970
err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);
975
case ACTION_COMPUTE_BASE:
977
reportError("ComputeBase action expects 1 argument, but %i is passed", nrhs - 2);
981
icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
982
if (icp >= ps->ncp) {
983
reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
989
if (mxGetNumberOfDimensions(base) != 2) {
990
reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
994
if (mxGetClassID(base) != mxUINT8_CLASS) {
995
reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
999
width = mxGetN(base);
1000
height = mxGetM(base);
1002
size = 2 * ps->corr_size + 1;
1003
size2 = size * size;
1005
base_size = 4 * ps->corr_size + 1;
1006
base_size2 = base_size * base_size;
1008
if (width * height > ps->ncp * size2) {
1014
// if not enoguh space for caching enable anyway ?
1015
if (width * height > ps->ncp * base_size2) {
1021
ps->maxx = width - 1;
1023
ps->maxy = height - 1;
1027
for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
1028
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), base);
1032
if ((ps->base_mode)&&(!ps->mode)) {
1033
// printf("%ux%u\n", width, height);
1035
// Correcting difference of area size between base and data images
1036
ps->minx += ps->corr_size;
1037
ps->miny += ps->corr_size;
1038
ps->maxx -= ps->corr_size;
1039
ps->maxy -= ps->corr_size;
1041
width = ceil(ps->maxx) - floor(ps->minx);
1042
height = ceil(ps->maxy) - floor(ps->miny);
1044
// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);
1045
if (width * height < ps->ncp * size2) {
1051
reportMessage("Running in the image mode");
1053
reportMessage("Running in the fragment mode");
1056
case ACTION_SET_BASE_POINTS:
1058
reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
1065
if ( (mxGetClassID(x) != mxSINGLE_CLASS)||
1066
(mxGetClassID(y) != mxSINGLE_CLASS)||
1067
(mxGetN(x)*mxGetM(x) != ps->ncp)||
1068
(mxGetN(y)*mxGetM(y) != ps->ncp)
1070
reportError("Invalid control points are specified");
1074
memcpy(ps->points, mxGetData(x), ps->ncp * sizeof(float));
1075
memcpy(ps->points + ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));
1077
case ACTION_SET_POINTS:
1079
reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
1086
if ( (mxGetClassID(x) != mxSINGLE_CLASS)||
1087
(mxGetClassID(y) != mxSINGLE_CLASS)||
1088
(mxGetN(x)*mxGetM(x) != ps->ncp)||
1089
(mxGetN(y)*mxGetM(y) != ps->ncp)
1091
reportError("Invalid control points are specified");
1095
memcpy(ps->points + 2 * ps->ncp_alloc_size, mxGetData(x), ps->ncp * sizeof(float));
1096
memcpy(ps->points + 3 * ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));
1100
case ACTION_GET_POINTS:
1102
reportError("GetPoints action do not expect any arguments");
1106
reportError("GetPoints action returns a single matrix");
1110
plhs[0] = fftGetPoints(ps);
1114
reportError("SETUP action expects 'ncp', 'corrsize', 'precision', and 'optimization level' parameters");
1120
ps->ncp = (int)mxGetScalar(prhs[2]);
1122
ps->precision = (int)mxGetScalar(prhs[4]);
1124
optimize = (int)mxGetScalar(prhs[5]);
1126
iprop = (int)mxGetScalar(prhs[3]);
1127
ps->corr_size = iprop;
1128
ps->subimage_size = ps->corr_size * 4 + 1;
1129
ps->fft_size = 6 * iprop + 1;
1132
ps->fft_real_size = next_power(ps->fft_size);
1134
ps->fft_real_size = ps->fft_size;
1137
ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
1138
ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
1140
// ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);
1141
ps->fft_alloc_size = calc_alloc(ps->fft_real_size * ps->fft_real_size, BLOCK_SIZE_1D);
1143
ps->lsum_size = ps->corr_size * 2 + 1;
1144
ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
1146
ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
1147
ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
1148
ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);
1153
idMatrix = mxCreateNumericMatrix(1, 1, mxINT64_CLASS, mxREAL);
1155
errPtr = (int64_t*)mxGetData(idMatrix);
1159
reportError("Initialization of result matrix is failed");
1164
side_blocks = calc_blocks(2 * ps->corr_size + 1, SIDE_BLOCK_SIZE);
1165
if ((side_blocks&(side_blocks-1))) {
1166
ps->side_blocks_power = -1;
1168
ps->side_blocks_power = debruijn[((uint32_t)side_blocks * 0x077CB531) >> 27];
1171
base_blocks = calc_blocks(4 * ps->corr_size + 1, BLOCK_SIZE_1D);
1172
if ((base_blocks&(base_blocks-1))) {
1173
ps->base_blocks_power = -1;
1175
ps->base_blocks_power = debruijn[((uint32_t)base_blocks * 0x077CB531) >> 27];
1178
case ACTION_PREPARE:
1182
reportError("Unknown request %i", action);