98
108
return ERROR_CUDA_MALLOC;
101
cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, ps->fft_alloc_size * sizeof(cufftComplex));
113
ps->fft_alloc_size * sizeof(cufftComplex), /* FFT multiplication */
114
2 * CP_BLOCK * ps->side_alloc_size * sizeof(int) /* Sum, Std computations */
117
cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, size);
103
119
reportError("Device memory allocation of %u*cufftComplex bytes for cuda_data_buffer is failed", ps->fft_alloc_size);
105
121
return ERROR_CUDA_MALLOC;
108
cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t));
110
reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, ps->fft_alloc_size);
124
cuda_err = cudaHostAlloc((void**)&ps->data_x, ps->ncp * sizeof(float), 0);
125
if (!cuda_err) cuda_err = cudaHostAlloc((void**)&ps->data_y, ps->ncp * sizeof(float), 0);
127
reportError("Host memory allocation of 2*%u*float bytes for control points is failed", ps->ncp);
129
return ERROR_CUDA_MALLOC;
132
cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * CP_BLOCK * sizeof(float));
134
reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);
136
return ERROR_CUDA_MALLOC;
139
cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * side_alloc_size2 * sizeof(uint8_t));
141
reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);
143
return ERROR_CUDA_MALLOC;
146
cuda_err = cudaHostAlloc((void**)&ps->input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
148
reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", ps->ncp, ps->fft_alloc_size);
112
150
return ERROR_CUDA_MALLOC;
279
328
cudaRealPtr = ps->cuda_result_buffer;
280
329
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);
282
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
283
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
285
vecCompute<<<output_grid_dim, output_block_dim>>>(
287
cudaRealPtr, 1./(size2 * (N2 - 1)),
288
cudaLSum, sum / (N2 * (N2 - 1)),
293
res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
332
cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;
334
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
335
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
337
vecCompute<<<output_grid_dim, output_block_dim>>>(
339
cudaRealPtr, 1./(size2 * (N2 - 1)),
340
cudaLSum, sum / (N2 * (N2 - 1)),
346
int size = ps->fft_size;
347
int size2 = size * size;
348
int alloc_size = ps->fft_alloc_size;
350
int half_size = ps->corr_size;
351
int N = 2 * half_size + 1;
354
dim3 output_block_dim(size, 1, 1);
355
dim3 output_grid_dim(size, 1, 1);
357
cufftReal *cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;
358
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
359
float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;
360
float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;
362
vecCompute<<<output_grid_dim, output_block_dim>>>(
364
cudaRealPtr, 1./(size2 * (N2 - 1)),
365
cudaLSum, sum / (N2 * (N2 - 1)),
373
static inline mxArray *fftCompute(TProcessingState *ps, int icp) {
374
int size = ps->fft_size;
375
int size2 = size * size;
376
int alloc_size = ps->fft_alloc_size;
377
float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;
379
// fftComputeFragment(ps, icp, data, sum, denom);
381
mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
382
float *ar = (float*)mxGetPr(res);
296
384
cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);
389
int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
390
int width = mxGetN(image);
391
int height = mxGetM(image);
393
int half_size = ps->corr_size;
394
int size = 2 * half_size + 1;
395
int size2 = size * size;
397
int fft_size = ps->fft_size;
398
int fft_size2 = fft_size * fft_size;
399
int alloc_size = ps->fft_alloc_size;
400
int side_alloc = ps->side_alloc_size;
401
int side_alloc2 = side_alloc * side_alloc;
403
uint8_t *fullimg = ((uint8_t*)mxGetData(image));
404
uint8_t *img = ps->input_buffer;
406
cufftComplex *cudaDataPtr = (cufftComplex*)ps->cuda_data_buffer;
407
cufftReal *cudaRealPtr = ps->cuda_temp_buffer;
409
dim3 input_block_dim(size, 1, 1);
410
dim3 input_grid_dim(size, 1, 1);
411
dim3 block_dim(fft_size / 2 + 1, 1, 1);
412
dim3 grid_dim(fft_size, 1, 1);
414
for (int i = 0;i < ncp;i++) {
415
float x = ps->data_x[i+icp] - 1;
416
float y = ps->data_y[i+icp] - 1;
418
int xstart = roundf(x) - half_size;
419
int ystart = roundf(y) - half_size;
421
int xend = xstart + size;
422
int yend = xstart + size;
424
if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
425
// Somehow mark we have skipped it
431
size * sizeof(uint8_t),
432
fullimg + (xstart * height + ystart),
433
height * sizeof(uint8_t),
434
size * sizeof(uint8_t),
440
cufftComplex *cudaBasePtr = ps->cuda_base_buffer + (i+icp) * alloc_size;
441
cufftReal *cudaResultPtr = ps->cuda_result_buffer + (i+icp) * alloc_size;
443
uint8_t *cudaInputPtr = ps->cuda_input_buffer + i*side_alloc2;
446
cudaInputPtr, side_alloc * sizeof(uint8_t),
447
img, size * sizeof(uint8_t),
448
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
451
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, fft_size, cudaInputPtr, side_alloc, size);
453
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);
455
vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaBasePtr, fft_size/2+1);
457
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaResultPtr);
461
int cp_blocks, side_blocks;
462
if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;
463
else cp_blocks = ncp / CP_BLOCK_SIZE;
465
if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;
466
else side_blocks = size / SIDE_BLOCK_SIZE;
469
int *stat_buf = (int*)ps->cuda_data_buffer;
471
float *sumbuf = ps->cuda_cp;// + 2*ps->ncp + icp;
472
float *stdbuf = ps->cuda_cp + CP_BLOCK;
474
dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
475
dim3 joint_grid_dim(side_blocks, cp_blocks, 1);
477
stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
479
if (ncp%BLOCK_SIZE_1D) cp_blocks = (ncp / BLOCK_SIZE_1D) + 1;
480
else cp_blocks = ncp / BLOCK_SIZE_1D;
482
stat2<<<cp_blocks, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
485
float *cp = (float*)malloc(2*CP_BLOCK*sizeof(float));
486
cudaMemcpy(cp, ps->cuda_cp, 2*CP_BLOCK*sizeof(float), cudaMemcpyDeviceToHost);
489
float *stdbuf1 = cp + CP_BLOCK;
493
dim3 output_block_dim(fft_size, 1, 1);
494
dim3 output_grid_dim(fft_size, 1, 1);
497
for (int i = 0;i < ncp;i++) {
498
float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;
499
float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;
500
cufftReal *cudaRealPtr = ps->cuda_result_buffer + (i+icp)*alloc_size;
501
float *cudaResultPtr = ps->cuda_final_buffer + (i+icp)*alloc_size;
503
vecCompute<<<output_grid_dim, output_block_dim>>>(
505
cudaRealPtr, 1./(fft_size2 * (size2 - 1)),
506
cudaLSum, sumbuf+i, 1. / (size2 * (size2 - 1)),
303
518
static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {