13
13
// min2(a,b): smaller of the two arguments.
// NOTE(review): as a macro this evaluates each argument twice — do not pass
// expressions with side effects (e.g. min2(i++, n)).
#define min2(a,b) (((a)<(b))?(a):(b))
15
15
// calc_alloc(size,rounding): round `size` up to the next multiple of `rounding`
// (i.e. ceil(size/rounding) * rounding). Used to pad allocation sizes.
// NOTE(review): macro evaluates both arguments multiple times — avoid side effects.
#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))
16
// calc_blocks(size,rounding): ceiling division, ceil(size/rounding).
// Used below to compute CUDA grid dimensions from element counts and block sizes.
// NOTE(review): macro evaluates both arguments multiple times — avoid side effects.
#define calc_blocks(size,rounding) (((size)/(rounding)) + (((size)%(rounding))?1:0))
18
// De Bruijn lookup table for computing log2 of an exact power of two:
// for v == 1<<k, debruijn[((uint32_t)v * 0x077CB531) >> 27] == k.
// (Used later to derive side_blocks_power for the vecPackFast path;
// non-powers-of-two are detected separately via v & (v-1).)
// Repaired during review: the closing "};" was lost to extraction garbling,
// and stray line-number residue was interleaved with the initializer.
static const char debruijn[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
// File-scoped singleton holding all persistent processing state (CUDA buffers,
// cuFFT plans, cached coordinates); NULL until first initialization.
// NOTE(review): TProcessingState is declared elsewhere — fields referenced below
// include cuda_fft_buffer, cuda_lsum_buffer, cufft_r2c_plan, etc.
static TProcessingState *pstate = NULL;
19
25
static void fftFree(TProcessingState *ps) {
20
if (ps->cuda_fft_buffer) {
21
if (ps->coords) mxDestroyArray(ps->coords);
23
if (ps->banlist) free(ps->banlist);
25
cudaFree(ps->cuda_lsum_temp);
27
cudaFree(ps->cuda_lsum_buffer);
28
cudaFree(ps->cuda_denom_buffer);
29
cudaFree(ps->cuda_fft_buffer);
26
if (ps->coords) mxDestroyArray(ps->coords);
27
if (ps->banlist) free(ps->banlist);
28
if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);
30
if (ps->cuda_lsum_buffer) cudaFree(ps->cuda_lsum_buffer);
31
if (ps->cuda_denom_buffer) cudaFree(ps->cuda_denom_buffer);
32
if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);
31
cudaFree(ps->cuda_data_buffer);
32
cudaFree(ps->cuda_base_buffer);
34
cudaFree(ps->cuda_final_buffer);
35
cudaFree(ps->cuda_result_buffer);
36
cudaFree(ps->cuda_temp_buffer);
37
cudaFree(ps->cuda_input_buffer);
38
cudaFreeHost(ps->input_buffer);
40
cudaFree(ps->cuda_cp);
42
cudaFreeHost(ps->points);
34
if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);
35
if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);
37
if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);
38
if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);
39
if (ps->input_buffer) cudaFreeHost(ps->input_buffer);
41
if (ps->cuda_cp) cudaFree(ps->cuda_cp);
42
if (ps->points) cudaFreeHost(ps->points);
45
44
// DS: Source of bug, that occasionaly can corrupt something ...
46
45
if (ps->cudpp_initialized) {
472
451
cudaMemcpyHostToHost
476
cufftComplex *cudaBasePtr = ps->cuda_fft_buffer + (i+icp) * alloc_size;
477
cufftReal *cudaResultPtr = ps->cuda_result_buffer + (i+icp) * alloc_size;
479
uint8_t *cudaInputPtr = ps->cuda_input_buffer + i*side_alloc2;
482
cudaInputPtr, side_alloc * sizeof(uint8_t),
455
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
483
456
img, size * sizeof(uint8_t),
484
457
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
487
vecPack<<<input_grid_dim, input_block_dim>>>(cudaInputPtr, side_alloc, cudaRealPtr, fft_size, size);
489
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);
491
vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaBasePtr, fft_size/2+1);
493
cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaResultPtr);
497
int cp_blocks1, cp_blocks, side_blocks;
498
if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;
499
else cp_blocks = ncp / CP_BLOCK_SIZE;
501
if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;
502
else side_blocks = size / SIDE_BLOCK_SIZE;
461
dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
462
dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
464
//int input_blocks = calc_blocks(size2, BLOCK_SIZE_2D);
467
int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
468
int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
469
int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
470
int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
471
int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
474
// Computing sum and std
504
475
int32_t *stat_buf = (int*)ps->cuda_temp_buffer;
506
477
float *sumbuf = ps->cuda_cp + icp;
507
478
float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;
509
dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
510
dim3 joint_grid_dim(side_blocks, cp_blocks, 1);
512
stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
514
if (ncp%BLOCK_SIZE_1D) cp_blocks1 = (ncp / BLOCK_SIZE_1D) + 1;
515
else cp_blocks1 = ncp / BLOCK_SIZE_1D;
480
dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
481
stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
517
482
stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
520
dim3 output_block_dim(fft_size, 1, 1);
521
dim3 output_grid_dim(fft_size, 1, 1);
523
for (int i = 0;i < ncp;i++) {
524
float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;
525
float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;
526
cufftReal *cudaRealPtr = ps->cuda_result_buffer + (i+icp)*alloc_size;
527
float *cudaResultPtr = ps->cuda_final_buffer + (i+icp)*alloc_size;
529
vecCompute<<<output_grid_dim, output_block_dim>>>(
531
cudaRealPtr, 1./(fft_size2 * (size2 - 1)),
532
cudaLSum, sumbuf+i, 1. / (size2 * (size2 - 1)),
484
// Packing input data for FFT
485
dim3 input_grid_dim(input_blocks, cp_blocks, 1);
487
char side_blocks_power;
488
if ((side_blocks&(side_blocks-1))) {
489
side_blocks_power = -1;
491
side_blocks_power = debruijn[((uint32_t)side_blocks * 0x077CB531) >> 27];
493
// printf("power %i\n", side_blocks_power);
495
// cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
496
if (side_blocks_power < 0) {
497
vecPack<<<input_grid_dim, block_side_cp>>>(
498
cuda_input_buffer, side_alloc2, side_alloc,
499
cuda_data_buffer, alloc_size, fft_size,
503
vecPackFast<<<input_grid_dim, block_side_cp>>>(
504
cuda_input_buffer, side_alloc2, side_alloc,
505
cuda_data_buffer, alloc_size, fft_size,
506
size, side_blocks_power
511
cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;
513
for (int i = 0;i < ncp;i++) {
514
if (banlist[i]) continue;
515
cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
518
int complex_blocks = calc_blocks(fft_size * (fft_size / 2 + 1), SIDE_BLOCK_SIZE);
519
dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
520
vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_size/2+1);
523
// First in-place transform for some reason is failing, therefore we
524
// have one alloc_size spacing between starts
525
cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
526
for (int i = 0;i < ncp;i++) {
527
if (banlist[i]) continue;
528
cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);
531
float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;
533
int fft2_blocks = calc_blocks(fft_size*fft_size, SIDE_BLOCK_SIZE);
534
dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);
535
vecCompute<<<compute_grid_dim, block_side_cp>>>(
537
cuda_result_buffer, 1./(fft_size2 * (size2 - 1)),
538
ps->cuda_lsum_buffer + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
539
ps->cuda_denom_buffer + icp*alloc_size, stdbuf,
543
// Looking for maximum
539
544
float *xbuf = sumbuf;
540
545
float *ybuf = stdbuf;
542
547
int32_t *posbuf = (int*)ps->cuda_temp_buffer;
543
548
float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);
545
int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);
547
dim3 result_block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
548
550
dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
549
find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);
550
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
551
find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
552
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);