250
252
return DICT_ERROR_CUDA_MALLOC;
255
ps->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));
256
if (!ps->base_image) {
257
reportError("Memory allocation of %u*%u*uint8_t bytes for base_image is failed", ps->width, ps->height);
259
return DICT_ERROR_MALLOC;
369
378
cudaMemcpyHostToDevice
371
380
// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);
381
} else if (!ps->use_cache) {
382
memcpy(ps->base_image, fullimg, width * height * sizeof(uint8_t));
404
416
float *lsum_temp = (float*)ps->cuda_lsum_temp;
405
417
int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
419
uint8_t *img = ps->input_buffer;
407
420
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
408
421
cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
441
454
size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice
445
458
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
446
459
fullimg + offset, matlab_width * sizeof(uint8_t),
447
460
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
452
uint8_t *img = ps->input_buffer;
454
if (ps->matlab_mode) {
456
img + i * alloc_size,
457
size * sizeof(uint8_t),
458
fullimg + (xstart * height + ystart),
459
height * sizeof(uint8_t),
460
size * sizeof(uint8_t),
464
img + i * size2, size * sizeof(uint8_t),
465
fullimg + offset, matlab_width * sizeof(uint8_t),
466
size * sizeof(uint8_t), size,
462
467
cudaMemcpyHostToHost
466
img + i * alloc_size,
467
size * sizeof(uint8_t),
468
fullimg + (ystart * width + xstart),
469
width * sizeof(uint8_t),
470
size * sizeof(uint8_t),
477
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
478
img + i * alloc_size, size * sizeof(uint8_t),
479
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
475
cudaMemcpy3DParms copy_params = { 0 };
477
copy_params.dstPtr = make_cudaPitchedPtr(
478
cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
480
copy_params.srcPtr = make_cudaPitchedPtr(
481
img, size * sizeof(uint8_t), size, size
483
copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
484
copy_params.kind = cudaMemcpyHostToDevice;
486
cudaMemcpy3D(©_params);
490
for (int i = 0;i < ncp;i++) {
491
if (banlist[i]) continue;
484
493
if (ps->base_blocks_power < 0) {
485
494
vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(
486
495
cuda_input_buffer + i * side_alloc2, side_alloc,
664
672
size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice
669
676
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
670
677
fullimg + offset, matlab_width * sizeof(uint8_t),
671
678
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
675
682
img + i * size2, size * sizeof(uint8_t),
676
683
fullimg + offset, matlab_width * sizeof(uint8_t),
677
684
size * sizeof(uint8_t), size,
678
685
cudaMemcpyHostToHost
682
cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),
683
img + i * size2, size * sizeof(uint8_t),
684
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
693
691
if (!image_mode) {
694
692
cudaMemcpy3DParms copy_params = { 0 };
713
710
static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
714
711
static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
716
static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
713
static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp) {
717
714
int half_size = ps->corr_size;
718
715
int size = 2 * half_size + 1;
738
735
int32_t *stat_buf = (int*)ps->cuda_temp_buffer;
740
737
dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
741
stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);
742
stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
738
stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);
739
stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
744
741
// Packing input data for FFT
745
742
dim3 input_grid_dim(input_blocks, cp_blocks, 1);
747
744
if (ps->side_blocks_power < 0) {
748
vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(
745
vecPack<<<input_grid_dim, block_side_cp>>>(
749
746
cuda_input_buffer, side_alloc2, side_alloc,
750
747
cuda_data_buffer, alloc_size, fft_real_size,
751
748
size, side_blocks
754
vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(
751
vecPackFast<<<input_grid_dim, block_side_cp>>>(
755
752
cuda_input_buffer, side_alloc2, side_alloc,
756
753
cuda_data_buffer, alloc_size, fft_real_size,
757
754
size, ps->side_blocks_power
764
static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
761
static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp) {
765
762
int half_size = ps->corr_size;
766
763
int size = 2 * half_size + 1;
767
764
int size2 = size * size;
788
785
// Use real size everthere
789
786
// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);
790
// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(
787
// vecCompute<<<compute_grid_dim, block_side_cp>>>(
791
788
// cuda_final_buffer,
792
789
// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),
793
790
// ps->cuda_lsum_cache + cache_icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
799
796
int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;
800
797
dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);
802
vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(
799
vecCompute<<<compute_grid_dim, block_side_cp>>>(
803
800
cuda_final_buffer, fft_size,
804
801
cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),
805
802
ps->cuda_lsum_cache + cache_icp * alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
821
818
// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);
822
819
// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
824
find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
825
find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
821
find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
822
find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
830
static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
827
static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp) {
831
828
int fft_real_size = ps->fft_real_size;
833
830
int alloc_size = ps->fft_alloc_size;
841
838
// Performing FFT's
842
839
cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;
844
cufftSetStream(ps->cufft_r2c_plan, stream0);
845
cufftSetStream(ps->cufft_c2r_plan, stream0);
847
841
for (int i = 0;i < ncp;i++) {
848
842
if (banlist[i]) continue;
849
843
cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
852
846
int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
853
847
dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
854
vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);
848
vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);
856
850
// First in-place transform for some reason is failing, therefore we
857
851
// have one alloc_size spacing between starts (see cuda_fft_buffer set above)