6
6
static void fftFree(TProcessingState *ps) {
7
if (ps->banlist) free(ps->banlist);
7
if (ps->cuda_image) cudaFree(ps->cuda_image);
8
if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);
8
10
if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);
12
if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);
10
13
if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);
11
if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);
12
14
if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);
14
17
if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);
15
18
if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);
17
if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);
20
if (ps->input_buffer) cudaFreeHost(ps->input_buffer);
18
21
if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);
19
if (ps->input_buffer) cudaFreeHost(ps->input_buffer);
21
23
if (ps->cuda_points) cudaFree(ps->cuda_points);
22
24
if (ps->points) cudaFreeHost(ps->points);
26
if (ps->banlist) free(ps->banlist);
28
if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);
24
30
if (ps->cudpp_initialized) {
25
31
cudppDestroyPlan(ps->cudpp_plan);
28
34
if (ps->fft_initialized) {
29
cufftDestroy(ps->cufft_r2c_plan);
30
35
cufftDestroy(ps->cufft_c2r_plan);
36
cufftDestroy(ps->cufft_r2c_plan);
33
39
if (ps->image_buf) {
43
memset(ps, 0, ((char*)&(ps->matlab_mode)) - ((char*)ps));
37
46
#ifdef DICT_HW_MEASURE_TIMINGS
38
47
memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));
39
#else /* DICT_HW_MEASURE_TIMINGS */
40
49
memset(ps, 0, sizeof(TProcessingState));
41
#endif /* DICT_HW_MEASURE_TIMINGS */
45
static int fftInit(TProcessingState *ps) {
54
static int fftInit(TProcessingState *ps, size_t device_memory) {
46
55
CUDPPConfiguration cudpp_config;
48
57
CUDPPResult cudpp_err;
158
161
cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
160
cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
163
cache_memory = size + // cuda_temp_buffer
164
2 * ps->ncp_alloc_size * sizeof(float) + // cuda_points
165
CP_BLOCK * side_alloc_size2 * sizeof(uint8_t) + // cuda_input_buffer
166
2 * CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal) + // cuda_base_buffer, cuda_data_buffer
167
4 * lsum_alloc_size2 * sizeof(float) + // cuda_lsum_temp
168
ps->ncp * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)); // caches
171
printf("Temp buffer : %i\n", size/1024/1024);
172
printf("Points : %i\n", 2 * ps->ncp_alloc_size * sizeof(float) / 1024 / 1024);
173
printf("Input buffer: %i\n", CP_BLOCK * side_alloc_size2 * sizeof(uint8_t) / 1024 / 1024);
174
printf("Data buffer : 2 x %i\n", CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal) / 1024 / 1024);
175
printf("Lsum temp : %i\n", 4 * lsum_alloc_size2 * sizeof(float) / 1024 / 1024);
176
printf("Cache : %i\n", ps->ncp * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)) / 1024 / 1024);
177
printf("No Cache : %i\n", CP_BLOCK * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)) / 1024 / 1024);
180
// Counting necessary memory, here is cache memory, 64MB is considered for other needs (base and current images)
181
if ((ps->use_cache)&&((cache_memory + 67108864) > device_memory)) ps->use_cache = 0;
182
// ps->use_cache = 0;
184
ncp_cache = ps->use_cache?ps->ncp:CP_BLOCK;
186
cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ncp_cache * ps->fft_alloc_size * sizeof(cufftComplex));
188
// Try to disable caching
192
ncp_cache = CP_BLOCK;
193
cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ncp_cache * ps->fft_alloc_size * sizeof(cufftComplex));
196
reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);
198
return DICT_ERROR_CUDA_MALLOC;
201
// the data could be stored across the calls
202
// cudaMemset(ps->cuda_fft_cache, 0, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));
204
cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ncp_cache * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
162
206
reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);
164
208
return DICT_ERROR_CUDA_MALLOC;
167
cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
211
cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ncp_cache * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
169
213
reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);
232
int fftSetupDimensions(TProcessingState *ps) {
234
int image_size = ps->width * ps->height;
236
if (ps->cuda_image) cudaFree(ps->cuda_image);
237
if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);
239
cuda_err = cudaMalloc((void**)&ps->cuda_base_image, image_size * sizeof(uint8_t));
241
reportError("Device memory allocation of %u*%u*uint8_t bytes for cuda_base_image is failed", ps->width, ps->height);
243
return DICT_ERROR_CUDA_MALLOC;
246
cuda_err = cudaMalloc((void**)&ps->cuda_image, image_size * sizeof(uint8_t));
248
reportError("Device memory allocation of %u*%u*uint8_t bytes for cuda_image is failed", ps->width, ps->height);
250
return DICT_ERROR_CUDA_MALLOC;
188
257
void pstateFree(TProcessingState *ps) {
273
static inline int fftLoadBaseImage(TProcessingState *ps, const unsigned char *fullimg) {
275
int width = ps->width;
276
int height = ps->height;
280
int real_width, real_height;
282
float minx, miny, maxx, maxy;
284
int precision = ps->precision;
286
int half_size = 2 * ps->corr_size;
287
int size = 2 * half_size + 1;
290
int ncp_alloc = ps->ncp_alloc_size;
292
uint8_t *banlist = ps->banlist;
294
float *data_x = ps->points;
295
float *data_y = data_x + ncp_alloc;
297
float *frac_x = ps->points + 4 * ncp_alloc;
298
float *frac_y = frac_x + ncp_alloc;
300
unsigned char *cuda_base_image = ps->cuda_base_image;
308
for (i = 0;i < ncp;i++) {
309
float x = data_x[i] - 1;
310
float y = data_y[i] - 1;
312
frac_x[i] = x - round(x * precision) / precision;
313
frac_y[i] = y - round(y * precision) / precision;
315
int xstart = roundf(x) - half_size;
316
int ystart = roundf(y) - half_size;
318
int xend = xstart + size;
319
int yend = ystart + size;
321
if ((xstart < 0)||(ystart < 0)||(xend > (width - 1))||(yend > (height - 1))) {
325
if (xstart < minx) minx = xstart;
326
if (ystart < miny) miny = ystart;
327
if (xend > maxx) maxx = xend;
328
if (yend > maxy) maxy = yend;
333
ps->base_minx = minx;
334
ps->base_maxx = maxx;
335
ps->base_miny = miny;
336
ps->base_maxy = maxy;
338
// Correcting difference of area size between base and data images
339
ps->minx = minx + ps->corr_size;
340
ps->maxx = maxx - ps->corr_size;
341
ps->miny = miny + ps->corr_size;
342
ps->maxy = maxy - ps->corr_size;
344
if (ps->matlab_mode) {
345
offset = minx * height + miny;
346
real_width = ceil(maxy) - floor(miny) + 1;
347
real_height = ceil(maxx) - floor(minx) + 1;
349
matlab_width = height;
351
offset = miny * width + minx;
352
real_width = ceil(maxx) - floor(minx) + 1;
353
real_height = ceil(maxy) - floor(miny) + 1;
355
matlab_width = width;
358
if ((ps->use_cache)||(real_width * real_height < ps->ncp * size * size)) {
366
cuda_base_image + offset, matlab_width * sizeof(uint8_t),
367
fullimg + offset, matlab_width * sizeof(uint8_t),
368
real_width * sizeof(uint8_t), real_height,
369
cudaMemcpyHostToDevice
371
// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);
204
378
static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
205
382
int width = ps->width;
206
383
int height = ps->height;
208
int check_mode = ((ps->base_mode)&&(!ps->mode));
209
float minx, miny, maxx, maxy;
211
int precision = ps->precision;
385
int image_mode = ps->base_mode;
213
387
int half_size = 2 * ps->corr_size;
214
388
int size = 2 * half_size + 1;
225
399
float *data_x = ps->points + icp;
226
400
float *data_y = data_x + ncp_alloc;
228
float *frac_x = ps->points + 4 * ncp_alloc + icp;
229
float *frac_y = frac_x + ncp_alloc;
231
uint8_t *img = ps->input_buffer;
402
unsigned char *cuda_base_image = ps->cuda_base_image;
233
404
float *lsum_temp = (float*)ps->cuda_lsum_temp;
234
405
int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
243
407
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
244
408
cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
245
cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;
246
float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
247
float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;
410
int cache_icp = ps->use_cache?icp:0;
411
cufftComplex *cache = ps->cuda_fft_cache + cache_icp * alloc_size;
412
float *lsum_cache = ps->cuda_lsum_cache + cache_icp * alloc_size;
413
float *denom_cache = ps->cuda_denom_cache + cache_icp * alloc_size;
249
415
int blocks = calc_blocks(size, BLOCK_SIZE_1D);
250
416
int base_blocks = blocks * blocks * BLOCK_SIZE_1D;
252
418
int lsum_size = ps->lsum_size;
253
419
int lsum_alloc = ps->lsum_alloc_size;
256
423
for (int i = 0;i < ncp;i++) {
257
float x = data_x[i] - 1;
258
float y = data_y[i] - 1;
260
frac_x[i] = x - round(x * precision) / precision;
261
frac_y[i] = y - round(y * precision) / precision;
263
int xstart = roundf(x) - half_size;
264
int ystart = roundf(y) - half_size;
266
int xend = xstart + size;
267
int yend = xstart + size;
269
if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
274
if (xstart < minx) minx = xstart;
275
if (ystart < miny) miny = ystart;
276
if (xend > maxx) maxx = xend;
277
if (yend > maxy) maxy = yend;
424
if (banlist[i]) continue;
426
xstart = roundf(data_x[i]) - half_size - 1;
427
ystart = roundf(data_y[i]) - half_size - 1;
429
if (ps->matlab_mode) {
430
offset = xstart * height + ystart;
431
matlab_width = height;
433
offset = ystart * width + xstart;
434
matlab_width = width;
439
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
440
cuda_base_image + offset, matlab_width * sizeof(uint8_t),
441
size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice
445
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
446
fullimg + offset, matlab_width * sizeof(uint8_t),
447
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
452
uint8_t *img = ps->input_buffer;
280
454
if (ps->matlab_mode) {
304
478
img + i * alloc_size, size * sizeof(uint8_t),
305
479
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
312
484
if (ps->base_blocks_power < 0) {
313
485
vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(
314
cuda_input_buffer + j * side_alloc2, side_alloc,
315
cuda_base_buffer + j*alloc_size, fft_real_size,
486
cuda_input_buffer + i * side_alloc2, side_alloc,
487
cuda_base_buffer + i * alloc_size, fft_real_size,
316
488
lsum_temp + lsum_size * (lsum_alloc + 1),
317
489
lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
332
504
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
334
lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
506
lsum_cache + i * alloc_size, denom_cache + i * alloc_size,
335
507
lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
336
lsum_temp, lsum_temp + lsum_step, NULL);
338
// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);
342
for (int j = 0;j < ncp;j++) {
343
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
508
lsum_temp, lsum_temp + lsum_step);
510
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + i * alloc_size, cache + i * alloc_size);
516
static inline int fftLoadImage(TProcessingState *ps, const unsigned char *fullimg) {
518
int ncp_alloc = ps->ncp_alloc_size;
520
int width = ps->width;
521
int height = ps->height;
523
int half_size = ps->corr_size;
524
int size = 2 * half_size + 1;
528
int real_width, real_height;
529
int xstart, ystart, xend, yend;
531
float minx = width - 1;
533
float miny = height - 1;
536
uint8_t *banlist = ps->banlist;
537
unsigned char *cuda_image = ps->cuda_image;
539
float *data_x, *data_y;
545
data_x = ps->points + 2 * ncp_alloc;
546
data_y = data_x + ncp_alloc;
549
for (int i = 0;i < ncp;i++) {
550
float x = data_x[i] - 1;
551
float y = data_y[i] - 1;
553
xstart = roundf(x) - half_size;
554
ystart = roundf(y) - half_size;
556
xend = xstart + size;
557
yend = ystart + size;
559
if (xstart < minx) minx = xstart;
560
if (ystart < miny) miny = ystart;
561
if (xend > maxx) maxx = xend;
562
if (yend > maxy) maxy = yend;
564
if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend > (width-1))||(yend > (height-1))) {
574
if (ps->matlab_mode) {
575
offset = minx * height + miny;
576
real_width = ceil(maxy) - floor(miny) + 1;
577
real_height = ceil(maxx) - floor(minx) + 1;
579
matlab_width = height;
581
offset = miny * width + minx;
582
real_width = ceil(maxx) - floor(minx) + 1;
583
real_height = ceil(maxy) - floor(miny) + 1;
585
matlab_width = width;
588
if (real_width * real_height < ps->ncp * size * size) {
597
cuda_image + offset, matlab_width * sizeof(uint8_t),
598
fullimg + offset, matlab_width * sizeof(uint8_t),
599
real_width * sizeof(uint8_t), real_height,
600
cudaMemcpyHostToDevice
602
// cudaMemcpy(cuda_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);
358
static inline int fftCopyFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
608
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
609
int image_mode = ps->mode;
359
614
int width = ps->width;
360
615
int height = ps->height;
383
644
int xstart = roundf(x) - half_size;
384
645
int ystart = roundf(y) - half_size;
386
int xend = xstart + size;
387
int yend = ystart + size;
389
if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
648
//printf("Banned: %i\n", icp + i);
394
652
if (ps->matlab_mode) {
396
img + i * size2,//alloc_size,
397
size * sizeof(uint8_t),
398
fullimg + (xstart * height + ystart),
399
height * sizeof(uint8_t),
400
size * sizeof(uint8_t),
406
img + i * size2,//alloc_size,
407
size * sizeof(uint8_t),
408
fullimg + (ystart * width + xstart),
409
width * sizeof(uint8_t),
410
size * sizeof(uint8_t),
419
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {
421
int half_size = ps->corr_size;
422
int size = 2 * half_size + 1;
424
int side_alloc = ps->side_alloc_size;
426
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
427
uint8_t *img = ps->input_buffer;
430
for (i = 0;i < ncp;i++) {
432
cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),
433
img + i * size * size, size * sizeof(uint8_t),
434
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
653
offset = xstart * height + ystart;
654
matlab_width = height;
656
offset = ystart * width + xstart;
657
matlab_width = width;
662
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
663
cuda_image + offset, matlab_width * sizeof(uint8_t),
664
size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice
669
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
670
fullimg + offset, matlab_width * sizeof(uint8_t),
671
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
675
img + i * size2, size * sizeof(uint8_t),
676
fullimg + offset, matlab_width * sizeof(uint8_t),
677
size * sizeof(uint8_t), size,
439
cudaMemcpy3DParms copy_params = { 0 };
441
copy_params.dstPtr = make_cudaPitchedPtr(
442
cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
444
copy_params.srcPtr = make_cudaPitchedPtr(
445
img, size * sizeof(uint8_t), size, size
447
copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
448
copy_params.kind = cudaMemcpyHostToDevice;
450
cudaMemcpy3DAsync(©_params, stream0);
682
cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),
683
img + i * size2, size * sizeof(uint8_t),
684
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
694
cudaMemcpy3DParms copy_params = { 0 };
696
copy_params.dstPtr = make_cudaPitchedPtr(
697
cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
699
copy_params.srcPtr = make_cudaPitchedPtr(
700
img, size * sizeof(uint8_t), size, size
702
copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
703
copy_params.kind = cudaMemcpyHostToDevice;
705
cudaMemcpy3D(©_params);
456
713
static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
457
714
static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
526
783
float *sumbuf = ps->cuda_points + icp;
527
784
float *stdbuf = ps->cuda_points + ncp_alloc + icp;
786
int cache_icp = ps->use_cache?icp:0;
529
788
// Use real size everthere
530
789
// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);
531
790
// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(
532
791
// cuda_final_buffer,
533
792
// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),
534
// ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
535
// ps->cuda_denom_cache + icp*alloc_size, stdbuf,
793
// ps->cuda_lsum_cache + cache_icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
794
// ps->cuda_denom_cache + cache_icp*alloc_size, stdbuf,
543
802
vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(
544
803
cuda_final_buffer, fft_size,
545
804
cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),
546
ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
547
ps->cuda_denom_cache + icp*alloc_size, stdbuf,
805
ps->cuda_lsum_cache + cache_icp * alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
806
ps->cuda_denom_cache + cache_icp * alloc_size, stdbuf,
548
807
alloc_size, fft_blocks
592
852
int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
593
853
dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
594
vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);
854
vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);
596
856
// First in-place transform for some reason is failing, therefore we
597
857
// have one alloc_size spacing between starts (see cuda_fft_buffer set above)