13
12
if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);
14
13
if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);
15
14
if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);
18
16
if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);
19
17
if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);
37
34
cufftDestroy(ps->cufft_r2c_plan);
44
37
memset(ps, 0, ((char*)&(ps->matlab_mode)) - ((char*)ps));
47
#ifdef DICT_HW_MEASURE_TIMINGS
48
memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));
50
memset(ps, 0, sizeof(TProcessingState));
55
40
static int fftInit(TProcessingState *ps, size_t device_memory) {
117
102
memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));
119
cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);
121
reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);
123
return DICT_ERROR_CUDA_MALLOC;
126
104
cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));
128
106
reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);
180
158
// Counting necessary memory, here is cache memory, 64MB is considered for other needs (base and current images)
181
159
if ((ps->use_cache)&&((cache_memory + CUDA_EXTRA_MEMORY) > device_memory)) ps->use_cache = 0;
183
162
ncp_cache = ps->use_cache?ps->ncp:CP_BLOCK;
250
230
return DICT_ERROR_CUDA_MALLOC;
253
ps->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));
254
if (!ps->base_image) {
255
reportError("Memory allocation of %u*%u*uint8_t bytes for base_image is failed", ps->width, ps->height);
257
return DICT_ERROR_MALLOC;
294
268
int size = 2 * half_size + 1;
296
270
int ncp = ps->ncp;
297
int ncp_alloc = ps->ncp_alloc_size;
299
272
uint8_t *banlist = ps->banlist;
274
int points_alloc = ps->points_alloc_size;
301
276
float *data_x = ps->points;
302
float *data_y = data_x + ncp_alloc;
277
float *data_y = data_x + points_alloc;
304
float *frac_x = ps->points + 4 * ncp_alloc;
305
float *frac_y = frac_x + ncp_alloc;
279
float *frac_x = ps->points + 4 * points_alloc;
280
float *frac_y = frac_x + points_alloc;
307
282
unsigned char *cuda_base_image = ps->cuda_base_image;
376
351
cudaMemcpyHostToDevice
378
353
// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);
379
} else if (!ps->use_cache) {
380
memcpy(ps->base_image, fullimg, width * height * sizeof(uint8_t));
400
373
int fft_real_size = ps->fft_real_size;
402
int ncp_alloc = ps->ncp_alloc_size;
403
375
int alloc_size = ps->fft_alloc_size;
404
376
int side_alloc = ps->side_alloc_size;
405
377
int side_alloc2 = side_alloc * side_alloc;
407
379
uint8_t *banlist = ps->banlist + icp;
381
int points_alloc = ps->points_alloc_size;
409
382
float *data_x = ps->points + icp;
410
float *data_y = data_x + ncp_alloc;
383
float *data_y = data_x + points_alloc;
412
385
unsigned char *cuda_base_image = ps->cuda_base_image;
445
418
matlab_width = width;
423
printf("lbf0: %i %i - %i\n", xstart, ystart, fullimg[offset + size*matlab_width/2 + size/2]);
448
427
if (image_mode) {
450
429
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
517
496
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + i * alloc_size, cache + i * alloc_size);
523
503
static inline int fftLoadImage(TProcessingState *ps, const unsigned char *fullimg) {
524
504
int ncp = ps->ncp;
525
int ncp_alloc = ps->ncp_alloc_size;
527
506
int width = ps->width;
528
507
int height = ps->height;
546
525
float *data_x, *data_y;
527
int points_alloc = ps->points_alloc_size;
548
529
if (ps->stored) {
549
530
data_x = ps->res_x;
550
531
data_y = ps->res_y;
552
data_x = ps->points + 2 * ncp_alloc;
553
data_y = data_x + ncp_alloc;
533
data_x = ps->points + 2 * points_alloc;
534
data_y = data_x + points_alloc;
556
537
for (int i = 0;i < ncp;i++) {
623
604
int half_size = ps->corr_size;
624
605
int size = 2 * half_size + 1;
625
606
int size2 = size * size;
626
int ncp_alloc = ps->ncp_alloc_size;
628
608
int side_alloc = ps->side_alloc_size;
629
609
int side_alloc2 = side_alloc * side_alloc;
631
611
unsigned char *cuda_image = ps->cuda_image;
632
612
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
614
int points_alloc = ps->points_alloc_size;
634
615
float *data_x, *data_y;
635
616
if (ps->stored) {
636
617
data_x = ps->res_x + icp;
637
618
data_y = ps->res_y + icp;
639
data_x = ps->points + 2 * ncp_alloc + icp;
640
data_y = data_x + ncp_alloc;
620
data_x = ps->points + 2 * points_alloc + icp;
621
data_y = data_x + points_alloc;
643
624
uint8_t *img = ps->input_buffer;
872
853
// Therefore, it is better to use matlab mode until the computation
873
854
// code is changed (this implementation is just done to accept
874
855
// images from user apps without transposing)
856
int points_alloc = ps->points_alloc_size;
875
857
if (ps->matlab_mode) {
876
move_x = ps->points + 6 * ncp_alloc;
877
move_y = move_x + ncp_alloc;
858
move_x = ps->points + 6 * points_alloc;
859
move_y = move_x + points_alloc;
880
move_x, ncp_alloc * sizeof(float),
862
move_x, points_alloc * sizeof(float),
881
863
ps->cuda_points, ncp_alloc * sizeof(float),
882
864
ps->ncp * sizeof(float), 2,
883
865
cudaMemcpyDeviceToHost
868
printf("%f %f\n", move_x[0], move_x[1]);
886
move_y = ps->points + 6 * ncp_alloc;
887
move_x = move_y + ncp_alloc;
870
move_y = ps->points + 6 * points_alloc;
871
move_x = move_y + points_alloc;
890
move_y, ncp_alloc * sizeof(float),
874
move_y, points_alloc * sizeof(float),
891
875
ps->cuda_points, ncp_alloc * sizeof(float),
892
876
ps->ncp * sizeof(float), 2,
893
877
cudaMemcpyDeviceToHost
918
float *frac_x = ps->points + 4 * ncp_alloc;
919
float *frac_y = frac_x + ncp_alloc;
902
float *frac_x = ps->points + 4 * points_alloc;
903
float *frac_y = frac_x + points_alloc;
920
904
uint8_t *banlist = ps->banlist;