98
107
print_timing(" Setting Current Points", ctx->time[3]);
99
108
print_timing(" Loading Template Image", ctx->time[12]);
100
109
print_timing(" Reducting Template Image", ctx->time[13]);
101
print_timing(" Processing Template Image", ctx->time[4]);
110
if (ctx->use_cache) {
111
print_timing(" Processing Template Image", ctx->time[6]);
102
113
print_timing("Loading Images", load_time);
103
114
print_timing(" Load", ctx->time[14]);
104
115
print_timing(" Reduction", ctx->time[15]);
105
116
print_timing("Processing Images", process_time);
106
print_timing(" Copy Fragment", ctx->time[6]);
117
if (!ctx->use_cache) {
118
print_timing(" Processing Template Image", ctx->time[6]);
120
print_timing(" Copy Image ", ctx->time[4]);
107
121
print_timing(" Load Fragment", ctx->time[7]);
108
122
print_timing(" Preprocessing", ctx->time[8]);
109
123
print_timing(" FFT", ctx->time[9]);
173
187
base_blocks = calc_blocks(4 * ps->corr_size + 1, BLOCK_SIZE_1D);
174
188
ps->base_blocks_power = get_power(base_blocks);
191
err = fftInit(ps, device_memory);
192
if ((err == DICT_ERROR_CUDA_MALLOC)&&(ps->use_cache)) {
194
err = fftInit(ps, device_memory);
178
197
#ifdef DICT_HW_MEASURE_TIMINGS
179
198
gettimeofday(&tv2, NULL);
180
199
ps->time[1] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
181
200
#endif /* DICT_HW_MEASURE_TIMINGS */
203
reportMessage("Caching is enabled");
205
reportMessage("Caching is disabled");
291
319
int dictLoadTemplateFragment(DICTContext ps, int icp, int ncp, const unsigned char *img) {
294
#ifdef DICT_HW_MEASURE_TIMINGS
295
struct timeval tv1, tv2;
296
gettimeofday(&tv1, NULL);
297
#endif /* DICT_HW_MEASURE_TIMINGS */
299
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
301
#ifdef DICT_HW_MEASURE_TIMINGS
302
gettimeofday(&tv2, NULL);
303
ps->time[4] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
304
#endif /* DICT_HW_MEASURE_TIMINGS */
322
err = fftLoadBaseImage(ps, img);
323
if (!err) err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
323
347
int base_size = 4 * ps->corr_size + 1;
324
348
int base_size2 = base_size * base_size;
326
if (width * height > ps->ncp * size2) {
350
if ((ps->use_cache)&&(width * height > ps->ncp * size2)) {
332
// if not enoguh space for caching enable anyway ?
333
if (width * height > ps->ncp * base_size2) {
356
if ((ps->use_cache)&&(width * height > ps->ncp * base_size2)) {
334
357
ps->base_mode = 0;
336
359
ps->base_mode = 1;
339
ps->maxx = width - 1;
341
ps->maxy = height - 1;
345
for (int icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
346
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
350
if ((ps->base_mode)&&(!ps->mode)) {
351
// Correcting difference of area size between base and data images
352
ps->minx += ps->corr_size;
353
ps->miny += ps->corr_size;
354
ps->maxx -= ps->corr_size;
355
ps->maxy -= ps->corr_size;
357
width = ceil(ps->maxx) - floor(ps->minx);
358
height = ceil(ps->maxy) - floor(ps->miny);
360
// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);
361
if (width * height < ps->ncp * size2) {
363
err = fftLoadBaseImage(ps, img);
366
for (icp = 0; (icp < ps->ncp)&&(!err); icp+=CP_BLOCK) {
367
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
366
371
#ifdef DICT_HW_MEASURE_TIMINGS
367
372
gettimeofday(&tv2, NULL);
368
ps->time[4] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
373
ps->time[6] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
369
374
#endif /* DICT_HW_MEASURE_TIMINGS */
372
reportMessage("Running in the image mode");
379
reportMessage("Running in the image mode, caching enabled");
381
reportMessage("Running in the image mode, caching disabled");
374
reportMessage("Running in the fragment mode");
386
reportMessage("Running in the image/fragment mode, caching enabled");
388
reportMessage("Running in the image/fragment mode, caching disabled");
392
reportMessage("Running in the fragment mode, caching enabled");
394
reportMessage("Running in the fragment mode, caching disabled");
381
404
int dictLoadFragment(DICTContext ps, int icp, int ncp, const unsigned char *input) {
406
int load_base = !ps->use_cache;
383
407
cudaStream_t stream = NULL;
385
409
#ifdef DICT_HW_MEASURE_TIMINGS
388
412
gettimeofday(&tv1, NULL);
389
413
#endif /* DICT_HW_MEASURE_TIMINGS */
391
err = fftCopyFragment(ps, icp, ncp, input);
416
// We are enforcing image mode if the cache is disabled
417
err = fftLoadBaseFragment(ps, icp, ncp, NULL);
394
421
#ifdef DICT_HW_MEASURE_TIMINGS
395
422
gettimeofday(&tv2, NULL);
396
423
time[1] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
397
424
#endif /* DICT_HW_MEASURE_TIMINGS */
399
err = fftLoadFragment(ps, icp, ncp, input, stream);
426
err = fftLoadFragment(ps, icp, ncp, input);
402
429
#ifdef DICT_HW_MEASURE_TIMINGS
403
430
gettimeofday(&tv2, NULL);
404
431
time[2] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
405
432
#endif /* DICT_HW_MEASURE_TIMINGS */
407
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
434
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
410
437
#ifdef DICT_HW_MEASURE_TIMINGS
411
438
gettimeofday(&tv2, NULL);
412
439
time[3] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
413
440
#endif /* DICT_HW_MEASURE_TIMINGS */
415
err = fftProcessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
442
err = fftProcessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
418
445
#ifdef DICT_HW_MEASURE_TIMINGS
419
446
gettimeofday(&tv2, NULL);
420
447
time[4] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
421
448
#endif /* DICT_HW_MEASURE_TIMINGS */
423
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
450
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
426
453
#ifdef DICT_HW_MEASURE_TIMINGS
427
454
gettimeofday(&tv2, NULL);
448
473
int ncp = ps->ncp;
451
475
#ifdef DICT_HW_MEASURE_TIMINGS
453
476
struct timeval tv1, tv2;
454
477
gettimeofday(&tv1, NULL);
478
#endif /* DICT_HW_MEASURE_TIMINGS */
480
err = fftLoadImage(ps, img);
483
#ifdef DICT_HW_MEASURE_TIMINGS
455
484
gettimeofday(&tv2, NULL);
456
time[0] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
457
printf("Pre: %li, Nope: %li, Comp: %li, Post: %li, Copy/Load: %li, -: %li\n", time[0], time[1], time[2], time[3], time[4], time[5]);
459
cudaStream_t stream[2];
460
for (int i = 0; i < 2; ++i) {
461
cudaStreamCreate(&stream[i]);
463
for (int i = 0; i < 2; ++i) {
464
cudaStreamDestroy(stream[i]);
485
ps->time[4] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
486
#endif /* DICT_HW_MEASURE_TIMINGS */
469
488
for (int icp = 0; icp < ncp; icp+=CP_BLOCK) {
470
489
err = dictLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
471
490
if (err) return err;
475
err = fftCopyFragment(ps, icp, ncp, input);
478
err = fftLoadFragment(ps, icp, ncp, input, stream);
481
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
484
err = fftProcessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
487
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
494
err = fftCopyFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
495
err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
496
for (int i = 0; icp < ps->ncp; icp+=CP_BLOCK,i++) {
497
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
498
err = fftComputeFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
499
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
500
if (next_icp < ps->ncp) {
501
err = fftCopyFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
502
err = fftLoadFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
493
#ifdef DICT_HW_MEASURE_TIMINGS
494
gettimeofday(&tv2, NULL);
495
ps->time[5] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
496
#endif /* DICT_HW_MEASURE_TIMINGS */