188
271
ps->base_blocks_power = get_power(base_blocks);
190
273
ps->use_cache = 1;
275
if (!ctx->use_threads) {
276
err = fftInit(ps, device_memory);
277
if ((err == DICT_ERROR_CUDA_MALLOC)&&(ps->use_cache)) {
280
err = fftInit(ps, device_memory);
287
#ifdef DICT_SUPPORT_THREADS
288
/**
 * Scheduler callback that initialises the FFT state of one device.
 *
 * `data` is cast to the DICTContext and `device_id` selects the entry of
 * ctx->pstates to work on.  Only the statements visible in this listing are
 * described.
 * NOTE(review): the bare integer rows and the missing braces suggest this is
 * a line-numbered diff listing with elided lines -- confirm against the real
 * file.
 */
static int dictSetupThread(HWThread thr, void *hwctx, int device_id, void *data) {
290
DICTContext ctx = (DICTContext)data;
292
ProcessingState ps = ctx->pstates[device_id];
191
294
/* First initialisation attempt. */
err = fftInit(ps, device_memory);
192
295
/* On a CUDA allocation failure while caching is on, turn the cache off and retry. */
if ((err == DICT_ERROR_CUDA_MALLOC)&&(ps->use_cache)) {
193
297
ps->use_cache = 0;
194
298
err = fftInit(ps, device_memory);
197
#ifdef DICT_HW_MEASURE_TIMINGS
198
gettimeofday(&tv2, NULL);
199
ps->time[1] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
200
#endif /* DICT_HW_MEASURE_TIMINGS */
203
reportMessage("Caching is enabled");
205
reportMessage("Caching is disabled");
211
/**
 * Copy the template control-point coordinates into the context's point buffer.
 *
 * @param ps        context holding `points`, `ncp` and `ncp_alloc_size`
 * @param points_x  source of ps->ncp X coordinates
 * @param points_y  source of ps->ncp Y coordinates
 *
 * Only the statements visible in this listing are described; the return
 * statement is not shown.
 */
int dictSetTemplatePoints(DICTContext ps, const float *points_x, const float *points_y) {
212
#ifdef DICT_HW_MEASURE_TIMINGS
213
struct timeval tv1, tv2;
214
gettimeofday(&tv1, NULL);
215
#endif /* DICT_HW_MEASURE_TIMINGS */
217
/* X coordinates go to the start of ps->points. */
memcpy(ps->points, points_x, ps->ncp * sizeof(float));
218
/* Y coordinates go to offset ps->ncp_alloc_size from ps->points. */
memcpy(ps->points + ps->ncp_alloc_size, points_y, ps->ncp * sizeof(float));
220
#ifdef DICT_HW_MEASURE_TIMINGS
221
gettimeofday(&tv2, NULL);
222
/* Accumulate the elapsed time (microseconds) in timing slot 2. */
ps->time[2] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
223
#endif /* DICT_HW_MEASURE_TIMINGS */
228
/**
 * Compare the requested image dimensions with the ones stored in `ps`.
 *
 * The visible path returns the result of fftSetupDimensions(ps).
 * NOTE(review): the lines between the comparison and the return are elided
 * here, so whether the return sits inside the `if` cannot be told from this
 * listing.
 */
int dictSetDimensions(DICTContext ps, int width, int height) {
229
if ((width != ps->width)||(height != ps->height)) {
233
return fftSetupDimensions(ps);
239
int dictSetPointsBuffer(DICTContext ps, float *point_x, float *point_y) {
246
/**
 * Copy the current control-point coordinates into the context's point buffer.
 *
 * @param ps        context holding `points`, `ncp` and `ncp_alloc_size`
 * @param points_x  source of ps->ncp X coordinates
 * @param points_y  source of ps->ncp Y coordinates
 *
 * Only the statements visible in this listing are described; the return
 * statement is not shown.
 */
int dictSetCurrentPoints(DICTContext ps, const float *points_x, const float *points_y) {
247
#ifdef DICT_HW_MEASURE_TIMINGS
248
struct timeval tv1, tv2;
249
gettimeofday(&tv1, NULL);
250
#endif /* DICT_HW_MEASURE_TIMINGS */
252
/* X coordinates go to offset 2 * ncp_alloc_size from ps->points. */
memcpy(ps->points + 2 * ps->ncp_alloc_size, points_x, ps->ncp * sizeof(float));
253
/* Y coordinates go to offset 3 * ncp_alloc_size from ps->points. */
memcpy(ps->points + 3 * ps->ncp_alloc_size, points_y, ps->ncp * sizeof(float));
255
#ifdef DICT_HW_MEASURE_TIMINGS
256
gettimeofday(&tv2, NULL);
257
/* Accumulate the elapsed time (microseconds) in timing slot 3. */
ps->time[3] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
258
#endif /* DICT_HW_MEASURE_TIMINGS */
266
/**
 * Run fftGetCurrentPoints() on the context and keep its status in `err`.
 *
 * Only the statements visible in this listing are described; the declaration
 * of `err` and the return statement are not shown.
 */
int dictCompute(DICTContext ps) {
269
#ifdef DICT_HW_MEASURE_TIMINGS
270
struct timeval tv1, tv2;
271
gettimeofday(&tv1, NULL);
272
#endif /* DICT_HW_MEASURE_TIMINGS */
274
err = fftGetCurrentPoints(ps);
276
#ifdef DICT_HW_MEASURE_TIMINGS
277
gettimeofday(&tv2, NULL);
278
/* Accumulate the elapsed time (microseconds) in timing slot 12. */
ps->time[12] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
279
#endif /* DICT_HW_MEASURE_TIMINGS */
284
/**
 * Head of the single-state variant: starts the optional timer and calls
 * fftGetCurrentPoints(ps), storing the status in `err`.
 *
 * NOTE(review): the rest of this definition is not contiguous in this
 * listing; what happens to `res_x` / `res_y` cannot be told from these lines.
 */
int dictGetCurrentPoints(DICTContext ps, float *res_x, float *res_y) {
287
#ifdef DICT_HW_MEASURE_TIMINGS
288
struct timeval tv1, tv2;
289
gettimeofday(&tv1, NULL);
290
#endif /* DICT_HW_MEASURE_TIMINGS */
292
err = fftGetCurrentPoints(ps);
305
#endif /* DICT_SUPPORT_THREADS */
308
/**
 * Configure the context for `ncp` control points and set up every per-device
 * processing state.
 *
 * Visible steps: derive the allocation size, record the MATLAB / threading
 * flags, choose how many points each device handles, allocate the shared host
 * point buffer, call dictSetupFFT() per device slice, optionally run
 * dictSetupThread through the scheduler, then wire each state's `points`
 * pointer into the shared buffer and count the states that kept caching.
 *
 * @return DICT_ERROR_MALLOC after the host-allocation error message, or a
 *         non-zero ps->status; other returns are not visible.
 *
 * NOTE(review): this listing elides lines (else branches, closing braces,
 * declarations, error checks); only visible statements are described.
 */
int dictSetup(DICTContext ctx, int ncp, int corr_size, int precision, int flags) {
316
/* presumably rounds ncp up to a multiple of CP_BLOCK -- confirm in calc_alloc */
int ncp_alloc = calc_alloc(ncp, CP_BLOCK);
321
#ifdef DICT_HW_MEASURE_TIMINGS
322
struct timeval tv1, tv2;
323
gettimeofday(&tv1, NULL);
324
#endif /* DICT_HW_MEASURE_TIMINGS */
328
/* The 1 / 0 assignments look like the two arms of an if/else whose `else` row is elided. */
if (flags&DICT_FLAGS_MATLAB_MODE) {
329
ctx->matlab_mode = 1;
331
ctx->matlab_mode = 0;
334
#ifdef DICT_SUPPORT_THREADS
335
/* Same pattern: DICT_FLAGS_SINGLE_THREAD turns threading off, otherwise it is on. */
if (flags&DICT_FLAGS_SINGLE_THREAD) {
336
ctx->use_threads = 0;
338
ctx->use_threads = 1;
340
#endif /* DICT_SUPPORT_THREADS */
342
/* With threads: split the CP_BLOCK-sized blocks across device_number devices. */
if (ctx->use_threads) {
343
ncp_blocks = ncp_alloc / CP_BLOCK;
344
blocks_alloc = calc_alloc(ncp_blocks, device_number);
345
per_device = (blocks_alloc / device_number) * CP_BLOCK;
347
/* presumably the non-threaded arm: a single state takes everything */
per_device = ncp_alloc;
351
ctx->ncp_alloc_size = ncp_alloc;
352
ctx->ncp_per_device = per_device;
354
/* Host buffer sized for 8 * ncp_alloc_size floats, allocated via cudaHostAlloc with flags 0. */
cuda_err = cudaHostAlloc((void**)&ctx->points, 8 * ctx->ncp_alloc_size * sizeof(float), 0);
356
reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ctx->ncp_alloc_size);
358
return DICT_ERROR_MALLOC;
361
/* One iteration per device slice; i is the index into ctx->pstates. */
for (icp = 0; icp < ncp; icp += per_device) {
362
i = icp / per_device;
363
/* The last slice may hold fewer than per_device points. */
int device_ncp = min2(per_device, ncp - icp);
364
ProcessingState ps = ctx->pstates[i];
366
err = dictSetupFFT(ctx, ps, device_ncp, corr_size, precision, flags);
370
#ifdef DICT_SUPPORT_THREADS
371
/* NOTE(review): the return value of hw_sched_schedule_task is not examined in the visible lines. */
if (ctx->use_threads) {
372
hw_sched_schedule_task(ctx->sched, ctx, dictSetupThread);
373
hw_sched_wait_task(ctx->sched);
375
#endif /* DICT_SUPPORT_THREADS */
377
for (icp = 0; icp < ncp; icp += per_device) {
378
i = icp / per_device;
379
ProcessingState ps = ctx->pstates[i];
381
/* Propagate the first non-zero per-device status. */
if (ps->status) return ps->status;
383
/* Each state sees the shared buffer starting at its own slice offset. */
ps->points = ctx->points + icp;
384
ps->points_alloc_size = ctx->ncp_alloc_size;
386
if (ps->use_cache) ++have_cache;
389
#ifdef DICT_HW_MEASURE_TIMINGS
390
gettimeofday(&tv2, NULL);
391
/* Accumulate the elapsed time (microseconds) in timing slot 1. */
ctx->time[1] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
392
#endif /* DICT_HW_MEASURE_TIMINGS */
395
/* i + 1 is reported as the device count; it relies on i keeping the last loop index. */
reportMessage("Caching is enabled for %i devices of %i", have_cache, i + 1);
397
reportMessage("Caching is disabled for all %i devices", i + 1);
403
#ifdef DICT_SUPPORT_THREADS
404
/**
 * Scheduler callback: run fftSetupDimensions() on the processing state
 * selected by `device_id`.  `data` carries the DICTContext.
 *
 * Only the visible statements are described; what is done with `err` is not
 * shown.
 */
static int dictSetDimensionsThread(HWThread thr, void *hwctx, int device_id, void *data) {
406
DICTContext ctx = (DICTContext)data;
407
ProcessingState ps = ctx->pstates[device_id];
409
err = fftSetupDimensions(ps);
414
#endif /* DICT_SUPPORT_THREADS */
416
/**
 * React to a change of image dimensions: reallocate ctx->base_image and
 * re-run fftSetupDimensions() for the per-device states, either inline or
 * through the scheduler.
 *
 * @return DICT_ERROR_MALLOC when the base-image allocation fails; other
 *         returns are not visible in this listing.
 *
 * NOTE(review): rows are elided here (e.g. the width assignment and the
 * closing braces); only visible statements are described.
 */
int dictSetDimensions(DICTContext ctx, int width, int height) {
419
int icp, ncp = ctx->ncp;
420
int per_device = ctx->ncp_per_device;
422
if ((width != ctx->width)||(height != ctx->height)) {
423
/* NOTE(review): int multiplication -- can overflow for very large dimensions. */
int image_size = width * height;
425
/* NOTE(review): free(NULL) is a no-op, so the guard is redundant. */
if (ctx->base_image) free(ctx->base_image);
426
ctx->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));
427
if (!ctx->base_image) {
428
/* NOTE(review): %u is given int arguments -- confirm reportError's formatting rules. */
reportError("Memory allocation of %u*%u*uint8_t bytes for DICT/base_image is failed", width, height);
430
return DICT_ERROR_MALLOC;
434
ctx->height = height;
436
/* One iteration per device slice; i indexes ctx->pstates. */
for (icp = 0; icp < ncp; icp += per_device) {
437
int i = icp / per_device;
438
ProcessingState ps = ctx->pstates[i];
443
/* Without threads the FFT dimensions are set up right here. */
if (!ctx->use_threads) {
444
err = fftSetupDimensions(ps);
451
#ifdef DICT_SUPPORT_THREADS
452
/* With threads the same call is made through dictSetDimensionsThread. */
if (ctx->use_threads) {
453
hw_sched_schedule_task(ctx->sched, ctx, dictSetDimensionsThread);
454
hw_sched_wait_task(ctx->sched);
458
#endif /* DICT_SUPPORT_THREADS */
465
/**
 * Copy the template control-point coordinates into the shared point buffer.
 *
 * @param ctx       context holding `points`, `ncp` and `ncp_alloc_size`
 * @param points_x  source of ctx->ncp X coordinates
 * @param points_y  source of ctx->ncp Y coordinates
 *
 * Only the statements visible in this listing are described; the return
 * statement is not shown.
 */
int dictSetTemplatePoints(DICTContext ctx, const float *points_x, const float *points_y) {
466
#ifdef DICT_HW_MEASURE_TIMINGS
467
struct timeval tv1, tv2;
468
gettimeofday(&tv1, NULL);
469
#endif /* DICT_HW_MEASURE_TIMINGS */
471
/* X coordinates go to the start of ctx->points. */
memcpy(ctx->points, points_x, ctx->ncp * sizeof(float));
472
/* Y coordinates go to offset ctx->ncp_alloc_size from ctx->points. */
memcpy(ctx->points + ctx->ncp_alloc_size, points_y, ctx->ncp * sizeof(float));
474
#ifdef DICT_HW_MEASURE_TIMINGS
475
gettimeofday(&tv2, NULL);
476
/* Accumulate the elapsed time (microseconds) in timing slot 2. */
ctx->time[2] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
477
#endif /* DICT_HW_MEASURE_TIMINGS */
483
/**
 * Register caller-provided result buffers on the context and on every
 * per-device state.
 *
 * The context stores the base pointers; each state stores the pointers
 * advanced by the offset of its slice (`icp`).
 *
 * Only the statements visible in this listing are described; the return
 * statement is not shown.
 */
int dictSetPointsBuffer(DICTContext ctx, float *point_x, float *point_y) {
484
int icp, ncp = ctx->ncp;
485
int per_device = ctx->ncp_per_device;
487
ctx->res_x = point_x;
488
ctx->res_y = point_y;
490
/* One iteration per device slice; i indexes ctx->pstates. */
for (icp = 0; icp < ncp; icp += per_device) {
491
int i = icp / per_device;
492
ProcessingState ps = ctx->pstates[i];
494
ps->res_x = point_x + icp;
495
ps->res_y = point_y + icp;
501
/**
 * Copy the current control-point coordinates into the shared point buffer
 * and then visit every per-device state.
 *
 * @param ctx       context holding `points`, `ncp` and `ncp_alloc_size`
 * @param points_x  source of ctx->ncp X coordinates
 * @param points_y  source of ctx->ncp Y coordinates
 *
 * NOTE(review): the body of the final loop is elided in this listing, so what
 * is done to each state cannot be told from here.
 */
int dictSetCurrentPoints(DICTContext ctx, const float *points_x, const float *points_y) {
502
int icp, ncp = ctx->ncp;
503
int per_device = ctx->ncp_per_device;
505
#ifdef DICT_HW_MEASURE_TIMINGS
506
struct timeval tv1, tv2;
507
gettimeofday(&tv1, NULL);
508
#endif /* DICT_HW_MEASURE_TIMINGS */
510
/* X coordinates go to offset 2 * ncp_alloc_size from ctx->points. */
memcpy(ctx->points + 2 * ctx->ncp_alloc_size, points_x, ctx->ncp * sizeof(float));
511
/* Y coordinates go to offset 3 * ncp_alloc_size from ctx->points. */
memcpy(ctx->points + 3 * ctx->ncp_alloc_size, points_y, ctx->ncp * sizeof(float));
513
#ifdef DICT_HW_MEASURE_TIMINGS
514
gettimeofday(&tv2, NULL);
515
/* Accumulate the elapsed time (microseconds) in timing slot 3. */
ctx->time[3] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
516
#endif /* DICT_HW_MEASURE_TIMINGS */
518
/* One iteration per device slice; i indexes ctx->pstates. */
for (icp = 0; icp < ncp; icp += per_device) {
519
int i = icp / per_device;
520
ProcessingState ps = ctx->pstates[i];
528
#ifdef DICT_SUPPORT_THREADS
529
/**
 * Scheduler callback: run fftGetCurrentPoints() on the processing state
 * selected by `device_id`.  `data` carries the DICTContext.
 *
 * Only the visible statements are described; what is done with `err` is not
 * shown.
 */
static int dictComputeThread(HWThread thr, void *hwctx, int device_id, void *data) {
531
DICTContext ctx = (DICTContext)data;
532
ProcessingState ps = ctx->pstates[device_id];
534
err = fftGetCurrentPoints(ps);
539
#endif /* DICT_SUPPORT_THREADS */
541
/**
 * Run fftGetCurrentPoints() for every per-device state, through the
 * scheduler when ctx->use_threads is set and in a plain loop otherwise.
 *
 * NOTE(review): the `else` row separating the two paths, the error handling
 * and the return statement are elided in this listing.
 */
int dictCompute(DICTContext ctx) {
544
int icp, ncp = ctx->ncp;
545
int per_device = ctx->ncp_per_device;
547
#ifdef DICT_HW_MEASURE_TIMINGS
548
struct timeval tv1, tv2;
549
gettimeofday(&tv1, NULL);
550
#endif /* DICT_HW_MEASURE_TIMINGS */
552
if (ctx->use_threads) {
553
#ifdef DICT_SUPPORT_THREADS
554
/* Dispatch dictComputeThread and block until the scheduler reports completion. */
hw_sched_schedule_task(ctx->sched, ctx, dictComputeThread);
555
hw_sched_wait_task(ctx->sched);
558
#endif /* DICT_SUPPORT_THREADS */
560
/* presumably the non-threaded arm: visit each device slice in turn */
for (icp = 0; icp < ncp; icp += per_device) {
561
int i = icp / per_device;
562
ProcessingState ps = ctx->pstates[i];
563
err = fftGetCurrentPoints(ps);
568
#ifdef DICT_HW_MEASURE_TIMINGS
569
gettimeofday(&tv2, NULL);
570
/* Accumulate the elapsed time (microseconds) in timing slot 16. */
ctx->time[16] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
571
#endif /* DICT_HW_MEASURE_TIMINGS */
576
/**
 * Compute the current points and, when the caller supplies buffers different
 * from the registered result buffers, copy the coordinates into them.
 *
 * @param res_x  optional destination for ctx->ncp X values
 * @param res_y  optional destination for ctx->ncp Y values
 * @return the error from dictCompute() when it is non-zero; other returns are
 *         not visible.
 *
 * NOTE(review): this span interleaves rows that use `ps` with near-identical
 * rows that use `ctx`; they look like the old and new sides of a diff.  The
 * comments below follow the `ctx` rows -- confirm which rows exist in the
 * real file.
 */
int dictGetCurrentPoints(DICTContext ctx, float *res_x, float *res_y) {
580
#ifdef DICT_HW_MEASURE_TIMINGS
581
struct timeval tv1, tv2;
582
#endif /* DICT_HW_MEASURE_TIMINGS */
584
err = dictCompute(ctx);
293
585
if (err) return err;
295
if ((res_x)&&(res_x != ps->res_x)) {
587
#ifdef DICT_HW_MEASURE_TIMINGS
588
/* The timer starts after dictCompute(), so it covers only the copies below. */
gettimeofday(&tv1, NULL);
589
#endif /* DICT_HW_MEASURE_TIMINGS */
591
// stored is changed synchronously across devices
592
stored = ctx->pstates[0]->stored;
594
/* Copy only when a destination is given and it is not the registered buffer. */
if ((res_x)&&(res_x != ctx->res_x)) {
297
if (ps->stored) data_x = ps->res_x;
298
else data_x = ps->points + 2 * ps->ncp_alloc_size;
596
/* Source is the registered result buffer when `stored` is set, else the point buffer at offset 2 * ncp_alloc_size. */
if (stored) data_x = ctx->res_x;
597
else data_x = ctx->points + 2 * ctx->ncp_alloc_size;
300
memcpy(res_x, data_x, ps->ncp * sizeof(float));
599
memcpy(res_x, data_x, ctx->ncp * sizeof(float));
303
if ((res_y)&&(res_y != ps->res_y)) {
602
if ((res_y)&&(res_y != ctx->res_y)) {
305
if (ps->stored) data_y = ps->res_y;
306
else data_y = ps->points + 3 * ps->ncp_alloc_size;
604
/* Same selection for Y, using offset 3 * ncp_alloc_size. */
if (stored) data_y = ctx->res_y;
605
else data_y = ctx->points + 3 * ctx->ncp_alloc_size;
308
memcpy(res_y, data_y, ps->ncp * sizeof(float));
607
memcpy(res_y, data_y, ctx->ncp * sizeof(float));
311
610
#ifdef DICT_HW_MEASURE_TIMINGS
312
611
gettimeofday(&tv2, NULL);
313
ps->time[16] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
612
/* NOTE(review): slot 16 is also the slot the ctx-based dictCompute() adds to -- confirm this is intended. */
ctx->time[16] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
314
613
#endif /* DICT_HW_MEASURE_TIMINGS */
319
/*
 * Two signatures for dictLoadTemplateFragment appear back to back (one
 * public taking `ps`, one static taking `ctx`).
 * NOTE(review): they look like the old and new sides of a diff; only one can
 * exist in a compilable file -- confirm.
 * The only visible body statement forwards the image to fftLoadBaseImage().
 */
int dictLoadTemplateFragment(DICTContext ps, int icp, int ncp, const unsigned char *img) {
619
static int dictLoadTemplateFragment(DICTContext ctx, int icp, int ncp, const unsigned char *img) {
322
622
err = fftLoadBaseImage(ps, img);
328
int dictLoadTemplateImage(DICTContext ps, const unsigned char *img, int width, int height) {
630
#ifdef DICT_SUPPORT_THREADS
631
/**
 * Scheduler callback: load the template image held in ctx->param_img into the
 * processing state selected by `device_id`, then load base fragments in
 * CP_BLOCK-sized steps until one fails.
 *
 * NOTE(review): the condition that opens the block closed by `} else if` is
 * elided in this listing.
 */
static int dictLoadTemplateImageThread(HWThread thr, void *hwctx, int device_id, void *data) {
633
DICTContext ctx = (DICTContext)data;
634
ProcessingState ps = ctx->pstates[device_id];
638
err = fftLoadBaseImage(ps, ctx->param_img);
641
/* Stop at the first error; the last fragment may be shorter than CP_BLOCK. */
for (jcp = 0; (jcp < ps->ncp)&&(!err); jcp += CP_BLOCK) {
642
err = fftLoadBaseFragment(ps, jcp, min2(CP_BLOCK, ps->ncp - jcp), ctx->param_img);
644
/* NOTE(review): res_do_copy lives on the shared context and is written from the callback -- confirm the scheduler makes this safe. */
} else if (!ps->base_mode) ctx->res_do_copy = 1;
650
#endif /* DICT_SUPPORT_THREADS */
652
/**
 * Load a template image: update the dimensions, push the image to every
 * per-device state (via the scheduler or a plain loop) and copy the pixels
 * into ctx->base_image.
 *
 * NOTE(review): this span interleaves rows that use `ps` at function scope
 * with near-identical rows that use `ctx`; they look like the old and new
 * sides of a diff.  The comments below follow the `ctx` rows.  The condition
 * guarding the final memcpy and the return statement are elided.
 */
int dictLoadTemplateImage(DICTContext ctx, const unsigned char *img, int width, int height) {
655
int icp, ncp = ctx->ncp;
656
int per_device = ctx->ncp_per_device;
333
660
#ifdef DICT_HW_MEASURE_TIMINGS
334
661
struct timeval tv1, tv2;
335
662
gettimeofday(&tv1, NULL);
336
663
#endif /* DICT_HW_MEASURE_TIMINGS */
338
dictSetDimensions(ps, width, height);
340
err = fftLoadBaseImage(ps, img);
343
for (icp = 0; (icp < ps->ncp)&&(!err); icp+=CP_BLOCK) {
344
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
665
/*
 * NOTE(review): the result is discarded.  The ctx-based dictSetDimensions()
 * can return DICT_ERROR_MALLOC with ctx->base_image left NULL, and the memcpy
 * further down writes through that pointer unless an elided check prevents it.
 */
dictSetDimensions(ctx, width, height);
667
if (ctx->use_threads) {
668
#ifdef DICT_SUPPORT_THREADS
669
/* Clear the flag, publish the image pointer, run the workers, then read the flag back. */
ctx->res_do_copy = 0;
670
ctx->param_img = img;
672
hw_sched_schedule_task(ctx->sched, ctx, dictLoadTemplateImageThread);
673
hw_sched_wait_task(ctx->sched);
677
do_copy = ctx->res_do_copy;
678
#endif /* DICT_SUPPORT_THREADS */
680
/* presumably the non-threaded arm: the same work done inline per device slice */
for (icp = 0; icp < ncp; icp += per_device) {
681
int i = icp / per_device;
682
ProcessingState ps = ctx->pstates[i];
684
err = fftLoadBaseImage(ps, img);
687
/* Stop at the first error; the last fragment may be shorter than CP_BLOCK. */
for (jcp = 0; (jcp < ps->ncp)&&(!err); jcp += CP_BLOCK) {
688
err = fftLoadBaseFragment(ps, jcp, min2(CP_BLOCK, ps->ncp - jcp), img);
690
} else if (!ps->base_mode) do_copy = 1;
697
/* Keep a host-side copy of the template pixels (width * height bytes). */
memcpy(ctx->base_image, img, width * height * sizeof(uint8_t));
348
700
#ifdef DICT_HW_MEASURE_TIMINGS
349
701
gettimeofday(&tv2, NULL);
350
ps->time[6] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
702
/* Accumulate the elapsed time (microseconds) in timing slot 17. */
ctx->time[17] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
351
703
#endif /* DICT_HW_MEASURE_TIMINGS */
358
/*
 * Two signatures for dictLoadFragment appear back to back (one public taking
 * `ps` as the context, one static taking both `ctx` and `ps`).
 * NOTE(review): they look like the old and new sides of a diff -- confirm
 * which one exists in the real file.  Most of the body is elided; only the
 * visible rows are described.
 */
int dictLoadFragment(DICTContext ps, int icp, int ncp, const unsigned char *input) {
709
static int dictLoadFragment(DICTContext ctx, ProcessingState ps, int icp, int ncp, const unsigned char *input) {
360
711
/* The base data must be loaded whenever caching is off for this state. */
int load_base = !ps->use_cache;
412
762
/* Turns time[i] into the difference from the previous entry; the enclosing loop is elided. */
time[i] -= time[i-1];
415
for (int i = 1; i < 6; i++) {
416
ps->time[i+5] += time[i];
765
/* In the ctx rows, time[1..5] is added to ctx->time[6..10] only when threads are off. */
if (!ctx->use_threads) {
766
for (int i = 1; i < 6; i++) {
767
ctx->time[i+5] += time[i];
418
#endif /* DICT_HW_MEASURE_TIMINGS */
770
# endif /* DICT_HW_MEASURE_TIMINGS */
424
/*
 * This span mixes rows of a single-state dictLoadImage(ps, img) with rows of
 * the scheduler callback dictLoadImageThread().
 * NOTE(review): they look like the old and new sides of a diff -- confirm
 * which rows exist in the real file.
 *
 * Both variants visibly do the same work: call fftLoadImage(), then call
 * dictLoadFragment() in CP_BLOCK-sized steps over the state's control points.
 */
int dictLoadImage(DICTContext ps, unsigned char *img) {
776
#ifdef DICT_SUPPORT_THREADS
777
static int dictLoadImageThread(HWThread thr, void *hwctx, int device_id, void *data) {
428
#ifdef DICT_HW_MEASURE_TIMINGS
429
struct timeval tv1, tv2;
430
gettimeofday(&tv1, NULL);
431
#endif /* DICT_HW_MEASURE_TIMINGS */
433
err = fftLoadImage(ps, img);
436
#ifdef DICT_HW_MEASURE_TIMINGS
437
gettimeofday(&tv2, NULL);
438
/* Time of fftLoadImage() goes to slot 4. */
ps->time[4] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
439
#endif /* DICT_HW_MEASURE_TIMINGS */
441
for (int icp = 0; icp < ncp; icp+=CP_BLOCK) {
442
err = dictLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
779
/* Callback rows: recover the context and pick the state for this device. */
DICTContext ctx = (DICTContext)data;
780
ProcessingState ps = ctx->pstates[device_id];
784
err = fftLoadImage(ps, ctx->param_img);
786
/* Stop at the first error; the last fragment may be shorter than CP_BLOCK. */
for (jcp = 0; (jcp < ps->ncp)&&(!err); jcp += CP_BLOCK) {
787
err = dictLoadFragment(ctx, ps, jcp, min2(CP_BLOCK, ps->ncp - jcp), ctx->param_img);
446
#ifdef DICT_HW_MEASURE_TIMINGS
447
gettimeofday(&tv2, NULL);
448
/* NOTE(review): tv1 is not reset in the visible rows, so slot 5 would also include the slot 4 interval -- confirm. */
ps->time[5] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
449
#endif /* DICT_HW_MEASURE_TIMINGS */
455
/**
 * Load an image and then run the computation.
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictProcessImage(DICTContext ps, unsigned char *img) {
456
int err = dictLoadImage(ps, img);
458
return dictCompute(ps);
461
/**
 * Load a template image by file name, then hand the image and dimensions
 * stored on the context to dictLoadTemplateImage().
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictLoadTemplateImageFile(DICTContext ps, const char *name) {
462
int err = dictImageLoadTemplateImage(ps, name);
464
return dictLoadTemplateImage(ps, ps->image, ps->width, ps->height);
466
/**
 * Load an image by file name, then hand the image stored on the context to
 * dictLoadImage().
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictLoadImageFile(DICTContext ps, const char *name) {
467
int err = dictImageLoadImage(ps, name);
469
return dictLoadImage(ps, ps->image);
472
/**
 * Load an image by file name and then run the computation.
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictProcessImageFile(DICTContext ps, const char *name) {
473
int err = dictLoadImageFile(ps, name);
475
return dictCompute(ps);
794
#endif /* DICT_SUPPORT_THREADS */
798
/**
 * Load a new image into every per-device state.
 *
 * With ctx->use_threads set, the image pointer is published in
 * ctx->param_img and dictLoadImageThread is run through the scheduler.
 * Otherwise each device slice is handled inline: fftLoadImage() followed by
 * dictLoadFragment() in CP_BLOCK-sized steps.
 *
 * NOTE(review): the `else` row separating the two paths, the closing braces
 * and the return statement are elided in this listing.
 */
int dictLoadImage(DICTContext ctx, const unsigned char *img) {
801
int icp, ncp = ctx->ncp;
802
int per_device = ctx->ncp_per_device;
804
if (ctx->use_threads) {
805
#ifdef DICT_SUPPORT_THREADS
806
# ifdef DICT_HW_MEASURE_TIMINGS
807
struct timeval tv1, tv2;
808
gettimeofday(&tv1, NULL);
809
# endif /* DICT_HW_MEASURE_TIMINGS */
811
ctx->param_img = img;
813
/* Dispatch the callback and block until the scheduler reports completion. */
hw_sched_schedule_task(ctx->sched, ctx, dictLoadImageThread);
814
hw_sched_wait_task(ctx->sched);
817
# ifdef DICT_HW_MEASURE_TIMINGS
818
gettimeofday(&tv2, NULL);
819
/* Threaded path: the whole dispatch-and-wait interval goes to slot 5. */
ctx->time[5] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
820
# endif /* DICT_HW_MEASURE_TIMINGS */
821
#endif /* DICT_SUPPORT_THREADS */
823
/* presumably the non-threaded arm: one iteration per device slice */
for (icp = 0; icp < ncp; icp += per_device) {
824
int i = icp / per_device;
825
ProcessingState ps = ctx->pstates[i];
827
#ifdef DICT_HW_MEASURE_TIMINGS
828
struct timeval tv1, tv2;
829
gettimeofday(&tv1, NULL);
830
#endif /* DICT_HW_MEASURE_TIMINGS */
832
err = fftLoadImage(ps, img);
834
#ifdef DICT_HW_MEASURE_TIMINGS
835
gettimeofday(&tv2, NULL);
836
/* Time of fftLoadImage() goes to slot 4. */
ctx->time[4] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
837
#endif /* DICT_HW_MEASURE_TIMINGS */
839
/* Stop at the first error; the last fragment may be shorter than CP_BLOCK. */
for (int jcp = 0; (jcp < ps->ncp)&&(!err); jcp += CP_BLOCK) {
840
err = dictLoadFragment(ctx, ps, jcp, min2(CP_BLOCK, ps->ncp - jcp), img);
843
#ifdef DICT_HW_MEASURE_TIMINGS
844
gettimeofday(&tv2, NULL);
845
/* NOTE(review): tv1 is not reset in the visible rows, so slot 5 would also include the slot 4 interval -- confirm. */
ctx->time[5] += (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
846
#endif /* DICT_HW_MEASURE_TIMINGS */
853
/**
 * Load a template image by file name, then hand the image and dimensions
 * stored on the context to dictLoadTemplateImage().
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictLoadTemplateImageFile(DICTContext ctx, const char *name) {
854
int err = dictImageLoadTemplateImage(ctx, name);
856
return dictLoadTemplateImage(ctx, ctx->image, ctx->width, ctx->height);
859
/**
 * Load an image by file name, then hand the image stored on the context to
 * dictLoadImage().
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictLoadImageFile(DICTContext ctx, const char *name) {
860
int err = dictImageLoadImage(ctx, name);
862
return dictLoadImage(ctx, ctx->image);
865
/**
 * Load an image and then run the computation.
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictProcessImage(DICTContext ctx, const unsigned char *img) {
866
int err = dictLoadImage(ctx, img);
868
return dictCompute(ctx);
871
/**
 * Load an image by file name and then run the computation.
 *
 * NOTE(review): the row between the two calls is elided; presumably it
 * returns early when `err` is non-zero -- confirm.
 */
int dictProcessImageFile(DICTContext ctx, const char *name) {
872
int err = dictLoadImageFile(ctx, name);
874
return dictCompute(ctx);