32
32
static int device_number = 0;
33
33
#ifdef DICT_SUPPORT_THREADS
34
static int devices[MAX_DEVICES];
34
static int devices[DICT_HW_MAX_DEVICES];
35
35
#endif /* DICT_SUPPORT_THREADS */
36
36
static size_t device_memory = 2147483648;
57
57
cudaGetDeviceProperties(&deviceProp, i);
58
58
if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {
59
59
memory = deviceProp.totalGlobalMem;
60
if (memory > 268435455) { // 256 MB
60
if (memory >= CUDA_MIN_MEMORY) {
61
61
if (memory < device_memory) device_memory = memory;
62
62
#ifdef DICT_SUPPORT_THREADS
63
63
devices[device_number++] = i;
76
76
return device_number;
80
#ifdef DICT_SUPPORT_THREADS
81
static int dictFreeThread(HWThread thr, void *hwctx, int device_id, void *data) {
82
DICTContext ctx = (DICTContext)data;
84
ProcessingState ps = ctx->pstates[device_id];
89
#endif /* DICT_SUPPORT_THREADS */
79
91
static void dictFree(DICTContext ctx) {
95
if (ctx->use_threads) {
96
#ifdef DICT_SUPPORT_THREADS
97
hw_sched_schedule_task(ctx->sched, ctx, dictFreeThread);
98
hw_sched_wait_task(ctx->sched);
99
#endif /* DICT_SUPPORT_THREADS */
101
for (i = 0; i < device_number; i++) {
102
ProcessingState ps = ctx->pstates[i];
80
108
if (ctx->image_buf) {
81
109
dictImageFree(ctx);
89
117
memset(ctx, 0, ((char*)&(ctx->matlab_mode)) - ((char*)ctx));
121
size_t dictGetMaxCacheablePoints(int corr_size, int precision, int flags) {
122
// we are a bit pessimistic here
124
int fft_size = 6 * corr_size + 1;
127
if (flags&DICT_FLAGS_FIXED_FFT_SIZE) {
128
fft_real_size = fft_size;
130
fft_real_size = next_power(fft_size);
133
int fft_alloc_size = calc_alloc(fft_real_size * fft_real_size, BLOCK_SIZE_1D);
134
int side_alloc_size = calc_alloc(fft_size, SIDE_BLOCK_SIZE);
136
int subimage_size = corr_size * 4 + 1;
137
int lsum_size = corr_size * 2 + 1;
138
int lsum_temp_size = subimage_size + 2 * lsum_size - 1;
140
int lsum_alloc_size = calc_alloc(lsum_temp_size + lsum_size, BLOCK_SIZE_2D);
143
((1 + CP_BLOCK) * fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */
144
2 * CP_BLOCK * side_alloc_size * sizeof(int32_t), /* Sum, Std computations */
145
CP_BLOCK * side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */
148
int ncp = (device_memory - CUDA_EXTRA_MEMORY - (
149
size + // cuda_temp_buffer
150
CP_BLOCK * side_alloc_size * side_alloc_size * sizeof(uint8_t) + // cuda_input_buffer
151
2 * CP_BLOCK * fft_alloc_size * sizeof(cufftReal) + // cuda_base_buffer, cuda_data_buffer
152
4 * lsum_alloc_size * lsum_alloc_size * sizeof(float) // cuda_lsum_temp
154
2 * sizeof(float) + // cuda_points
155
fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)) // caches
158
int ncp_blocks = ncp / CP_BLOCK; // thats per device
160
return (flags&DICT_FLAGS_SINGLE_THREAD)?(CP_BLOCK * ncp_blocks):(CP_BLOCK * device_number * ncp_blocks);
92
163
#ifdef DICT_SUPPORT_THREADS
93
164
static int dictInitThread(HWThread thr, void *hwctx, int device_id, void *data) {
94
165
cudaError cuda_err;
353
422
ctx->ncp_alloc_size = ncp_alloc;
354
423
ctx->ncp_per_device = per_device;
356
cuda_err = cudaHostAlloc((void**)&ctx->points, 8 * ctx->ncp_alloc_size * sizeof(float), 0);
425
cuda_err = cudaHostAlloc((void**)&ctx->points, 8 * ncp_alloc * sizeof(float), 0);
358
reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ctx->ncp_alloc_size);
427
reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ncp_alloc);
360
429
return DICT_ERROR_MALLOC;
383
452
if (ps->status) return ps->status;
385
454
ps->points = ctx->points + icp;
386
ps->points_alloc_size = ctx->ncp_alloc_size;
455
ps->points_alloc_size = ncp_alloc;
388
457
if (ps->use_cache) ++have_cache;
399
468
reportMessage("Caching is disabled for all %i devices", i + 1);
471
ctx->configured_ncp = ctx->ncp;
476
int dictSetActualNumberOfPoints(DICTContext ctx, int ncp) {
479
if (ncp > ctx->configured_ncp) {
480
reportMessage("DICT Context is configured for maximum %i control points, but %i supplied", ctx->configured_ncp, ncp);
481
return DICT_ERROR_PARAM;
484
int ncp_alloc = calc_alloc(ncp, CP_BLOCK);
489
if (ctx->use_threads) {
490
ncp_blocks = ncp_alloc / CP_BLOCK;
491
blocks_alloc = calc_alloc(ncp_blocks, device_number);
492
per_device = (blocks_alloc / device_number) * CP_BLOCK;
494
per_device = ncp_alloc;
498
ctx->ncp_alloc_size = ncp_alloc;
499
ctx->ncp_per_device = per_device;
501
for (icp = 0; icp < ncp; icp += per_device) {
502
i = icp / per_device;
503
ProcessingState ps = ctx->pstates[i];
505
ps->ncp = min2(per_device, ncp - icp);
506
ps->points = ctx->points + icp;
507
ps->points_alloc_size = ncp_alloc;
514
int dictClean(DICTContext ctx) {
515
int icp, ncp = ctx->ncp;
516
int per_device = ctx->ncp_per_device;
518
for (icp = 0; icp < ncp; icp += per_device) {
519
int i = icp / per_device;
520
ProcessingState ps = ctx->pstates[i];
521
memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));
418
540
int dictSetDimensions(DICTContext ctx, int width, int height) {
421
int icp, ncp = ctx->ncp;
422
int per_device = ctx->ncp_per_device;
424
545
if ((width != ctx->width)||(height != ctx->height)) {
425
546
int image_size = width * height;
428
549
ctx->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));
429
550
if (!ctx->base_image) {
430
551
reportError("Memory allocation of %u*%u*uint8_t bytes for DICT/base_image is failed", width, height);
432
552
return DICT_ERROR_MALLOC;
435
555
ctx->width = width;
436
556
ctx->height = height;
438
for (icp = 0; icp < ncp; icp += per_device) {
439
int i = icp / per_device;
558
for (i = 0; i < device_number; i++) {
440
559
ProcessingState ps = ctx->pstates[i];
442
561
ps->width = width;
447
566
if (err) return err;
453
570
#ifdef DICT_SUPPORT_THREADS
454
if (ctx->use_threads) {
455
hw_sched_schedule_task(ctx->sched, ctx, dictSetDimensionsThread);
456
hw_sched_wait_task(ctx->sched);
571
if (ctx->use_threads) {
572
hw_sched_schedule_task(ctx->sched, ctx, dictSetDimensionsThread);
573
hw_sched_wait_task(ctx->sched);
575
for (i = 0; i < device_number; i++) {
576
ProcessingState ps = ctx->pstates[i];
460
583
#endif /* DICT_SUPPORT_THREADS */
664
788
gettimeofday(&tv1, NULL);
665
789
#endif /* DICT_HW_MEASURE_TIMINGS */
667
dictSetDimensions(ctx, width, height);
791
err = dictSetDimensions(ctx, width, height);
669
794
if (ctx->use_threads) {
670
795
#ifdef DICT_SUPPORT_THREADS