9
#include "local_sum.cu.h"
11
#include "normxcorr_hw.h"
12
#include "normxcorr_hw.cu.h"
14
#define MAX_DEVICES 16
16
static int device_number = 0;
17
static int devices[MAX_DEVICES];
19
int dictDetectHardware() {
21
cudaDeviceProp deviceProp;
23
cudaGetDeviceCount(&deviceCount);
24
if (!deviceCount) return -1;
26
for (int i = 0; i < deviceCount; i++) {
27
cudaGetDeviceProperties(&deviceProp, i);
28
if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {
29
devices[device_number++] = i;
36
DICTContext dictCreateContext() {
38
if (dictDetectHardware() <= 0) return NULL;
41
TProcessingState *pstate = pstateInit();
45
void dictDestroyContext(DICTContext ctx) {
49
int dictSetLogger(DICTLogger error_reporter, DICTLogger message_writer) {
50
reportError = error_reporter;
51
reportMessage = message_writer;
55
int dictSetup(DICTContext ps, int ncp, int corr_size, int precision, DICTFlags flags) {
56
int base_blocks, side_blocks;
61
ps->corr_size = corr_size;
62
ps->precision = precision;
64
ps->subimage_size = ps->corr_size * 4 + 1;
65
ps->fft_size = 6 * corr_size + 1;
67
if (flags&DICT_FLAGS_FIXED_FFT_SIZE) {
68
ps->fft_real_size = ps->fft_size;
70
ps->fft_real_size = next_power(ps->fft_size);
73
ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
74
ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
76
ps->fft_alloc_size = calc_alloc(ps->fft_real_size * ps->fft_real_size, BLOCK_SIZE_1D);
78
ps->lsum_size = ps->corr_size * 2 + 1;
79
ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
81
ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
82
ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
83
ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);
85
side_blocks = calc_blocks(2 * ps->corr_size + 1, SIDE_BLOCK_SIZE);
86
ps->side_blocks_power = get_power(side_blocks);
88
base_blocks = calc_blocks(4 * ps->corr_size + 1, BLOCK_SIZE_1D);
89
ps->base_blocks_power = get_power(base_blocks);
94
int dictSetTemplatePoints(DICTContext ps, const float *points_x, const float *points_y) {
95
memcpy(ps->points, points_x, ps->ncp * sizeof(float));
96
memcpy(ps->points + ps->ncp_alloc_size, points_y, ps->ncp * sizeof(float));
101
int dictSetDimensions(DICTContext ps, int width, int height) {
108
int dictSetPointsBuffer(DICTContext ps, float *point_x, float *point_y) {
115
int dictSetCurrentPoints(DICTContext ps, const float *points_x, const float *points_y) {
116
memcpy(ps->points + 2 * ps->ncp_alloc_size, points_x, ps->ncp * sizeof(float));
117
memcpy(ps->points + 3 * ps->ncp_alloc_size, points_y, ps->ncp * sizeof(float));
124
int dictCompute(DICTContext ps) {
125
return fftGetCurrentPoints(ps);
128
int dictGetCurrentPoints(DICTContext ps, float *res_x, float *res_y) {
131
err = fftGetCurrentPoints(ps);
134
if ((res_x)&&(res_x != ps->res_x)) {
136
if (ps->stored) data_x = ps->res_x;
137
else data_x = ps->points + 2 * ps->ncp_alloc_size;
139
memcpy(res_x, data_x, ps->ncp * sizeof(float));
142
if ((res_y)&&(res_y != ps->res_y)) {
144
if (ps->stored) data_y = ps->res_y;
145
else data_y = ps->points + 3 * ps->ncp_alloc_size;
147
memcpy(res_y, data_y, ps->ncp * sizeof(float));
154
int dictLoadTemplateFragment(DICTContext ps, int icp, int ncp, const unsigned char *img) {
155
return fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
158
int dictLoadTemplateImage(DICTContext ps, const unsigned char *img, int width, int height) {
164
int size = 2 * ps->corr_size + 1;
165
int size2 = size * size;
167
int base_size = 4 * ps->corr_size + 1;
168
int base_size2 = base_size * base_size;
170
if (width * height > ps->ncp * size2) {
176
// if not enoguh space for caching enable anyway ?
177
if (width * height > ps->ncp * base_size2) {
183
ps->maxx = width - 1;
185
ps->maxy = height - 1;
189
for (int icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
190
err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
194
if ((ps->base_mode)&&(!ps->mode)) {
195
// printf("%ux%u\n", width, height);
197
// Correcting difference of area size between base and data images
198
ps->minx += ps->corr_size;
199
ps->miny += ps->corr_size;
200
ps->maxx -= ps->corr_size;
201
ps->maxy -= ps->corr_size;
203
width = ceil(ps->maxx) - floor(ps->minx);
204
height = ceil(ps->maxy) - floor(ps->miny);
206
// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);
207
if (width * height < ps->ncp * size2) {
213
reportMessage("Running in the image mode");
215
reportMessage("Running in the fragment mode");
222
int dictLoadFragment(DICTContext ps, int icp, int ncp, const unsigned char *input) {
224
cudaStream_t stream = NULL;
227
err = fftCopyFragment(ps, icp, ncp, input);
230
err = fftLoadFragment(ps, icp, ncp, input, stream);
233
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
236
err = fftProcessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
239
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
245
int dictLoadImage(DICTContext ps, unsigned char *img) {
250
#ifdef DICT_HW_MEASURE_TIMINGS
252
struct timeval tv1, tv2;
253
gettimeofday(&tv1, NULL);
254
gettimeofday(&tv2, NULL);
255
time[0] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
256
printf("Pre: %li, Nope: %li, Comp: %li, Post: %li, Copy/Load: %li, -: %li\n", time[0], time[1], time[2], time[3], time[4], time[5]);
258
cudaStream_t stream[2];
259
for (int i = 0; i < 2; ++i) {
260
cudaStreamCreate(&stream[i]);
262
for (int i = 0; i < 2; ++i) {
263
cudaStreamDestroy(stream[i]);
268
for (int icp = 0; icp < ncp; icp+=CP_BLOCK) {
269
err = dictLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
274
err = fftCopyFragment(ps, icp, ncp, input);
277
err = fftLoadFragment(ps, icp, ncp, input, stream);
280
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
283
err = fftProcessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
286
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream);
293
err = fftCopyFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
294
err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
295
for (int i = 0; icp < ps->ncp; icp+=CP_BLOCK,i++) {
296
err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
297
err = fftComputeFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
298
err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
299
if (next_icp < ps->ncp) {
300
err = fftCopyFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
301
err = fftLoadFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
310
int dictProcessImage(DICTContext ps, unsigned char *img) {
311
int err = dictLoadImage(ps, img);
313
return dictCompute(ps);