/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
1
#include "normxcorr_hw.h"
1 by Suren A. Chilingaryan
Initial import
2
#include "normxcorr_hw_msg.h"
19 by Suren A. Chilingaryan
Provide stand-alone library
3
#include "normxcorr_hw_kernel.cu.h"
4
1 by Suren A. Chilingaryan
Initial import
5
6
static void fftFree(TProcessingState *ps) {
10 by Suren A. Chilingaryan
Block computational kernels
7
    if (ps->banlist) free(ps->banlist);
8
    if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);
9
	
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
10
    if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);
11
    if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);
10 by Suren A. Chilingaryan
Block computational kernels
12
    if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);
1 by Suren A. Chilingaryan
Initial import
13
    
10 by Suren A. Chilingaryan
Block computational kernels
14
    if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);
15
    if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);
16
	
17
    if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);
18
    if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);
19
    if (ps->input_buffer) cudaFreeHost(ps->input_buffer);
20
	
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
21
    if (ps->cuda_points) cudaFree(ps->cuda_points);
10 by Suren A. Chilingaryan
Block computational kernels
22
    if (ps->points) cudaFreeHost(ps->points);
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
23
24
    if (ps->cudpp_initialized) {
25
	cudppDestroyPlan(ps->cudpp_plan);
26
    }
27
28
    if (ps->fft_initialized) {
29
	cufftDestroy(ps->cufft_r2c_plan);
30
	cufftDestroy(ps->cufft_c2r_plan);
31
    }
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
32
    
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
33
    if (ps->image_buf) {
34
	dictImageFree(ps);
35
    }
25 by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups
36
37
#ifdef DICT_HW_MEASURE_TIMINGS
38
    memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));
39
#else  /* DICT_HW_MEASURE_TIMINGS */
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
40
    memset(ps, 0, sizeof(TProcessingState));
25 by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups
41
#endif /* DICT_HW_MEASURE_TIMINGS */
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
42
1 by Suren A. Chilingaryan
Initial import
43
}
44
3 by Suren A. Chilingaryan
Add error checking to initialization process
45
static int fftInit(TProcessingState *ps) {
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
46
    CUDPPConfiguration cudpp_config;
47
    
48
    CUDPPResult cudpp_err;
3 by Suren A. Chilingaryan
Add error checking to initialization process
49
    cufftResult cufft_err;
50
    cudaError cuda_err;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
51
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
52
    int size;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
53
    int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
54
    int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;
3 by Suren A. Chilingaryan
Add error checking to initialization process
55
    
1 by Suren A. Chilingaryan
Initial import
56
16 by Suren A. Chilingaryan
Optimize FFT size
57
    cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);
3 by Suren A. Chilingaryan
Add error checking to initialization process
58
    if (cufft_err) {
59
	reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);
19 by Suren A. Chilingaryan
Provide stand-alone library
60
	return DICT_ERROR_CUFFT;
3 by Suren A. Chilingaryan
Add error checking to initialization process
61
    }	
62
    
16 by Suren A. Chilingaryan
Optimize FFT size
63
    cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_C2R);
3 by Suren A. Chilingaryan
Add error checking to initialization process
64
    if (cufft_err) {
65
	reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);
66
	cufftDestroy(ps->cufft_r2c_plan);
19 by Suren A. Chilingaryan
Provide stand-alone library
67
	return DICT_ERROR_CUFFT;
3 by Suren A. Chilingaryan
Add error checking to initialization process
68
    }
1 by Suren A. Chilingaryan
Initial import
69
70
    ps->fft_initialized = true;
71
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
72
    cudpp_config.algorithm = CUDPP_SCAN;
73
    cudpp_config.options = CUDPP_OPTION_FORWARD |  CUDPP_OPTION_INCLUSIVE;
74
    cudpp_config.op = CUDPP_ADD;
75
    cudpp_config.datatype = CUDPP_FLOAT;
76
77
    cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
78
    if (cudpp_err != CUDPP_SUCCESS) {
79
	reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
80
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
81
	return DICT_ERROR_CUDPP;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
82
    }
83
    
84
    ps->cudpp_initialized = true;
85
10 by Suren A. Chilingaryan
Block computational kernels
86
    cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));
3 by Suren A. Chilingaryan
Add error checking to initialization process
87
    if (cuda_err) {
10 by Suren A. Chilingaryan
Block computational kernels
88
	reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
89
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
90
	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan
Add error checking to initialization process
91
    }
92
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
93
7 by Suren A. Chilingaryan
FindPeak optimization
94
    size = max3(
10 by Suren A. Chilingaryan
Block computational kernels
95
	(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex),		/* FFT multiplication */
7 by Suren A. Chilingaryan
FindPeak optimization
96
	2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t),			/* Sum, Std computations */
12 by Suren A. Chilingaryan
Reduce memory allocation
97
	CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float))	/* Max of correlation */
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
98
    );
99
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
100
    cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
101
    if (cuda_err) {
12 by Suren A. Chilingaryan
Reduce memory allocation
102
	reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
103
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
104
	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan
Add error checking to initialization process
105
    }
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
106
    
107
    ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));
108
    if (!ps->banlist) {
12 by Suren A. Chilingaryan
Reduce memory allocation
109
	reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
110
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
111
	return DICT_ERROR_MALLOC;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
112
    }
113
    memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));
114
    
115
    cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
116
    if (cuda_err) {
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
117
	reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
118
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
119
	return DICT_ERROR_CUDA_MALLOC;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
120
    }
121
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
122
    cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
123
    if (cuda_err) {
12 by Suren A. Chilingaryan
Reduce memory allocation
124
	reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);
125
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
126
	return DICT_ERROR_CUDA_MALLOC;
12 by Suren A. Chilingaryan
Reduce memory allocation
127
    }
128
129
    cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));
130
    if (cuda_err) {
131
	reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);
132
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
133
	return DICT_ERROR_CUDA_MALLOC;
12 by Suren A. Chilingaryan
Reduce memory allocation
134
    }
135
136
    cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
137
    if (cuda_err) {
138
	reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
139
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
140
	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan
Add error checking to initialization process
141
    }
23 by Suren A. Chilingaryan
Print timings using reportMessage calls
142
	
143
	// DS: We don't actually need that to be CP_BLOCK, just unblock computations in loadbase and set to single
15 by Suren A. Chilingaryan
First attempt with CUDA streams
144
    cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
145
    if (cuda_err) {
146
	reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);
147
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
148
	return DICT_ERROR_CUDA_MALLOC;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
149
    }
15 by Suren A. Chilingaryan
First attempt with CUDA streams
150
    cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
151
10 by Suren A. Chilingaryan
Block computational kernels
152
    cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
153
    if (cuda_err) {
10 by Suren A. Chilingaryan
Block computational kernels
154
	reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
155
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
156
	return DICT_ERROR_CUDA_MALLOC;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
157
    }
10 by Suren A. Chilingaryan
Block computational kernels
158
    cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
1 by Suren A. Chilingaryan
Initial import
159
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
160
    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
3 by Suren A. Chilingaryan
Add error checking to initialization process
161
    if (cuda_err) {
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
162
	reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
163
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
164
	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan
Add error checking to initialization process
165
    }
166
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
167
    cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
3 by Suren A. Chilingaryan
Add error checking to initialization process
168
    if (cuda_err) {
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
169
	reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan
Add error checking to initialization process
170
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
171
	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan
Add error checking to initialization process
172
    }
173
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
174
    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2  * sizeof(float));
175
    if (cuda_err) {
176
	reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);
177
	fftFree(ps);
19 by Suren A. Chilingaryan
Provide stand-alone library
178
	return DICT_ERROR_MALLOC;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
179
    }
180
	// We need to zero temporary buffers as well, since we are not computing
181
	// cumsum of complete matrix, but non-zero part of it
182
    cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));
19 by Suren A. Chilingaryan
Provide stand-alone library
183
        
3 by Suren A. Chilingaryan
Add error checking to initialization process
184
    return 0;
1 by Suren A. Chilingaryan
Initial import
185
}
186
187
188
void pstateFree(TProcessingState *ps) {
189
    if (ps) {
190
	fftFree(ps);
191
	free(ps);
192
    }
193
}
194
195
TProcessingState *pstateInit() {
196
    TProcessingState *ps;
197
    
198
    ps = (TProcessingState*)malloc(sizeof(TProcessingState));
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
199
    if (ps) memset(ps, 0, sizeof(TProcessingState));
19 by Suren A. Chilingaryan
Provide stand-alone library
200
1 by Suren A. Chilingaryan
Initial import
201
    return ps;
202
}
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
203
19 by Suren A. Chilingaryan
Provide stand-alone library
204
static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
205
    int width = ps->width;
206
    int height = ps->height;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
207
208
    int check_mode = ((ps->base_mode)&&(!ps->mode));
209
    float minx, miny, maxx, maxy;
210
211
    int precision = ps->precision;
212
213
    int half_size = 2 * ps->corr_size;
214
    int size = 2 * half_size + 1;
215
16 by Suren A. Chilingaryan
Optimize FFT size
216
    int fft_real_size = ps->fft_real_size;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
217
    
218
    int ncp_alloc = ps->ncp_alloc_size;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
219
    int alloc_size = ps->fft_alloc_size;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
220
    int side_alloc = ps->side_alloc_size;
221
    int side_alloc2 = side_alloc * side_alloc;
222
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
223
    uint8_t *banlist = ps->banlist + icp;
224
    
225
    float *data_x = ps->points + icp;
226
    float *data_y = data_x + ncp_alloc;
227
228
    float *frac_x = ps->points + 4 * ncp_alloc + icp;
229
    float *frac_y = frac_x + ncp_alloc;
230
231
    uint8_t *img = ps->input_buffer;
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
232
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
233
    float *lsum_temp = (float*)ps->cuda_lsum_temp;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
234
    int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
235
236
    if (check_mode) {
237
	minx = ps->minx;
238
	maxx = ps->maxx;
239
	miny = ps->miny;
240
	maxy = ps->maxy;
241
    }
242
    
12 by Suren A. Chilingaryan
Reduce memory allocation
243
    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
244
    cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
245
    cufftComplex *cache = ps->cuda_fft_cache +  icp * alloc_size;
246
    float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
247
    float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;
248
13 by Suren A. Chilingaryan
Eleminate last kernel with NxN geometr
249
    int blocks = calc_blocks(size, BLOCK_SIZE_1D);
250
    int base_blocks = blocks * blocks * BLOCK_SIZE_1D;
251
    
252
    int lsum_size = ps->lsum_size;
253
    int lsum_alloc = ps->lsum_alloc_size;
254
15 by Suren A. Chilingaryan
First attempt with CUDA streams
255
    cudaStream_t stream[2];
256
    for (int i = 0; i < 2; ++i) {
257
	cudaStreamCreate(&stream[i]);
258
    }
259
260
    for (int i = 0;i <= ncp;i++) {
261
      if (i < ncp) {
262
	float x = data_x[i] - 1;
263
	float y = data_y[i] - 1;
264
265
	frac_x[i] = x - round(x * precision) / precision;
266
	frac_y[i] = y - round(y * precision) / precision;
267
    
268
	int xstart = roundf(x) - half_size;
269
	int ystart = roundf(y) - half_size;
270
    
271
	int xend = xstart + size;
272
	int yend = xstart + size;
273
274
	if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
275
	    continue;
276
	}
277
	
278
	if (check_mode) {
279
	    if (xstart < minx) minx = xstart;
280
	    if (ystart < miny) miny = ystart;
281
	    if (xend > maxx) maxx = xend;
282
	    if (yend > maxy) maxy = yend;
283
	}
284
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
285
	if (ps->matlab_mode) {
286
	    cudaMemcpy2D(
287
		img + i * alloc_size,
288
		size * sizeof(uint8_t),
289
		fullimg + (xstart * height + ystart),
290
		height * sizeof(uint8_t),
291
		size * sizeof(uint8_t),
292
		size,
293
		cudaMemcpyHostToHost
294
	    );
295
	} else {
296
	    cudaMemcpy2D(
297
		img + i * alloc_size,
298
	        size * sizeof(uint8_t),
299
	        fullimg + (ystart * width + xstart),
300
		width * sizeof(uint8_t),
301
	        size * sizeof(uint8_t),
302
		size,
303
	        cudaMemcpyHostToHost
304
	    );
305
	}
15 by Suren A. Chilingaryan
First attempt with CUDA streams
306
	
307
	cudaMemcpy2DAsync(
308
	    cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
309
	    img + i * alloc_size, size * sizeof(uint8_t),
310
	    size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,
311
	    stream[i%2]
312
	);
313
314
	banlist[i] = 0;
315
      }
316
      if (i > 0) {
317
        int j = i - 1;
12 by Suren A. Chilingaryan
Reduce memory allocation
318
	
17 by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2
319
	if (ps->base_blocks_power < 0) {
15 by Suren A. Chilingaryan
First attempt with CUDA streams
320
	    vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
321
		cuda_input_buffer + j * side_alloc2, side_alloc, 
16 by Suren A. Chilingaryan
Optimize FFT size
322
	        cuda_base_buffer + j*alloc_size, fft_real_size, 
13 by Suren A. Chilingaryan
Eleminate last kernel with NxN geometr
323
		lsum_temp + lsum_size * (lsum_alloc + 1), 
324
	        lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1), 
325
		lsum_alloc,
326
	        size, blocks
327
	    );
328
	} else {
15 by Suren A. Chilingaryan
First attempt with CUDA streams
329
	    vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(
330
		cuda_input_buffer + j * side_alloc2, side_alloc, 
16 by Suren A. Chilingaryan
Optimize FFT size
331
	        cuda_base_buffer + j*alloc_size, fft_real_size, 
13 by Suren A. Chilingaryan
Eleminate last kernel with NxN geometr
332
		lsum_temp + lsum_size * (lsum_alloc + 1), 
333
	        lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1), 
334
		lsum_alloc,
17 by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2
335
	        size, ps->base_blocks_power
13 by Suren A. Chilingaryan
Eleminate last kernel with NxN geometr
336
	    );
337
	}
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
338
5 by Suren A. Chilingaryan
Removal of non-zero comments
339
	// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
340
	local_sum(ps, 
15 by Suren A. Chilingaryan
First attempt with CUDA streams
341
	    lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
342
	    lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
15 by Suren A. Chilingaryan
First attempt with CUDA streams
343
	    lsum_temp, lsum_temp + lsum_step,
344
	    stream[j%2]);
345
346
//	cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);
347
      }
348
    }
349
350
    for (int j = 0;j < ncp;j++) {
351
	cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
352
    }
353
354
    for (int i = 0; i < 2; ++i) {
355
	cudaStreamDestroy(stream[i]);
356
    }
357
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
358
    if (check_mode) {
359
	ps->minx = minx;
360
	ps->maxx = maxx;
361
	ps->miny = miny;
362
	ps->maxy = maxy;
363
    }
364
    
365
366
    return 0;
367
}
368
19 by Suren A. Chilingaryan
Provide stand-alone library
369
370
static inline int fftCopyFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
371
    int width = ps->width;
372
    int height = ps->height;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
373
374
    int half_size = ps->corr_size;
375
    int size = 2 * half_size + 1;
376
    int size2 = size * size;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
377
    int ncp_alloc = ps->ncp_alloc_size;
19 by Suren A. Chilingaryan
Provide stand-alone library
378
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
379
    float *data_x, *data_y;
380
    if (ps->stored) {
19 by Suren A. Chilingaryan
Provide stand-alone library
381
	data_x = ps->res_x + icp;
382
	data_y = ps->res_y + icp;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
383
    } else {
384
	data_x = ps->points + 2 * ncp_alloc + icp;
385
	data_y = data_x + ncp_alloc;
386
    }
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
387
388
    uint8_t *img = ps->input_buffer;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
389
    uint8_t *banlist = ps->banlist + icp;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
390
391
    for (int i = 0;i < ncp;i++) {
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
392
	float x = data_x[i] - 1;
393
	float y = data_y[i] - 1;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
394
    
395
	int xstart = roundf(x) - half_size;
396
	int ystart = roundf(y) - half_size;
397
    
398
	int xend = xstart + size;
399
	int yend = xstart + size;
400
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
401
	if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
402
	    banlist[i] = 1;
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
403
	    continue;
404
	}
405
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
406
	if (ps->matlab_mode) {
407
	    cudaMemcpy2D(
408
		img + i * size2,//alloc_size,
409
		size * sizeof(uint8_t),
410
	    	fullimg + (xstart * height + ystart),
411
	    	height * sizeof(uint8_t),
412
		size * sizeof(uint8_t),
413
		size,
414
		cudaMemcpyHostToHost
415
	    );
416
	} else {
417
	    cudaMemcpy2D(
418
		img + i * size2,//alloc_size,
419
		size * sizeof(uint8_t),
420
		fullimg + (ystart * width + xstart),
421
		width * sizeof(uint8_t),
422
		size * sizeof(uint8_t),
423
		size,
424
		cudaMemcpyHostToHost
425
	    );
426
	}
19 by Suren A. Chilingaryan
Provide stand-alone library
427
    }
428
    return 0;
429
}
430
431
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {
432
    int half_size = ps->corr_size;
433
    int size = 2 * half_size + 1;
434
435
    int side_alloc = ps->side_alloc_size;
436
437
    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
438
    uint8_t *img = ps->input_buffer;
439
14 by Suren A. Chilingaryan
memcpy3D
440
/*
19 by Suren A. Chilingaryan
Provide stand-alone library
441
    for (int i = 0;i < ncp;i++) {
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
442
	cudaMemcpy2D(
10 by Suren A. Chilingaryan
Block computational kernels
443
	    cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
14 by Suren A. Chilingaryan
memcpy3D
444
	    img + i * size2, size * sizeof(uint8_t),
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
445
	    size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
446
	);
19 by Suren A. Chilingaryan
Provide stand-alone library
447
    }
14 by Suren A. Chilingaryan
memcpy3D
448
*/
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
449
14 by Suren A. Chilingaryan
memcpy3D
450
    cudaMemcpy3DParms copy_params = { 0 };
19 by Suren A. Chilingaryan
Provide stand-alone library
451
14 by Suren A. Chilingaryan
memcpy3D
452
    copy_params.dstPtr   = make_cudaPitchedPtr(
453
	cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
454
    );
455
    copy_params.srcPtr   = make_cudaPitchedPtr(
456
	img, size * sizeof(uint8_t), size, size
457
    );
458
    copy_params.extent   = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
459
    copy_params.kind     = cudaMemcpyHostToDevice;
19 by Suren A. Chilingaryan
Provide stand-alone library
460
461
    cudaMemcpy3DAsync(&copy_params, stream0);
462
    
463
    return 0;
464
}
465
466
static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
467
static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
468
469
static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
470
    int half_size = ps->corr_size;
471
    int size = 2 * half_size + 1;
472
473
    int fft_real_size = ps->fft_real_size;
474
    
475
    int ncp_alloc = ps->ncp_alloc_size;
476
    int alloc_size = ps->fft_alloc_size;
477
    int side_alloc = ps->side_alloc_size;
478
    int side_alloc2 = side_alloc * side_alloc;
479
480
    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
481
    float *cuda_data_buffer = ps->cuda_data_buffer;
10 by Suren A. Chilingaryan
Block computational kernels
482
483
    int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
484
    int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
485
    int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
16 by Suren A. Chilingaryan
Optimize FFT size
486
    int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
19 by Suren A. Chilingaryan
Provide stand-alone library
487
488
    float *sumbuf = ps->cuda_points + icp;
489
    float *stdbuf = ps->cuda_points + ncp_alloc + icp;
490
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
491
    int32_t *stat_buf = (int*)ps->cuda_temp_buffer;
7 by Suren A. Chilingaryan
FindPeak optimization
492
10 by Suren A. Chilingaryan
Block computational kernels
493
    dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
19 by Suren A. Chilingaryan
Provide stand-alone library
494
    stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);
495
    stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
7 by Suren A. Chilingaryan
FindPeak optimization
496
10 by Suren A. Chilingaryan
Block computational kernels
497
	// Packing input data for FFT
498
    dim3 input_grid_dim(input_blocks, cp_blocks, 1);
499
17 by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2
500
    if (ps->side_blocks_power < 0) {
19 by Suren A. Chilingaryan
Provide stand-alone library
501
        vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(
10 by Suren A. Chilingaryan
Block computational kernels
502
	    cuda_input_buffer, side_alloc2, side_alloc, 
16 by Suren A. Chilingaryan
Optimize FFT size
503
	    cuda_data_buffer, alloc_size, fft_real_size, 
10 by Suren A. Chilingaryan
Block computational kernels
504
	    size, side_blocks
505
	);
506
    } else {
19 by Suren A. Chilingaryan
Provide stand-alone library
507
        vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(
10 by Suren A. Chilingaryan
Block computational kernels
508
	    cuda_input_buffer, side_alloc2, side_alloc, 
16 by Suren A. Chilingaryan
Optimize FFT size
509
	    cuda_data_buffer, alloc_size, fft_real_size, 
17 by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2
510
	    size, ps->side_blocks_power
10 by Suren A. Chilingaryan
Block computational kernels
511
	);
512
    }
19 by Suren A. Chilingaryan
Provide stand-alone library
513
    
514
    return 0;
515
}
516
517
static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
518
    int half_size = ps->corr_size;
519
    int size = 2 * half_size + 1;
520
    int size2 = size * size;
521
522
    int fft_size = ps->fft_size;
523
    int fft_real_size = ps->fft_real_size;
524
    
525
    int ncp_alloc = ps->ncp_alloc_size;
526
    int alloc_size = ps->fft_alloc_size;
527
    int side_alloc = ps->side_alloc_size;
528
529
    int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
530
    int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
531
    int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
532
10 by Suren A. Chilingaryan
Block computational kernels
533
    cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
534
    float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;
19 by Suren A. Chilingaryan
Provide stand-alone library
535
536
    float *sumbuf = ps->cuda_points + icp;
537
    float *stdbuf = ps->cuda_points + ncp_alloc + icp;
10 by Suren A. Chilingaryan
Block computational kernels
538
    
16 by Suren A. Chilingaryan
Optimize FFT size
539
//    Use real size everthere
540
//    int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);
19 by Suren A. Chilingaryan
Provide stand-alone library
541
//    vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(
16 by Suren A. Chilingaryan
Optimize FFT size
542
//	cuda_final_buffer,
543
//	cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),
544
//	ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
545
//	ps->cuda_denom_cache + icp*alloc_size, stdbuf,
546
//	alloc_size
547
//    );
548
19 by Suren A. Chilingaryan
Provide stand-alone library
549
16 by Suren A. Chilingaryan
Optimize FFT size
550
    int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;
10 by Suren A. Chilingaryan
Block computational kernels
551
    dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);
19 by Suren A. Chilingaryan
Provide stand-alone library
552
553
    vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(
16 by Suren A. Chilingaryan
Optimize FFT size
554
	cuda_final_buffer, fft_size,
555
	cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
556
	ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),
557
	ps->cuda_denom_cache + icp*alloc_size, stdbuf,
16 by Suren A. Chilingaryan
Optimize FFT size
558
	alloc_size, fft_blocks
10 by Suren A. Chilingaryan
Block computational kernels
559
    );
19 by Suren A. Chilingaryan
Provide stand-alone library
560
	
10 by Suren A. Chilingaryan
Block computational kernels
561
562
	// Looking for maximum
7 by Suren A. Chilingaryan
FindPeak optimization
563
    float *xbuf = sumbuf;
564
    float *ybuf = stdbuf;
565
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
566
    int32_t *posbuf = (int*)ps->cuda_temp_buffer;
7 by Suren A. Chilingaryan
FindPeak optimization
567
    float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);
568
569
    dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
16 by Suren A. Chilingaryan
Optimize FFT size
570
571
//    Use real size everthere
572
//    find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);
573
//    find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1,  ps->corr_size - 1);
19 by Suren A. Chilingaryan
Provide stand-alone library
574
575
    find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
576
    find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1,  ps->corr_size - 1);
577
    
578
    return 0;
579
}
580
581
static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
582
    int fft_real_size = ps->fft_real_size;
583
584
    int alloc_size = ps->fft_alloc_size;
585
586
    uint8_t *banlist = ps->banlist + icp;
587
    float *cuda_data_buffer = ps->cuda_data_buffer;
588
589
    int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
590
591
	// Performing FFT's
592
    cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;
593
594
    cufftSetStream(ps->cufft_r2c_plan, stream0);
595
    cufftSetStream(ps->cufft_c2r_plan, stream0);
596
    
597
    for (int i = 0;i < ncp;i++) {
598
	if (banlist[i]) continue;
599
	cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
600
    }
601
602
    int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
603
    dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
604
    vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);
605
606
        // First in-place transform for some reason is failing, therefore we
607
	// have one alloc_size spacing between starts (see cuda_fft_buffer set above)
608
    cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
609
    for (int i = 0;i < ncp;i++) {
610
	if (banlist[i]) continue;
611
	cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size,  cuda_result_buffer + i * alloc_size);
612
    }
613
614
    return 0;
615
}
616
617
618
static inline int fftGetCurrentPoints(DICTContext ps) {
619
    int ncp = ps->ncp;
620
    int ncp_alloc = ps->ncp_alloc_size;
621
    int precision = ps->precision;
622
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
623
    float *move_x, *move_y;
624
625
	// We do not do a completely correct thing in non-matlab mode, the data
626
	// is copied from image buffer non-transposed as it should be, but 
627
	// the processing code is supports only matlab-mode and handles it as
628
	// standard transposed data. Therefore, here we turning back the
629
	// X and Y coords. But this adds some extra precision penalty.
630
	// Therefore, it is better to use matlab mode until the computation 
631
	// code is changed (this implementation is just done to accept 
632
	// images from user apps without transposing)
633
    if (ps->matlab_mode) {
634
	move_x = ps->points + 6 * ncp_alloc;
635
	move_y = move_x + ncp_alloc;
636
637
	cudaMemcpy2D(
638
    	    move_x, ncp_alloc * sizeof(float),
639
	    ps->cuda_points, ncp_alloc * sizeof(float),
640
	    ps->ncp * sizeof(float), 2,
641
    	    cudaMemcpyDeviceToHost
642
	);
643
    } else {
644
	move_y = ps->points + 6 * ncp_alloc;
21 by Suren A. Chilingaryan
Collection of timing information and fix for a crash in non-matlab mode
645
	move_x = move_y + ncp_alloc;
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
646
647
	cudaMemcpy2D(
648
    	    move_y, ncp_alloc * sizeof(float),
649
	    ps->cuda_points, ncp_alloc * sizeof(float),
650
	    ps->ncp * sizeof(float), 2,
651
    	    cudaMemcpyDeviceToHost
652
	);
653
    }
19 by Suren A. Chilingaryan
Provide stand-alone library
654
655
    float *data_x, *data_y;
656
    if (ps->stored) {
657
        data_x = ps->res_x;
658
        data_y = ps->res_y;
659
    } else {
660
        data_x = ps->points + 2 * ncp_alloc;
661
        data_y = data_x + ncp_alloc;
662
    }
663
664
    float *res_x, *res_y;
665
    if ((ps->res_x)&&(ps->res_y)) {
666
	res_x = ps->res_x;
667
	res_y = ps->res_y;
668
	
669
	ps->stored = 1;
670
    } else {
671
	res_x = data_x;
672
	res_y = data_y;
673
    }
674
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
675
    float frac;
19 by Suren A. Chilingaryan
Provide stand-alone library
676
    float *frac_x = ps->points + 4 * ncp_alloc;
677
    float *frac_y = frac_x + ncp_alloc;
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
678
    uint8_t *banlist = ps->banlist;
679
680
    for (int i = 0;i < ncp;i++) {
19 by Suren A. Chilingaryan
Provide stand-alone library
681
        if (banlist[i]) {
682
            res_x[i] = data_x[i];
683
            res_y[i] = data_y[i];
684
            continue;
685
        }
686
687
        frac = data_x[i] - round(data_x[i]*precision)/precision;
688
        res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);
689
690
        frac = data_y[i] - round(data_y[i]*precision)/precision;
691
        res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
692
    }
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
693
694
    return 0;
695
}