/normxcorr/trunk : contents of dict_hw/src/normxcorr

: (revision 25)

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	1	#include "normxcorr_hw.h"
1 by Suren A. Chilingaryan Initial import	2	#include "normxcorr_hw_msg.h"
19 by Suren A. Chilingaryan Provide stand-alone library	3	#include "normxcorr_hw_kernel.cu.h"
19 by Suren A. Chilingaryan Provide stand-alone library	4
1 by Suren A. Chilingaryan Initial import	5
1 by Suren A. Chilingaryan Initial import	6	static void fftFree(TProcessingState *ps) {
10 by Suren A. Chilingaryan Block computational kernels	7	if (ps->banlist) free(ps->banlist);
	8	if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);
	9
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	10	if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);
	11	if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);
10 by Suren A. Chilingaryan Block computational kernels	12	if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);
1 by Suren A. Chilingaryan Initial import	13
10 by Suren A. Chilingaryan Block computational kernels	14	if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);
	15	if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);
	16
	17	if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);
	18	if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);
	19	if (ps->input_buffer) cudaFreeHost(ps->input_buffer);
	20
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	21	if (ps->cuda_points) cudaFree(ps->cuda_points);
10 by Suren A. Chilingaryan Block computational kernels	22	if (ps->points) cudaFreeHost(ps->points);
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	23
	24	if (ps->cudpp_initialized) {
	25	cudppDestroyPlan(ps->cudpp_plan);
	26	}
	27
	28	if (ps->fft_initialized) {
	29	cufftDestroy(ps->cufft_r2c_plan);
	30	cufftDestroy(ps->cufft_c2r_plan);
	31	}
8 by Suren A. Chilingaryan Complete elimination of cpcorr	32
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	33	if (ps->image_buf) {
	34	dictImageFree(ps);
	35	}
25 by Suren A. Chilingaryan Count hardware initialization time, cmake scripts fixups	36
	37	#ifdef DICT_HW_MEASURE_TIMINGS
	38	memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));
	39	#else /* DICT_HW_MEASURE_TIMINGS */
8 by Suren A. Chilingaryan Complete elimination of cpcorr	40	memset(ps, 0, sizeof(TProcessingState));
25 by Suren A. Chilingaryan Count hardware initialization time, cmake scripts fixups	41	#endif /* DICT_HW_MEASURE_TIMINGS */
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	42
1 by Suren A. Chilingaryan Initial import	43	}
1 by Suren A. Chilingaryan Initial import	44
3 by Suren A. Chilingaryan Add error checking to initialization process	45	static int fftInit(TProcessingState *ps) {
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	46	CUDPPConfiguration cudpp_config;
	47
	48	CUDPPResult cudpp_err;
3 by Suren A. Chilingaryan Add error checking to initialization process	49	cufftResult cufft_err;
	50	cudaError cuda_err;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	51
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	52	int size;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	53	int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	54	int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;
3 by Suren A. Chilingaryan Add error checking to initialization process	55
1 by Suren A. Chilingaryan Initial import	56
16 by Suren A. Chilingaryan Optimize FFT size	57	cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);
3 by Suren A. Chilingaryan Add error checking to initialization process	58	if (cufft_err) {
	59	reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);
19 by Suren A. Chilingaryan Provide stand-alone library	60	return DICT_ERROR_CUFFT;
3 by Suren A. Chilingaryan Add error checking to initialization process	61	}
	62
16 by Suren A. Chilingaryan Optimize FFT size	63	cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_C2R);
3 by Suren A. Chilingaryan Add error checking to initialization process	64	if (cufft_err) {
	65	reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);
	66	cufftDestroy(ps->cufft_r2c_plan);
19 by Suren A. Chilingaryan Provide stand-alone library	67	return DICT_ERROR_CUFFT;
3 by Suren A. Chilingaryan Add error checking to initialization process	68	}
1 by Suren A. Chilingaryan Initial import	69
	70	ps->fft_initialized = true;
	71
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	72	cudpp_config.algorithm = CUDPP_SCAN;
	73	cudpp_config.options = CUDPP_OPTION_FORWARD \| CUDPP_OPTION_INCLUSIVE;
	74	cudpp_config.op = CUDPP_ADD;
	75	cudpp_config.datatype = CUDPP_FLOAT;
	76
	77	cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
	78	if (cudpp_err != CUDPP_SUCCESS) {
	79	reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
	80	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	81	return DICT_ERROR_CUDPP;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	82	}
	83
	84	ps->cudpp_initialized = true;
	85
10 by Suren A. Chilingaryan Block computational kernels	86	cuda_err = cudaMalloc((void*)&ps->cuda_fft_cache, ps->ncp ps->fft_alloc_size * sizeof(cufftComplex));
3 by Suren A. Chilingaryan Add error checking to initialization process	87	if (cuda_err) {
10 by Suren A. Chilingaryan Block computational kernels	88	reportError("Device memory allocation of %u%ucufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan Add error checking to initialization process	89	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	90	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan Add error checking to initialization process	91	}
	92
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	93
7 by Suren A. Chilingaryan FindPeak optimization	94	size = max3(
10 by Suren A. Chilingaryan Block computational kernels	95	(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */
7 by Suren A. Chilingaryan FindPeak optimization	96	2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */
12 by Suren A. Chilingaryan Reduce memory allocation	97	CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	98	);
	99
8 by Suren A. Chilingaryan Complete elimination of cpcorr	100	cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);
3 by Suren A. Chilingaryan Add error checking to initialization process	101	if (cuda_err) {
12 by Suren A. Chilingaryan Reduce memory allocation	102	reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);
3 by Suren A. Chilingaryan Add error checking to initialization process	103	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	104	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan Add error checking to initialization process	105	}
8 by Suren A. Chilingaryan Complete elimination of cpcorr	106
	107	ps->banlist = (uint8_t)malloc(ps->ncp sizeof(uint8_t));
	108	if (!ps->banlist) {
12 by Suren A. Chilingaryan Reduce memory allocation	109	reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);
8 by Suren A. Chilingaryan Complete elimination of cpcorr	110	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	111	return DICT_ERROR_MALLOC;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	112	}
	113	memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));
	114
	115	cuda_err = cudaHostAlloc((void*)&ps->points, 8 ps->ncp_alloc_size * sizeof(float), 0);
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	116	if (cuda_err) {
8 by Suren A. Chilingaryan Complete elimination of cpcorr	117	reportError("Page locked host memory allocation of 8%ufloat bytes for control points is failed", ps->ncp_alloc_size);
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	118	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	119	return DICT_ERROR_CUDA_MALLOC;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	120	}
	121
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	122	cuda_err = cudaMalloc((void*)&ps->cuda_points, 2 ps->ncp_alloc_size * sizeof(float));
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	123	if (cuda_err) {
12 by Suren A. Chilingaryan Reduce memory allocation	124	reportError("Device memory allocation of 2%ufloat bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);
12 by Suren A. Chilingaryan Reduce memory allocation	125	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	126	return DICT_ERROR_CUDA_MALLOC;
12 by Suren A. Chilingaryan Reduce memory allocation	127	}
	128
	129	cuda_err = cudaMalloc((void*)&ps->cuda_input_buffer, CP_BLOCK side_alloc_size2 * sizeof(uint8_t));
	130	if (cuda_err) {
	131	reportError("Device memory allocation of %u%uuint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);
	132	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	133	return DICT_ERROR_CUDA_MALLOC;
12 by Suren A. Chilingaryan Reduce memory allocation	134	}
	135
	136	cuda_err = cudaHostAlloc((void*)&ps->input_buffer, CP_BLOCK ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
	137	if (cuda_err) {
	138	reportError("Host memory allocation of %u%uuint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
3 by Suren A. Chilingaryan Add error checking to initialization process	139	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	140	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan Add error checking to initialization process	141	}
23 by Suren A. Chilingaryan Print timings using reportMessage calls	142
	143	// DS: We don't actually need that to be CP_BLOCK, just unblock computations in loadbase and set to single
15 by Suren A. Chilingaryan First attempt with CUDA streams	144	cuda_err = cudaMalloc((void*)&ps->cuda_base_buffer, CP_BLOCK ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan Complete elimination of cpcorr	145	if (cuda_err) {
	146	reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);
	147	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	148	return DICT_ERROR_CUDA_MALLOC;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	149	}
15 by Suren A. Chilingaryan First attempt with CUDA streams	150	cudaMemset((void)ps->cuda_base_buffer, 0, CP_BLOCK ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan Complete elimination of cpcorr	151
10 by Suren A. Chilingaryan Block computational kernels	152	cuda_err = cudaMalloc((void*)&ps->cuda_data_buffer, CP_BLOCK ps->fft_alloc_size * sizeof(cufftReal));
8 by Suren A. Chilingaryan Complete elimination of cpcorr	153	if (cuda_err) {
10 by Suren A. Chilingaryan Block computational kernels	154	reportError("Device memory allocation of %u%ucufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
8 by Suren A. Chilingaryan Complete elimination of cpcorr	155	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	156	return DICT_ERROR_CUDA_MALLOC;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	157	}
10 by Suren A. Chilingaryan Block computational kernels	158	cudaMemset((void)ps->cuda_data_buffer, 0, CP_BLOCK ps->fft_alloc_size * sizeof(cufftReal));
1 by Suren A. Chilingaryan Initial import	159
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	160	cuda_err = cudaMalloc((void*)&ps->cuda_lsum_cache, ps->ncp ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
3 by Suren A. Chilingaryan Add error checking to initialization process	161	if (cuda_err) {
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	162	reportError("Device memory allocation of %u%ufloat bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan Add error checking to initialization process	163	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	164	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan Add error checking to initialization process	165	}
	166
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	167	cuda_err = cudaMalloc((void*)&ps->cuda_denom_cache, ps->ncp ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
3 by Suren A. Chilingaryan Add error checking to initialization process	168	if (cuda_err) {
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	169	reportError("Device memory allocation of %u%ufloat bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);
3 by Suren A. Chilingaryan Add error checking to initialization process	170	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	171	return DICT_ERROR_CUDA_MALLOC;
3 by Suren A. Chilingaryan Add error checking to initialization process	172	}
	173
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	174	cuda_err = cudaMalloc((void*)&ps->cuda_lsum_temp, 4 lsum_alloc_size2 * sizeof(float));
	175	if (cuda_err) {
	176	reportError("Device memory allocation of 4%ufloat bytes for lsum temporary buffer is failed", lsum_alloc_size2);
	177	fftFree(ps);
19 by Suren A. Chilingaryan Provide stand-alone library	178	return DICT_ERROR_MALLOC;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	179	}
	180	// We need to zero temporary buffers as well, since we are not computing
	181	// cumsum of complete matrix, but non-zero part of it
	182	cudaMemset((void)ps->cuda_lsum_temp, 0, 4 lsum_alloc_size2 * sizeof(float));
19 by Suren A. Chilingaryan Provide stand-alone library	183
3 by Suren A. Chilingaryan Add error checking to initialization process	184	return 0;
1 by Suren A. Chilingaryan Initial import	185	}
	186
	187
	188	void pstateFree(TProcessingState *ps) {
	189	if (ps) {
	190	fftFree(ps);
	191	free(ps);
	192	}
	193	}
	194
	195	TProcessingState *pstateInit() {
	196	TProcessingState *ps;
	197
	198	ps = (TProcessingState*)malloc(sizeof(TProcessingState));
8 by Suren A. Chilingaryan Complete elimination of cpcorr	199	if (ps) memset(ps, 0, sizeof(TProcessingState));
19 by Suren A. Chilingaryan Provide stand-alone library	200
1 by Suren A. Chilingaryan Initial import	201	return ps;
1 by Suren A. Chilingaryan Initial import	202	}
8 by Suren A. Chilingaryan Complete elimination of cpcorr	203
19 by Suren A. Chilingaryan Provide stand-alone library	204	static inline int fftLoadBaseFragment(TProcessingState ps, int icp, int ncp, const unsigned char fullimg) {
	205	int width = ps->width;
	206	int height = ps->height;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	207
	208	int check_mode = ((ps->base_mode)&&(!ps->mode));
	209	float minx, miny, maxx, maxy;
	210
	211	int precision = ps->precision;
	212
	213	int half_size = 2 * ps->corr_size;
	214	int size = 2 * half_size + 1;
	215
16 by Suren A. Chilingaryan Optimize FFT size	216	int fft_real_size = ps->fft_real_size;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	217
8 by Suren A. Chilingaryan Complete elimination of cpcorr	218	int ncp_alloc = ps->ncp_alloc_size;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	219	int alloc_size = ps->fft_alloc_size;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	220	int side_alloc = ps->side_alloc_size;
	221	int side_alloc2 = side_alloc * side_alloc;
	222
8 by Suren A. Chilingaryan Complete elimination of cpcorr	223	uint8_t *banlist = ps->banlist + icp;
	224
	225	float *data_x = ps->points + icp;
	226	float *data_y = data_x + ncp_alloc;
	227
	228	float frac_x = ps->points + 4 ncp_alloc + icp;
	229	float *frac_y = frac_x + ncp_alloc;
	230
	231	uint8_t *img = ps->input_buffer;
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	232
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	233	float lsum_temp = (float)ps->cuda_lsum_temp;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	234	int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
	235
	236	if (check_mode) {
	237	minx = ps->minx;
	238	maxx = ps->maxx;
	239	miny = ps->miny;
	240	maxy = ps->maxy;
	241	}
	242
12 by Suren A. Chilingaryan Reduce memory allocation	243	uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
	244	cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
	245	cufftComplex cache = ps->cuda_fft_cache + icp alloc_size;
	246	float lsum_cache = ps->cuda_lsum_cache + icp alloc_size;
	247	float denom_cache = ps->cuda_denom_cache + icp alloc_size;
	248
13 by Suren A. Chilingaryan Eleminate last kernel with NxN geometr	249	int blocks = calc_blocks(size, BLOCK_SIZE_1D);
	250	int base_blocks = blocks * blocks * BLOCK_SIZE_1D;
	251
	252	int lsum_size = ps->lsum_size;
	253	int lsum_alloc = ps->lsum_alloc_size;
	254
15 by Suren A. Chilingaryan First attempt with CUDA streams	255	cudaStream_t stream[2];
	256	for (int i = 0; i < 2; ++i) {
	257	cudaStreamCreate(&stream[i]);
	258	}
	259
	260	for (int i = 0;i <= ncp;i++) {
	261	if (i < ncp) {
	262	float x = data_x[i] - 1;
	263	float y = data_y[i] - 1;
	264
	265	frac_x[i] = x - round(x * precision) / precision;
	266	frac_y[i] = y - round(y * precision) / precision;
	267
	268	int xstart = roundf(x) - half_size;
	269	int ystart = roundf(y) - half_size;
	270
	271	int xend = xstart + size;
	272	int yend = xstart + size;
	273
	274	if ((xstart < 0)\|\|(ystart < 0)\|\|(xend >= width)\|\|(yend >= height)) {
	275	continue;
	276	}
	277
	278	if (check_mode) {
	279	if (xstart < minx) minx = xstart;
	280	if (ystart < miny) miny = ystart;
	281	if (xend > maxx) maxx = xend;
	282	if (yend > maxy) maxy = yend;
	283	}
	284
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	285	if (ps->matlab_mode) {
	286	cudaMemcpy2D(
	287	img + i * alloc_size,
	288	size * sizeof(uint8_t),
	289	fullimg + (xstart * height + ystart),
	290	height * sizeof(uint8_t),
	291	size * sizeof(uint8_t),
	292	size,
	293	cudaMemcpyHostToHost
	294	);
	295	} else {
	296	cudaMemcpy2D(
	297	img + i * alloc_size,
	298	size * sizeof(uint8_t),
	299	fullimg + (ystart * width + xstart),
	300	width * sizeof(uint8_t),
	301	size * sizeof(uint8_t),
	302	size,
	303	cudaMemcpyHostToHost
	304	);
	305	}
15 by Suren A. Chilingaryan First attempt with CUDA streams	306
	307	cudaMemcpy2DAsync(
	308	cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
	309	img + i * alloc_size, size * sizeof(uint8_t),
	310	size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,
	311	stream[i%2]
	312	);
	313
	314	banlist[i] = 0;
	315	}
	316	if (i > 0) {
	317	int j = i - 1;
12 by Suren A. Chilingaryan Reduce memory allocation	318
17 by Suren A. Chilingaryan Precompute if side and base blocks amount is power of 2	319	if (ps->base_blocks_power < 0) {
15 by Suren A. Chilingaryan First attempt with CUDA streams	320	vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
15 by Suren A. Chilingaryan First attempt with CUDA streams	321	cuda_input_buffer + j * side_alloc2, side_alloc,
16 by Suren A. Chilingaryan Optimize FFT size	322	cuda_base_buffer + j*alloc_size, fft_real_size,
13 by Suren A. Chilingaryan Eleminate last kernel with NxN geometr	323	lsum_temp + lsum_size * (lsum_alloc + 1),
	324	lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
	325	lsum_alloc,
	326	size, blocks
	327	);
	328	} else {
15 by Suren A. Chilingaryan First attempt with CUDA streams	329	vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(
15 by Suren A. Chilingaryan First attempt with CUDA streams	330	cuda_input_buffer + j * side_alloc2, side_alloc,
16 by Suren A. Chilingaryan Optimize FFT size	331	cuda_base_buffer + j*alloc_size, fft_real_size,
13 by Suren A. Chilingaryan Eleminate last kernel with NxN geometr	332	lsum_temp + lsum_size * (lsum_alloc + 1),
	333	lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
	334	lsum_alloc,
17 by Suren A. Chilingaryan Precompute if side and base blocks amount is power of 2	335	size, ps->base_blocks_power
13 by Suren A. Chilingaryan Eleminate last kernel with NxN geometr	336	);
	337	}
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	338
5 by Suren A. Chilingaryan Removal of non-zero comments	339	// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
8 by Suren A. Chilingaryan Complete elimination of cpcorr	340	local_sum(ps,
15 by Suren A. Chilingaryan First attempt with CUDA streams	341	lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
8 by Suren A. Chilingaryan Complete elimination of cpcorr	342	lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
15 by Suren A. Chilingaryan First attempt with CUDA streams	343	lsum_temp, lsum_temp + lsum_step,
	344	stream[j%2]);
	345
	346	// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);
	347	}
	348	}
	349
	350	for (int j = 0;j < ncp;j++) {
	351	cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
	352	}
	353
	354	for (int i = 0; i < 2; ++i) {
	355	cudaStreamDestroy(stream[i]);
	356	}
	357
8 by Suren A. Chilingaryan Complete elimination of cpcorr	358	if (check_mode) {
	359	ps->minx = minx;
	360	ps->maxx = maxx;
	361	ps->miny = miny;
	362	ps->maxy = maxy;
	363	}
	364
	365
	366	return 0;
	367	}
	368
19 by Suren A. Chilingaryan Provide stand-alone library	369
	370	static inline int fftCopyFragment(TProcessingState ps, int icp, int ncp, const unsigned char fullimg) {
	371	int width = ps->width;
	372	int height = ps->height;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	373
	374	int half_size = ps->corr_size;
	375	int size = 2 * half_size + 1;
	376	int size2 = size * size;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	377	int ncp_alloc = ps->ncp_alloc_size;
19 by Suren A. Chilingaryan Provide stand-alone library	378
8 by Suren A. Chilingaryan Complete elimination of cpcorr	379	float data_x, data_y;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	380	if (ps->stored) {
19 by Suren A. Chilingaryan Provide stand-alone library	381	data_x = ps->res_x + icp;
19 by Suren A. Chilingaryan Provide stand-alone library	382	data_y = ps->res_y + icp;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	383	} else {
	384	data_x = ps->points + 2 * ncp_alloc + icp;
	385	data_y = data_x + ncp_alloc;
	386	}
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	387
	388	uint8_t *img = ps->input_buffer;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	389	uint8_t *banlist = ps->banlist + icp;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	390
	391	for (int i = 0;i < ncp;i++) {
8 by Suren A. Chilingaryan Complete elimination of cpcorr	392	float x = data_x[i] - 1;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	393	float y = data_y[i] - 1;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	394
	395	int xstart = roundf(x) - half_size;
	396	int ystart = roundf(y) - half_size;
	397
	398	int xend = xstart + size;
	399	int yend = xstart + size;
	400
8 by Suren A. Chilingaryan Complete elimination of cpcorr	401	if ((banlist[i])\|\|(xstart < 0)\|\|(ystart < 0)\|\|(xend >= width)\|\|(yend >= height)) {
8 by Suren A. Chilingaryan Complete elimination of cpcorr	402	banlist[i] = 1;
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	403	continue;
	404	}
	405
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	406	if (ps->matlab_mode) {
	407	cudaMemcpy2D(
	408	img + i * size2,//alloc_size,
	409	size * sizeof(uint8_t),
	410	fullimg + (xstart * height + ystart),
	411	height * sizeof(uint8_t),
	412	size * sizeof(uint8_t),
	413	size,
	414	cudaMemcpyHostToHost
	415	);
	416	} else {
	417	cudaMemcpy2D(
	418	img + i * size2,//alloc_size,
	419	size * sizeof(uint8_t),
	420	fullimg + (ystart * width + xstart),
	421	width * sizeof(uint8_t),
	422	size * sizeof(uint8_t),
	423	size,
	424	cudaMemcpyHostToHost
	425	);
	426	}
19 by Suren A. Chilingaryan Provide stand-alone library	427	}
	428	return 0;
	429	}
	430
	431	static inline int fftLoadFragment(TProcessingState ps, int icp, int ncp, const unsigned char image, cudaStream_t stream0) {
	432	int half_size = ps->corr_size;
	433	int size = 2 * half_size + 1;
	434
	435	int side_alloc = ps->side_alloc_size;
	436
	437	uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
	438	uint8_t *img = ps->input_buffer;
	439
14 by Suren A. Chilingaryan memcpy3D	440	/*
19 by Suren A. Chilingaryan Provide stand-alone library	441	for (int i = 0;i < ncp;i++) {
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	442	cudaMemcpy2D(
10 by Suren A. Chilingaryan Block computational kernels	443	cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
14 by Suren A. Chilingaryan memcpy3D	444	img + i * size2, size * sizeof(uint8_t),
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	445	size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
	446	);
19 by Suren A. Chilingaryan Provide stand-alone library	447	}
14 by Suren A. Chilingaryan memcpy3D	448	*/
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	449
14 by Suren A. Chilingaryan memcpy3D	450	cudaMemcpy3DParms copy_params = { 0 };
19 by Suren A. Chilingaryan Provide stand-alone library	451
14 by Suren A. Chilingaryan memcpy3D	452	copy_params.dstPtr = make_cudaPitchedPtr(
	453	cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
	454	);
	455	copy_params.srcPtr = make_cudaPitchedPtr(
	456	img, size * sizeof(uint8_t), size, size
	457	);
	458	copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
	459	copy_params.kind = cudaMemcpyHostToDevice;
19 by Suren A. Chilingaryan Provide stand-alone library	460
	461	cudaMemcpy3DAsync(&copy_params, stream0);
	462
	463	return 0;
	464	}
	465
	466	static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
	467	static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
	468
	469	static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
	470	int half_size = ps->corr_size;
	471	int size = 2 * half_size + 1;
	472
	473	int fft_real_size = ps->fft_real_size;
	474
	475	int ncp_alloc = ps->ncp_alloc_size;
	476	int alloc_size = ps->fft_alloc_size;
	477	int side_alloc = ps->side_alloc_size;
	478	int side_alloc2 = side_alloc * side_alloc;
	479
	480	uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
	481	float *cuda_data_buffer = ps->cuda_data_buffer;
10 by Suren A. Chilingaryan Block computational kernels	482
	483	int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
	484	int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
	485	int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
16 by Suren A. Chilingaryan Optimize FFT size	486	int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
19 by Suren A. Chilingaryan Provide stand-alone library	487
	488	float *sumbuf = ps->cuda_points + icp;
	489	float *stdbuf = ps->cuda_points + ncp_alloc + icp;
	490
8 by Suren A. Chilingaryan Complete elimination of cpcorr	491	int32_t stat_buf = (int)ps->cuda_temp_buffer;
7 by Suren A. Chilingaryan FindPeak optimization	492
10 by Suren A. Chilingaryan Block computational kernels	493	dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
19 by Suren A. Chilingaryan Provide stand-alone library	494	stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);
19 by Suren A. Chilingaryan Provide stand-alone library	495	stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);
7 by Suren A. Chilingaryan FindPeak optimization	496
10 by Suren A. Chilingaryan Block computational kernels	497	// Packing input data for FFT
	498	dim3 input_grid_dim(input_blocks, cp_blocks, 1);
	499
17 by Suren A. Chilingaryan Precompute if side and base blocks amount is power of 2	500	if (ps->side_blocks_power < 0) {
19 by Suren A. Chilingaryan Provide stand-alone library	501	vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(
10 by Suren A. Chilingaryan Block computational kernels	502	cuda_input_buffer, side_alloc2, side_alloc,
16 by Suren A. Chilingaryan Optimize FFT size	503	cuda_data_buffer, alloc_size, fft_real_size,
10 by Suren A. Chilingaryan Block computational kernels	504	size, side_blocks
	505	);
	506	} else {
19 by Suren A. Chilingaryan Provide stand-alone library	507	vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(
10 by Suren A. Chilingaryan Block computational kernels	508	cuda_input_buffer, side_alloc2, side_alloc,
16 by Suren A. Chilingaryan Optimize FFT size	509	cuda_data_buffer, alloc_size, fft_real_size,
17 by Suren A. Chilingaryan Precompute if side and base blocks amount is power of 2	510	size, ps->side_blocks_power
10 by Suren A. Chilingaryan Block computational kernels	511	);
10 by Suren A. Chilingaryan Block computational kernels	512	}
19 by Suren A. Chilingaryan Provide stand-alone library	513
	514	return 0;
	515	}
	516
	517	static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
	518	int half_size = ps->corr_size;
	519	int size = 2 * half_size + 1;
	520	int size2 = size * size;
	521
	522	int fft_size = ps->fft_size;
	523	int fft_real_size = ps->fft_real_size;
	524
	525	int ncp_alloc = ps->ncp_alloc_size;
	526	int alloc_size = ps->fft_alloc_size;
	527	int side_alloc = ps->side_alloc_size;
	528
	529	int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
	530	int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
	531	int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
	532
10 by Suren A. Chilingaryan Block computational kernels	533	cufftReal cuda_result_buffer = (cufftReal)ps->cuda_temp_buffer;
10 by Suren A. Chilingaryan Block computational kernels	534	float cuda_final_buffer = cuda_result_buffer + CP_BLOCK alloc_size;
19 by Suren A. Chilingaryan Provide stand-alone library	535
	536	float *sumbuf = ps->cuda_points + icp;
	537	float *stdbuf = ps->cuda_points + ncp_alloc + icp;
10 by Suren A. Chilingaryan Block computational kernels	538
16 by Suren A. Chilingaryan Optimize FFT size	539	// Use real size everthere
16 by Suren A. Chilingaryan Optimize FFT size	540	// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);
19 by Suren A. Chilingaryan Provide stand-alone library	541	// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(
16 by Suren A. Chilingaryan Optimize FFT size	542	// cuda_final_buffer,
	543	// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),
	544	// ps->cuda_lsum_cache + icpalloc_size, sumbuf, 1. / (size2 (size2 - 1)),
	545	// ps->cuda_denom_cache + icp*alloc_size, stdbuf,
	546	// alloc_size
	547	// );
	548
19 by Suren A. Chilingaryan Provide stand-alone library	549
16 by Suren A. Chilingaryan Optimize FFT size	550	int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;
10 by Suren A. Chilingaryan Block computational kernels	551	dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);
19 by Suren A. Chilingaryan Provide stand-alone library	552
19 by Suren A. Chilingaryan Provide stand-alone library	553	vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(
16 by Suren A. Chilingaryan Optimize FFT size	554	cuda_final_buffer, fft_size,
16 by Suren A. Chilingaryan Optimize FFT size	555	cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	556	ps->cuda_lsum_cache + icpalloc_size, sumbuf, 1. / (size2 (size2 - 1)),
	557	ps->cuda_denom_cache + icp*alloc_size, stdbuf,
16 by Suren A. Chilingaryan Optimize FFT size	558	alloc_size, fft_blocks
10 by Suren A. Chilingaryan Block computational kernels	559	);
19 by Suren A. Chilingaryan Provide stand-alone library	560
10 by Suren A. Chilingaryan Block computational kernels	561
10 by Suren A. Chilingaryan Block computational kernels	562	// Looking for maximum
7 by Suren A. Chilingaryan FindPeak optimization	563	float *xbuf = sumbuf;
	564	float *ybuf = stdbuf;
	565
8 by Suren A. Chilingaryan Complete elimination of cpcorr	566	int32_t posbuf = (int)ps->cuda_temp_buffer;
7 by Suren A. Chilingaryan FindPeak optimization	567	float maxbuf = (float)(posbuf + CP_BLOCK*side_alloc);
	568
	569	dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
16 by Suren A. Chilingaryan Optimize FFT size	570
	571	// Use real size everthere
	572	// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);
	573	// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
19 by Suren A. Chilingaryan Provide stand-alone library	574
	575	find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
	576	find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);
	577
	578	return 0;
	579	}
	580
	581	static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
	582	int fft_real_size = ps->fft_real_size;
	583
	584	int alloc_size = ps->fft_alloc_size;
	585
	586	uint8_t *banlist = ps->banlist + icp;
	587	float *cuda_data_buffer = ps->cuda_data_buffer;
	588
	589	int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
	590
	591	// Performing FFT's
	592	cufftComplex cuda_fft_buffer = ((cufftComplex)ps->cuda_temp_buffer) + alloc_size;
	593
	594	cufftSetStream(ps->cufft_r2c_plan, stream0);
	595	cufftSetStream(ps->cufft_c2r_plan, stream0);
	596
	597	for (int i = 0;i < ncp;i++) {
	598	if (banlist[i]) continue;
	599	cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
	600	}
	601
	602	int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);
	603	dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
	604	vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);
	605
	606	// First in-place transform for some reason is failing, therefore we
	607	// have one alloc_size spacing between starts (see cuda_fft_buffer set above)
	608	cufftReal cuda_result_buffer = (cufftReal)ps->cuda_temp_buffer;
	609	for (int i = 0;i < ncp;i++) {
	610	if (banlist[i]) continue;
	611	cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);
	612	}
	613
	614	return 0;
	615	}
	616
	617
	618	static inline int fftGetCurrentPoints(DICTContext ps) {
	619	int ncp = ps->ncp;
	620	int ncp_alloc = ps->ncp_alloc_size;
	621	int precision = ps->precision;
	622
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	623	float move_x, move_y;
	624
	625	// We do not do a completely correct thing in non-matlab mode, the data
	626	// is copied from image buffer non-transposed as it should be, but
	627	// the processing code is supports only matlab-mode and handles it as
	628	// standard transposed data. Therefore, here we turning back the
	629	// X and Y coords. But this adds some extra precision penalty.
	630	// Therefore, it is better to use matlab mode until the computation
	631	// code is changed (this implementation is just done to accept
	632	// images from user apps without transposing)
	633	if (ps->matlab_mode) {
	634	move_x = ps->points + 6 * ncp_alloc;
	635	move_y = move_x + ncp_alloc;
	636
	637	cudaMemcpy2D(
	638	move_x, ncp_alloc * sizeof(float),
	639	ps->cuda_points, ncp_alloc * sizeof(float),
	640	ps->ncp * sizeof(float), 2,
	641	cudaMemcpyDeviceToHost
	642	);
	643	} else {
	644	move_y = ps->points + 6 * ncp_alloc;
21 by Suren A. Chilingaryan Collection of timing information and fix for a crash in non-matlab mode	645	move_x = move_y + ncp_alloc;
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	646
	647	cudaMemcpy2D(
	648	move_y, ncp_alloc * sizeof(float),
	649	ps->cuda_points, ncp_alloc * sizeof(float),
	650	ps->ncp * sizeof(float), 2,
	651	cudaMemcpyDeviceToHost
	652	);
	653	}
19 by Suren A. Chilingaryan Provide stand-alone library	654
	655	float data_x, data_y;
	656	if (ps->stored) {
	657	data_x = ps->res_x;
	658	data_y = ps->res_y;
	659	} else {
	660	data_x = ps->points + 2 * ncp_alloc;
	661	data_y = data_x + ncp_alloc;
	662	}
	663
	664	float res_x, res_y;
	665	if ((ps->res_x)&&(ps->res_y)) {
	666	res_x = ps->res_x;
	667	res_y = ps->res_y;
	668
	669	ps->stored = 1;
	670	} else {
	671	res_x = data_x;
	672	res_y = data_y;
	673	}
	674
8 by Suren A. Chilingaryan Complete elimination of cpcorr	675	float frac;
19 by Suren A. Chilingaryan Provide stand-alone library	676	float frac_x = ps->points + 4 ncp_alloc;
19 by Suren A. Chilingaryan Provide stand-alone library	677	float *frac_y = frac_x + ncp_alloc;
8 by Suren A. Chilingaryan Complete elimination of cpcorr	678	uint8_t *banlist = ps->banlist;
	679
	680	for (int i = 0;i < ncp;i++) {
19 by Suren A. Chilingaryan Provide stand-alone library	681	if (banlist[i]) {
	682	res_x[i] = data_x[i];
	683	res_y[i] = data_y[i];
	684	continue;
	685	}
	686
	687	frac = data_x[i] - round(data_x[i]*precision)/precision;
	688	res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);
	689
	690	frac = data_y[i] - round(data_y[i]*precision)/precision;
	691	res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
	692	}
8 by Suren A. Chilingaryan Complete elimination of cpcorr	693
	694	return 0;
	695	}