/normxcorr/trunk : contents of dict_hw/src/normxcorr

: (revision 25)

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	1	#ifndef NORMXCORR_HW_H
	2	#define NORMXCORR_HW_H
	3	#include <cuda.h>
	4	#include <cuda_runtime.h>
	5
	6	#include <cublas.h>
	7	#include <cufft.h>
	8
	9	#include <cudpp.h>
	10
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	11	#include "dict_image.h"
	12
22 by Suren A. Chilingaryan Optimize image reduction	13	#define TRANSPOSE_SIZE 64 // should be equal to the CPU cache line size
22 by Suren A. Chilingaryan Optimize image reduction	14
10 by Suren A. Chilingaryan Block computational kernels	15	#define BLOCK_SIZE_1D 64
10 by Suren A. Chilingaryan Block computational kernels	16	#define BLOCK_SIZE_2D 16
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	17
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	18	#define CP_BLOCK 256 // should be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_xD
	19	#define CP_BLOCK_SIZE BLOCK_SIZE_2D
	20	#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D
	21
8 by Suren A. Chilingaryan Complete elimination of cpcorr	22	#define USE_UNDOCUMENTED
25 by Suren A. Chilingaryan Count hardware initialization time, cmake scripts fixups	23	//#define USE_SSE
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	24
	25	struct STProcessingState { // Processing state: buffers, caches, CUFFT/CUDPP plans, flags and size parameters
8 by Suren A. Chilingaryan Complete elimination of cpcorr	26	int stored; // Flag indicating if we already have coordinates stored in coords
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	27
19 by Suren A. Chilingaryan Provide stand-alone library	28	unsigned char *banlist; // Control points banned from computations for various reasons
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	29
	30
8 by Suren A. Chilingaryan Complete elimination of cpcorr	31	float *points; // Various information on control points:
	32	// base_x, base_y, data_x, data_y
	33	// base_frac_x, base_frac_y
	34	// move_x, move_y
	35
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	36
	37	float *cuda_points; // Various information on control points:
	38	// 0: data_x
	39	// 1: data_y
	40	// 2: sum
	41	// 3: denom
8 by Suren A. Chilingaryan Complete elimination of cpcorr	42
19 by Suren A. Chilingaryan Provide stand-alone library	43	unsigned char *input_buffer; // Input Image buffer / Host
19 by Suren A. Chilingaryan Provide stand-alone library	44	unsigned char *cuda_input_buffer; // Input Image buffer / Device
8 by Suren A. Chilingaryan Complete elimination of cpcorr	45
	46	cufftReal *cuda_base_buffer; // Temporary buffer for FFT inputs, pre-zeroed
	47	cufftReal *cuda_data_buffer; // Temporary buffer for FFT inputs, pre-zeroed
	48
	49	void *cuda_temp_buffer; // Main computational buffer, temporary
	50
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	51
10 by Suren A. Chilingaryan Block computational kernels	52	cufftComplex *cuda_fft_cache; // Stored FFTs of the template image
11 by Suren A. Chilingaryan Enforce naming conventions for buffers and caches	53
	54	float *cuda_lsum_cache; // Cache of local sums
	55	float *cuda_denom_cache; // Cache of denoms
	56
	57	float *cuda_lsum_temp; // Temporary buffer for local sum computation, first two pre-zeroed
	58
19 by Suren A. Chilingaryan Provide stand-alone library	59
	60	float *res_x; // External points buffer
	61	float *res_y; // External points buffer
	62
8 by Suren A. Chilingaryan Complete elimination of cpcorr	63
	64	int fft_initialized; // Flag indicating if CUFFT plan is initialized
	65	cufftHandle cufft_plan;
	66	cufftHandle cufft_r2c_plan;
	67	cufftHandle cufft_c2r_plan;
	68
	69	int cudpp_initialized; // Flag indicating if CUDPP plan is initialized
	70	CUDPPHandle cudpp_plan;
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	71
	72	int matlab_mode; // 2D stored transposed (i.e. by columns)
	73
8 by Suren A. Chilingaryan Complete elimination of cpcorr	74	int mode; // 1 - image mode, 0 - fragment mode
	75	int base_mode; // 1 - image mode, 0 - fragment mode
	76
	77	float minx,miny,maxx,maxy; // Coordinates of the actually used rectangle of the image
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	78
	79	int ncp; // Number of control points
7 by Suren A. Chilingaryan FindPeak optimization	80	int ncp_alloc_size;
19 by Suren A. Chilingaryan Provide stand-alone library	81
	82	int width; // Images width
	83	int height; // Images height
7 by Suren A. Chilingaryan FindPeak optimization	84
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	85	int corr_size; // CORR_SIZE
8 by Suren A. Chilingaryan Complete elimination of cpcorr	86	int precision; // PRECISION
6 by Suren A. Chilingaryan A little more computations are moved to CUDA	87
	88	int side_alloc_size; // Allocation size for 1 side of fft
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	89
	90	int fft_size; // Matrix Size for FFT (base_size + input_size - 1)
16 by Suren A. Chilingaryan Optimize FFT size	91	int fft_real_size; // Rounded to next power of 2
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	92	int fft_alloc_size; // cuda optimized size2
	93
	94	int subimage_size; // Size of neighborhood (4*corr_size + 1)
	95	int lsum_size; // Dimensions of local sums (2*corr_size + 1)
	96	int lsum_temp_size; // Matrix Size for computing local sums
	97	int lsum_alloc_size; // Size of allocated line to store lsum row
	98	int lsum_aligned_size; // CUDA optimized lsum_temp_size
	99	int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
	100
17 by Suren A. Chilingaryan Precompute if side and base blocks amount is power of 2	101	int side_blocks_power; // Indicates if amount of side blocks is power of 2
	102	int base_blocks_power; // Indicates if amount of base blocks is power of 2
20 by Suren A. Chilingaryan Support for TIFF images in C code and stand-alone console application	103
	104
	105	DICTImageType image_type;
	106	void *image_buf; // Temporary buffer for images (specific-format)
	107	unsigned char *image; // Reduced format
21 by Suren A. Chilingaryan Collection of timing information and fix for a crash in non-matlab mode	108
	109	#ifdef DICT_HW_MEASURE_TIMINGS
25 by Suren A. Chilingaryan Count hardware initialization time, cmake scripts fixups	110	int time[24]; // Timing counters (SHOULD BE LAST in struct!!!):
21 by Suren A. Chilingaryan Collection of timing information and fix for a crash in non-matlab mode	111	// 0-3: hardware, init, set_base_points, set_cur_points
	112	// 4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
	113	// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
	114	// 16: get_results (slots 17-23 are not described here)
	115	#endif /* DICT_HW_MEASURE_TIMINGS */
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	116	};
	117
	118	typedef struct STProcessingState TProcessingState;
	119
8 by Suren A. Chilingaryan Complete elimination of cpcorr	120
8 by Suren A. Chilingaryan Complete elimination of cpcorr	121
4 by Suren A. Chilingaryan Instead of transfer compute local sums and denormals on board	122	#endif /* NORMXCORR_HW_H */