/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
1
#ifndef NORMXCORR_HW_H
2
#define NORMXCORR_HW_H
3
#include <cuda.h>
4
#include <cuda_runtime.h>
5
6
#include <cublas.h>
7
#include <cufft.h>
8
9
#include <cudpp.h>
10
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
11
#include "dict_image.h"
12
22 by Suren A. Chilingaryan
Optimize image reduction
13
#define TRANSPOSE_SIZE 64	// should be equal to the CPU cache line size
14
10 by Suren A. Chilingaryan
Block computational kernels
15
#define BLOCK_SIZE_1D 		64	// NOTE(review): presumably the block size for 1D kernels - confirm against the launch code
16
#define BLOCK_SIZE_2D 		16	// NOTE(review): presumably the block edge for 2D kernels - confirm against the launch code
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
17
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
18
#define CP_BLOCK 256		// should be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_xD
19
#define CP_BLOCK_SIZE BLOCK_SIZE_2D	// same value as BLOCK_SIZE_2D
20
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D	// same value as BLOCK_SIZE_2D
21
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
22
#define USE_UNDOCUMENTED	// NOTE(review): what this switch enables is not visible in this header
25 by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups
23
//#define USE_SSE
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
24
25
/*
 * Processing state of the normxcorr hardware code. Groups the control-point
 * and image buffers, the cached FFT / local-sum / denom data, the CUFFT and
 * CUDPP plan handles with their "initialized" flags, and the image, FFT and
 * local-sum size parameters.
 */
struct STProcessingState {
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
26
    int stored;				// flag indicating if we already have coordinates in coords stored
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
27
19 by Suren A. Chilingaryan
Provide stand-alone library
28
    unsigned char *banlist;		// control points banned from computations for various reasons
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
29
30
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
31
    float *points;			// various information on control points
32
					// 	base_x, base_y, data_x, data_y
33
					//	base_frac_x, base_frac_y
34
					//	move_x, move_y
35
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
36
37
    float *cuda_points;			// Various information on control points:
38
					// 0: data_x
39
					// 1: data_y
40
					// 2: sum
41
					// 3: denom
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
42
19 by Suren A. Chilingaryan
Provide stand-alone library
43
    unsigned char *input_buffer;	// Input Image buffer / Host
44
    unsigned char *cuda_input_buffer;	// Input Image buffer / Device
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
45
46
    cufftReal *cuda_base_buffer;	// Temporary buffer for FFT inputs, pre-zeroed
47
    cufftReal *cuda_data_buffer;	// Temporary buffer for FFT inputs, pre-zeroed
48
49
    void *cuda_temp_buffer;		// Main computational buffer, temporary
50
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
51
    
10 by Suren A. Chilingaryan
Block computational kernels
52
    cufftComplex *cuda_fft_cache;	// Stored FFTs of the template image
11 by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches
53
54
    float *cuda_lsum_cache;		// Cache of local sums
55
    float *cuda_denom_cache;		// Cache of denoms
56
57
    float *cuda_lsum_temp;		// Temporary buffer for local sum computation, first two pre-zeroed
58
    
19 by Suren A. Chilingaryan
Provide stand-alone library
59
    
60
    float *res_x;			// External points buffer
61
    float *res_y;			// External points buffer
62
    
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
63
64
    int fft_initialized;		// Flag indicating if CUFFT plan is initialized
65
    cufftHandle cufft_plan;		// NOTE(review): which transform each of the three plans performs is set up elsewhere;
66
    cufftHandle cufft_r2c_plan;		// names suggest real-to-complex ...
67
    cufftHandle cufft_c2r_plan;		// ... and complex-to-real plans - confirm against the plan creation code
68
    
69
    int cudpp_initialized;		// Flag indicating if CUDPP plan is initialized
70
    CUDPPHandle cudpp_plan;
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
71
72
    int matlab_mode;		// 2D stored transposed (i.e. by columns)
73
        
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
74
    int mode;			// 1 - image mode, 0 - fragment mode
75
    int base_mode;		// 1 - image mode, 0 - fragment mode
76
    
77
    float minx,miny,maxx,maxy;	// Coordinates of the actually used rectangle of the image
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
78
79
    int ncp;			// Number of control points
7 by Suren A. Chilingaryan
FindPeak optimization
80
    int ncp_alloc_size;		// NOTE(review): presumably the allocated capacity for control points - confirm
19 by Suren A. Chilingaryan
Provide stand-alone library
81
    
82
    int width;			// Images width
83
    int height;			// Images height
7 by Suren A. Chilingaryan
FindPeak optimization
84
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
85
    int corr_size;		// CORR_SIZE 
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
86
    int precision;		// PRECISION
6 by Suren A. Chilingaryan
A little more computations are moved to CUDA
87
    
88
    int side_alloc_size;	// allocation size for 1 side of fft
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
89
90
    int fft_size;		// Matrix Size for FFT (base_size + input_size - 1)
16 by Suren A. Chilingaryan
Optimize FFT size
91
    int fft_real_size;		// Rounded to next power of 2
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
92
    int fft_alloc_size;		// cuda optimized size2
93
    
94
    int subimage_size;		// Size of neighborhood (4*corr_size + 1)
95
    int lsum_size;		// Dimensions of local sums (2*corr_size + 1)
96
    int lsum_temp_size;		// Matrix Size for computing local sums
97
    int lsum_alloc_size;	// Size of allocated line to store lsum row
98
    int lsum_aligned_size;	// CUDA optimized lsum_temp_size
99
    int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
100
17 by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2
101
    int side_blocks_power;	// Indicates if amount of side blocks is power of 2
102
    int base_blocks_power;	// Indicates if amount of base blocks is power of 2
20 by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application
103
104
105
    DICTImageType image_type;	// Image type; DICTImageType is not declared in this header (presumably dict_image.h)
106
    void *image_buf;		// Temporary buffer for images (specific-format)
107
    unsigned char *image;	// Reduced format
21 by Suren A. Chilingaryan
Collection of timing information and fix for a crash in non-matlab mode
108
109
#ifdef DICT_HW_MEASURE_TIMINGS
25 by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups
110
    int time[24];		// Timing counters (SHOULD BE LAST in struct!!!): 
21 by Suren A. Chilingaryan
Collection of timing information and fix for a crash in non-matlab mode
111
				//  0- 3: hardware, init, set_base_points, set_cur_points
112
				//  4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
113
				// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
114
				// 16   :get_results
				// (slots 17-23 are not described here)
115
#endif /* DICT_HW_MEASURE_TIMINGS */
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
116
};
117
118
/* Shorthand so the state struct can be named without the `struct` keyword. */
typedef struct STProcessingState TProcessingState;
119
8 by Suren A. Chilingaryan
Complete elimination of cpcorr
120
121
4 by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board
122
#endif /* NORMXCORR_HW_H */