/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H
#include <cuda.h>
#include <cuda_runtime.h>

#include <cublas.h>
#include <cufft.h>

#include <cudpp.h>

#include "dict_image.h"

/* Compile-time tuning constants and feature switches. */

#define TRANSPOSE_SIZE 64	// should be equal to the CPU cache line size

// Block sizes for 1D and 2D processing.
// NOTE(review): presumably CUDA thread-block dimensions -- the users of these
// macros are not visible in this header, confirm against the kernels.
#define BLOCK_SIZE_1D 		64
#define BLOCK_SIZE_2D 		16

#define CP_BLOCK 256		// should be divisible by CP_BLOCK_SIZE and BLOCK_SIZE_xD
#define CP_BLOCK_SIZE BLOCK_SIZE_2D	// alias of BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D	// alias of BLOCK_SIZE_2D

// Feature switches. What exactly they enable is decided by the code that tests
// them, which is outside this header. USE_SSE is currently disabled.
#define USE_UNDOCUMENTED
//#define USE_SSE

/*
 * Aggregated processing state: host and device buffers, cached intermediate
 * results, CUFFT/CUDPP plan handles and the size parameters they were set up
 * with. See the per-field comments for details.
 *
 * Members prefixed with "cuda_" are described by the original comments as
 * device-side or CUDA-related storage.
 * NOTE(review): allocation and ownership of the buffers is handled elsewhere
 * and is not visible in this header.
 *
 * The member order is significant: the optional timing array must remain the
 * last member (see the note on `time`).
 */
struct STProcessingState {
    int stored;				// Flag indicating whether coordinates are already stored in coords

    unsigned char *banlist;		// Control points banned from computations for various reasons


    float *points;			// Various information on control points:
					// 	base_x, base_y, data_x, data_y
					//	base_frac_x, base_frac_y
					//	move_x, move_y


    float *cuda_points;			// Various information on control points:
					// 0: data_x
					// 1: data_y
					// 2: sum
					// 3: denom

    unsigned char *input_buffer;	// Input image buffer / host
    unsigned char *cuda_input_buffer;	// Input image buffer / device

    cufftReal *cuda_base_buffer;	// Temporary buffer for FFT inputs, pre-zeroed
    cufftReal *cuda_data_buffer;	// Temporary buffer for FFT inputs, pre-zeroed

    void *cuda_temp_buffer;		// Main computational buffer, temporary

    
    cufftComplex *cuda_fft_cache;	// Stored FFTs of the template image

    float *cuda_lsum_cache;		// Cache of local sums
    float *cuda_denom_cache;		// Cache of denoms

    float *cuda_lsum_temp;		// Temporary buffer for local sum computation, first two pre-zeroed
    
    
    float *res_x;			// External points buffer
    float *res_y;			// External points buffer
    

    int fft_initialized;		// Flag indicating whether the CUFFT plan is initialized
    cufftHandle cufft_plan;		// CUFFT plan handle
    cufftHandle cufft_r2c_plan;		// CUFFT plan handle; presumably real-to-complex -- TODO confirm
    cufftHandle cufft_c2r_plan;		// CUFFT plan handle; presumably complex-to-real -- TODO confirm
    
    int cudpp_initialized;		// Flag indicating whether the CUDPP plan is initialized
    CUDPPHandle cudpp_plan;		// CUDPP plan handle

    int matlab_mode;		// 2D data stored transposed (i.e. by columns)
        
    int mode;			// 1 - image mode, 0 - fragment mode
    int base_mode;		// 1 - image mode, 0 - fragment mode
				// NOTE(review): presumably the same switch for the base image -- confirm
    
    float minx,miny,maxx,maxy;	// Coordinates of the actually used rectangle of the image

    int ncp;			// Number of control points
    int ncp_alloc_size;		// NOTE(review): presumably the allocated capacity for control points -- confirm
    
    int width;			// Image width
    int height;			// Image height

    int corr_size;		// CORR_SIZE 
    int precision;		// PRECISION
    
    int side_alloc_size;	// Allocation size for one side of the FFT

    int fft_size;		// Matrix size for FFT (base_size + input_size - 1)
    int fft_real_size;		// Rounded to the next power of 2
    int fft_alloc_size;		// CUDA-optimized size (original note: "size2")
    
    int subimage_size;		// Size of neighborhood (4*corr_size + 1)
    int lsum_size;		// Dimensions of local sums (2*corr_size + 1)
    int lsum_temp_size;		// Matrix size for computing local sums
    int lsum_alloc_size;	// Size of the line allocated to store one lsum row
    int lsum_aligned_size;	// CUDA-optimized lsum_temp_size
    int lsum_short_aligned_size;// CUDA-optimized lsum_temp_size - lsum_size - 1

    int side_blocks_power;	// Indicates whether the number of side blocks is a power of 2
    int base_blocks_power;	// Indicates whether the number of base blocks is a power of 2


    DICTImageType image_type;	// Image type (DICTImageType is declared outside this file)
    void *image_buf;		// Temporary buffer for images (specific format)
    unsigned char *image;	// Reduced format

#ifdef DICT_HW_MEASURE_TIMINGS
    int time[24];		// Timing counters (SHOULD BE LAST in struct!!!): 
				//  0- 3: hardware, init, set_base_points, set_cur_points
				//  4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
				// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
				// 16   : get_results
				// Slots 17-23 are not documented here.
#endif /* DICT_HW_MEASURE_TIMINGS */
};

/* Alias that lets the processing state be named without the `struct` keyword. */
typedef struct STProcessingState TProcessingState;



#endif /* NORMXCORR_HW_H */