/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H

/*
 * Fixed-width integer types used throughout this header.
 *
 * On Windows the 8/16/32-bit names are mapped onto the integer typedefs
 * provided by <windows.h>; every other platform takes them from <stdint.h>.
 * NOTE(review): presumably done because the targeted MSVC toolchain did not
 * ship <stdint.h> - confirm before changing. Note that no 64-bit types are
 * provided on the Windows branch.
 */
#if defined(_WIN32) || defined(_WIN64)
# include <windows.h>
    typedef UINT8 uint8_t;
    typedef UINT16 uint16_t;
    typedef UINT32 uint32_t;
    typedef INT8 int8_t;
    typedef INT16 int16_t;
    typedef INT32 int32_t;
#else
# include <stdint.h>
#endif

#include <mex.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cublas.h>
#include <cufft.h>

#include <cudpp.h>

/*
 * Block size constants.
 * NOTE(review): presumably the CUDA thread-block dimensions used for 1D and
 * 2D kernel launches - confirm against the kernels that use them.
 */
#define BLOCK_SIZE_1D 64
#define BLOCK_SIZE_2D 16

#define CP_BLOCK 256		// must be divisible by CP_BLOCK_SIZE and BLOCK_SIZE_xD (checked below)
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D

/* Enforce the divisibility requirement at compile time instead of relying on the comment. */
#if (CP_BLOCK % CP_BLOCK_SIZE) || (CP_BLOCK % BLOCK_SIZE_1D) || (CP_BLOCK % BLOCK_SIZE_2D)
# error "CP_BLOCK must be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_1D and BLOCK_SIZE_2D"
#endif


/*
 * Compile-time feature switches (uncomment to enable).
 *  VALIDATE_LSUM    - adds ACTION_COMPUTE_BASE_FRAGMENT to TAction
 *  VALIDATE_PEAK    - adds ACTION_COMPUTE_FRAGMENT and ACTION_GET_CORRECTIONS
 *                     to TAction
 *  USE_UNDOCUMENTED - declares mxCreateSharedDataCopy() at the end of this
 *                     header
 * NOTE(review): other source files may test these macros as well - check
 * before toggling.
 */
//#define VALIDATE_LSUM
//#define VALIDATE_PEAK
#define USE_UNDOCUMENTED

/*
 * Action codes of the normxcorr hardware module.
 *
 * Every enumerator carries an explicit numeric value, so the values do not
 * shift when the optional VALIDATE_LSUM / VALIDATE_PEAK actions are compiled
 * in or out.
 * NOTE(review): presumably these numbers are passed in by the caller (e.g.
 * from the Matlab side) and must therefore stay stable - confirm.
 *
 * The last enumerator intentionally has no trailing comma: strict C90 and
 * C++03 compilers reject it, and this header is meant to be usable from both
 * C and C++.
 */
typedef enum {
    ACTION_SETUP = 1,
#ifdef VALIDATE_LSUM
    ACTION_COMPUTE_BASE_FRAGMENT = 2,
#endif /* VALIDATE_LSUM */
    ACTION_SET_BASE_POINTS = 3,
    ACTION_COMPUTE_BASE = 4,
    ACTION_PREPARE = 5,
#ifdef VALIDATE_PEAK
    ACTION_COMPUTE_FRAGMENT = 11,
    ACTION_GET_CORRECTIONS = 15,
#endif /* VALIDATE_PEAK */
    ACTION_SET_POINTS = 12,
    ACTION_COMPUTE = 13,
    ACTION_GET_POINTS = 14
} TAction;

/*
 * Error codes of the normxcorr hardware module.
 * NOTE(review): the names suggest which subsystem failed (CUFFT, CUDA device
 * allocation, host allocation, CUDPP) - confirm against the code that
 * returns them.
 */
typedef enum {
    ERROR_CUFFT = 1,
    ERROR_CUDA_MALLOC = 2,
    ERROR_MALLOC = 3,
    ERROR_CUDPP = 4
} TError;

/*
 * Complete processing state of the normxcorr hardware module: control point
 * bookkeeping, image and FFT buffers, CUFFT/CUDPP plan handles and the size
 * parameters of the computation.
 *
 * NOTE(review): members prefixed with cuda_ are presumably device
 * allocations (only cuda_input_buffer is explicitly documented as such) -
 * confirm against the allocation code.
 */
struct STProcessingState {
    int stored;				// Flag indicating whether coordinates are already stored in coords
    mxArray *coords;			// Matlab array with current coordinates
    float *points;			// various information on control points
					// 	base_x, base_y, data_x, data_y
					//	base_frac_x, base_frac_y
					//	move_x, move_y

    uint8_t *banlist;			// control points banned from computations for various reasons

    uint8_t *input_buffer;		// Input Image buffer / Host
    uint8_t *cuda_input_buffer;		// Input Image buffer / Device

    cufftReal *cuda_base_buffer;	// Temporary buffer for FFT inputs, pre-zeroed
    cufftReal *cuda_data_buffer;	// Temporary buffer for FFT inputs, pre-zeroed

    void *cuda_temp_buffer;		// Main computational buffer, temporary

    cufftComplex *cuda_fft_buffer;	// Stored FFTs of the template image
    
    cufftReal *cuda_result_buffer;	// Temporary buffer for FFT outputs
    float *cuda_final_buffer;		// Ultimate output
    
    float *cuda_lsum_temp;		// Temporary buffer for local sum computation
    
    float *cuda_lsum_buffer;		// NOTE(review): presumably the computed local sums - confirm
    float *cuda_denom_buffer;		// NOTE(review): presumably the computed denominators - confirm
    
    float *cuda_cp;			// Various information on control points:
					// 0: data_x
					// 1: data_y
					// 2: sum
					// 3: denom

    int fft_initialized;		// Flag indicating if CUFFT plan is initialized
    cufftHandle cufft_plan;
    cufftHandle cufft_r2c_plan;
    cufftHandle cufft_c2r_plan;
    
    int cudpp_initialized;		// Flag indicating if CUDPP plan is initialized
    CUDPPHandle cudpp_plan;
    
    int mode;			// 1 - image mode, 0 - fragment mode
    int base_mode;		// 1 - image mode, 0 - fragment mode
    
    float minx,miny,maxx,maxy;	// Coordinates of the actually used rectangle of the image

    int ncp;			// Number of control points
    int ncp_alloc_size;		// NOTE(review): presumably the allocated capacity for control points - confirm

    int corr_size;		// CORR_SIZE 
    int precision;		// PRECISION
    
    int side_alloc_size;	// allocation size for 1 side of fft

    int fft_size;		// Matrix Size for FFT (base_size + input_size - 1)
    int fft_alloc_size;		// cuda optimized size2
    
    int subimage_size;		// Size of neighborhood (4*corr_size + 1)
    int lsum_size;		// Dimensions of local sums (2*corr_size + 1)
    int lsum_temp_size;		// Matrix Size for computing local sums
    int lsum_alloc_size;	// Size of allocated line to store lsum row
    int lsum_aligned_size;	// CUDA optimized lsum_temp_size
    int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1

};

// Convenience alias so the state can be referred to without the struct keyword.
typedef struct STProcessingState TProcessingState;

/*
 * EXTERN_C: linkage prefix for declarations shared between C and C++.
 * Expands to extern "C" when compiled as C++ and to plain extern otherwise.
 * An existing definition is kept untouched (NOTE(review): some platform
 * headers, e.g. <windows.h>, may already provide one - confirm).
 */
#if !defined(EXTERN_C)
# if defined(__cplusplus)
#  define EXTERN_C extern "C"
# else
#  define EXTERN_C extern
# endif
#endif

/*
 * Manual declaration of mxCreateSharedDataCopy(), only compiled in when
 * USE_UNDOCUMENTED is defined.
 * NOTE(review): presumably an undocumented MATLAB function that <mex.h> does
 * not declare, returning a copy of the array that shares its data with the
 * original - confirm against the MATLAB release in use.
 */
#ifdef USE_UNDOCUMENTED
EXTERN_C mxArray *mxCreateSharedDataCopy(const mxArray *pr);
#endif /* USE_UNDOCUMENTED */

#endif /* NORMXCORR_HW_H */