1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H
/*
 * Fixed-width integer types (uint8_t ... int32_t).
 * On Windows builds they are declared as aliases of the <windows.h> integer
 * typedefs; on every other platform they come from <stdint.h>.
 * NOTE(review): presumably a workaround for older MSVC toolchains that did
 * not ship <stdint.h> -- confirm before simplifying this block.
 */
#if defined(_WIN32) || defined(_WIN64)
# include <windows.h>
typedef UINT8 uint8_t;
typedef UINT16 uint16_t;
typedef UINT32 uint32_t;
typedef INT8 int8_t;
typedef INT16 int16_t;
typedef INT32 int32_t;
#else
# include <stdint.h>
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cufft.h>
#include <cudpp.h>
// Block-size tuning constants.
// NOTE(review): presumably CUDA thread-block dimensions for 1D and 2D kernel
// launches -- confirm against the kernels that use them.
#define BLOCK_SIZE_1D 64
#define BLOCK_SIZE_2D 16
#define CP_BLOCK 256 // should be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_xD
// Both of the following are aliases of BLOCK_SIZE_2D.
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D
// Optional switches, disabled by default; uncomment a line to define the macro.
// NOTE(review): presumably they enable validation of the local-sum and peak
// results -- confirm where the macros are tested.
//#define VALIDATE_LSUM
//#define VALIDATE_PEAK
/*
 * Action codes.
 * The numeric values are assigned explicitly (1-4 and 10-12) and must stay
 * fixed, since callers may pass them as plain integers.
 * NOTE(review): presumably these select the operation requested from the
 * processing entry point -- confirm with the callers.
 */
typedef enum {
    ACTION_SETUP            = 1,
    ACTION_PREPARE          = 2,
    ACTION_SET_POINTS       = 3,
    ACTION_GET_POINTS       = 4,
    ACTION_COMPUTE_BASE     = 10,
    ACTION_COMPUTE_FRAGMENT = 11,
    ACTION_COMPUTE          = 12  /* last entry: no trailing comma (C89 / C++03 reject it) */
} TAction;
/*
 * Error codes.
 * Values are assigned explicitly (1-4) and must not be renumbered.
 * NOTE(review): the names suggest the failing subsystem (CUFFT, CUDA
 * allocation, host allocation, CUDPP) -- confirm at the sites that
 * return them.
 */
typedef enum {
    ERROR_CUFFT       = 1,
    ERROR_CUDA_MALLOC = 2,
    ERROR_MALLOC      = 3,
    ERROR_CUDPP       = 4
} TError;
/*
 * Processing state declared by this header: buffer pointers, control-point
 * data, size parameters, and the CUFFT / CUDPP plan handles with their
 * "initialized" flags.
 * Field order and types define the struct layout; do not reorder fields.
 * NOTE(review): the cuda_* pointers are presumably device allocations and the
 * unprefixed pointers host allocations -- confirm against the allocation code.
 */
struct STProcessingState {
cufftComplex *cuda_base_buffer; // Stored FFTs of the template image
void *cuda_data_buffer; // Main computational buffer, temporary
cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs, pre-zeroed
cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs
float *cuda_final_buffer; // Ultimate output
uint8_t *cuda_input_buffer; // Input buffer
float *cuda_lsum_temp; // Temporary buffer for local sum computation
float *cuda_lsum_buffer; // NOTE(review): presumably the computed local sums -- confirm
float *cuda_denom_buffer; // NOTE(review): presumably the normalization denominators -- confirm
float *cuda_cp; // Various information on control points:
// 0: data_x
// 1: data_y
// 2: sum
// 3: denom
float *data_x; // x coordinates of control points
float *data_y; // y coordinates of control points
uint8_t *input_buffer; // NOTE(review): presumably the host-side counterpart of cuda_input_buffer -- confirm
int ncp; // Number of control points
int ncp_alloc_size; // NOTE(review): presumably the allocated capacity for control points -- confirm
int corr_size; // CORR_SIZE
int side_alloc_size; // Allocation size for 1 side of the FFT
int fft_size; // Matrix size for FFT (base_size + input_size - 1)
int fft_alloc_size; // CUDA optimized size2 (NOTE(review): exact meaning of "size2" unclear -- confirm)
int subimage_size; // Size of neighborhood (4*corr_size + 1)
int lsum_size; // Dimensions of local sums (2*corr_size + 1)
int lsum_temp_size; // Matrix size for computing local sums
int lsum_alloc_size; // Size of allocated line to store lsum row
int lsum_aligned_size; // CUDA optimized lsum_temp_size
int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
int fft_initialized; // Flag indicating if CUFFT plan is initialized
cufftHandle cufft_plan; // CUFFT plan handle
cufftHandle cufft_r2c_plan; // CUFFT plan handle (NOTE(review): presumably real-to-complex -- confirm)
cufftHandle cufft_c2r_plan; // CUFFT plan handle (NOTE(review): presumably complex-to-real -- confirm)
int cudpp_initialized; // Flag indicating if CUDPP plan is initialized
CUDPPHandle cudpp_plan; // CUDPP plan handle
};
// Alias so the type can be named without the struct keyword.
typedef struct STProcessingState TProcessingState;
#endif /* NORMXCORR_HW_H */
|