4
// Windows builds: provide C99-style fixed-width integer names as aliases of
// the Windows SDK integer types (UINT16, UINT32, INT16, INT32).
// NOTE(review): the header that declares those SDK types and the matching
// #endif are not visible in this extract -- confirm both exist in the full file.
// NOTE(review): uint8_t is used further down in this header but no alias for
// it is visible here -- confirm it is provided on Windows builds.
#if defined(_WIN32) || defined(_WIN64)
7
typedef UINT16 uint16_t;
8
typedef UINT32 uint32_t;
10
typedef INT16 int16_t;
11
typedef INT32 int32_t;
17
#include <cuda_runtime.h>
24
// Block-size constants.
// NOTE(review): presumably the CUDA thread-block edge lengths used for 1D
// (64 threads) and 2D (16 x 16 threads) kernel launches -- confirm against
// the launch sites, which are not part of this header extract.
#define BLOCK_SIZE_1D 64
25
#define BLOCK_SIZE_2D 16
27
// Disabled switch: remove the leading "//" to define VALIDATE_LSUM.
// NOTE(review): presumably enables a check of the local-sum computation --
// confirm where the macro is tested.
//#define VALIDATE_LSUM
33
// Action codes with explicit numeric values.
// NOTE(review): the enclosing enum declaration is not visible in this
// extract; presumably these select between processing the base (template)
// image and processing a fragment -- confirm against the dispatcher.
ACTION_COMPUTE_BASE = 10,
34
ACTION_COMPUTE_FRAGMENT = 11,
39
// Error code with explicit numeric value (enclosing enum not visible here).
// NOTE(review): presumably reported when a cudaMalloc call fails -- confirm.
ERROR_CUDA_MALLOC = 2,
44
// Processing state: working-buffer pointers, the size parameters that
// describe those buffers, and the cuFFT / CUDPP plan handles together with
// their "initialized" flags.
// NOTE(review): the cuda_ prefix suggests the pointers refer to device
// memory -- confirm at the allocation sites, which are not in this header.
struct STProcessingState {
45
cufftComplex *cuda_base_buffer; // Stored FFTs of the template image
46
cufftComplex *cuda_data_buffer; // Main computational buffer
47
cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs
48
cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs
49
float *cuda_final_buffer; // Ultimate output
50
uint8_t *cuda_input_buffer; // Input buffer
52
float *cuda_lsum_temp; // Temporary buffer for the local sum computation
54
float *cuda_lsum_buffer; // NOTE(review): presumably the computed local sums -- confirm
55
float *cuda_denom_buffer; // NOTE(review): presumably the normalization denominator -- confirm
59
uint16_t *cuda_nonzero_items; // NOTE(review): undocumented in the original; purpose to be confirmed
60
uint16_t *cuda_nonzero_buffer; // NOTE(review): undocumented in the original; purpose to be confirmed
63
int ncp; // Number of control points
64
int corr_size; // CORR_SIZE
66
int fft_size; // Matrix size for FFT (base_size + input_size - 1)
67
int fft_size2; // size * size
68
int fft_alloc_size; // CUDA optimized size2
69
int fft_inner_size; // size * (size/2 + 1), R2C/C2R
71
int subimage_size; // Size of neighborhood (4*corr_size + 1)
72
int lsum_size; // Dimensions of local sums (2*corr_size + 1)
73
int lsum_temp_size; // Matrix size for computing local sums
74
int lsum_alloc_size; // Size of allocated line to store lsum row
75
int lsum_aligned_size; // CUDA optimized lsum_temp_size
76
int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
78
int fft_initialized; // Flag indicating if CUFFT plan is initialized
79
cufftHandle cufft_plan; // NOTE(review): presumably the complex-to-complex plan -- confirm at plan creation
80
cufftHandle cufft_r2c_plan; // NOTE(review): presumably the real-to-complex plan -- confirm at plan creation
81
cufftHandle cufft_c2r_plan; // NOTE(review): presumably the complex-to-real plan -- confirm at plan creation
83
int cudpp_initialized; // Flag indicating if CUDPP plan is initialized
84
CUDPPHandle cudpp_plan; // CUDPP plan handle; NOTE(review): operation it is configured for is not visible here
87
// Alias so the state type can be written without the `struct` keyword.
typedef struct STProcessingState TProcessingState;
89
#endif /* NORMXCORR_HW_H */