1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cufft.h>
#include <cudpp.h>
#include "dict_image.h"
// Compile-time tuning constants and feature switches for this module.
#define TRANSPOSE_SIZE 64 // should be equal to the CPU cache line size
#define BLOCK_SIZE_1D 64
#define BLOCK_SIZE_2D 16
#define CP_BLOCK 256 // should be divisible by CP_BLOCK_SIZE and BLOCK_SIZE_xD
// Both block sizes below are aliases of BLOCK_SIZE_2D.
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D
// NOTE(review): what USE_UNDOCUMENTED enables is not visible in this header — confirm at its use sites.
#define USE_UNDOCUMENTED
//#define USE_SSE
// Processing state of this module: host and device buffers, CUFFT/CUDPP
// plan handles with their "initialized" flags, image geometry, and the
// buffer/matrix sizes used by the FFT and local-sum computations.
// NOTE(review): allocation, ownership and lifetime of the pointers below
// are not visible in this header — confirm against the code that fills it.
struct STProcessingState {
int stored; // Flag indicating if we already have coordinates stored ("coords" in the original note is not a member of this struct — TODO confirm what it refers to)
unsigned char *banlist; // Control points banned from computations for various reasons
float *points; // Various information on control points:
// base_x, base_y, data_x, data_y
// base_frac_x, base_frac_y
// move_x, move_y
float *cuda_points; // Various information on control points:
// 0: data_x
// 1: data_y
// 2: sum
// 3: denom
// --- Image and FFT buffers ---
unsigned char *input_buffer; // Input image buffer / Host
unsigned char *cuda_input_buffer; // Input image buffer / Device
cufftReal *cuda_base_buffer; // Temporary buffer for FFT inputs, pre-zeroed
cufftReal *cuda_data_buffer; // Temporary buffer for FFT inputs, pre-zeroed
void *cuda_temp_buffer; // Main computational buffer, temporary
cufftComplex *cuda_fft_cache; // Stored FFTs of the template image
float *cuda_lsum_cache; // Cache of local sums
float *cuda_denom_cache; // Cache of denoms
float *cuda_lsum_temp; // Temporary buffer for local sum computation, first two pre-zeroed
float *res_x; // External points buffer
float *res_y; // External points buffer
// --- Library plans ---
int fft_initialized; // Flag indicating if CUFFT plan is initialized
cufftHandle cufft_plan;
cufftHandle cufft_r2c_plan;
cufftHandle cufft_c2r_plan;
int cudpp_initialized; // Flag indicating if CUDPP plan is initialized
CUDPPHandle cudpp_plan;
// --- Operating modes ---
int matlab_mode; // 2D data stored transposed (i.e. by columns)
int mode; // 1 - image mode, 0 - fragment mode
int base_mode; // 1 - image mode, 0 - fragment mode (presumably for the base image — TODO confirm)
float minx,miny,maxx,maxy; // Coordinates of the actually used rectangle of the image
// --- Geometry and sizes ---
int ncp; // Number of control points
int ncp_alloc_size;
int width; // Image width
int height; // Image height
int corr_size; // CORR_SIZE
int precision; // PRECISION
int side_alloc_size; // Allocation size for 1 side of FFT
int fft_size; // Matrix size for FFT (base_size + input_size - 1)
int fft_real_size; // Rounded to next power of 2
int fft_alloc_size; // CUDA optimized size2 (NOTE(review): "size2" presumably means size squared — confirm)
int subimage_size; // Size of neighborhood (4*corr_size + 1)
int lsum_size; // Dimensions of local sums (2*corr_size + 1)
int lsum_temp_size; // Matrix size for computing local sums
int lsum_alloc_size; // Size of allocated line to store lsum row
int lsum_aligned_size; // CUDA optimized lsum_temp_size
int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
int side_blocks_power; // Indicates if amount of side blocks is power of 2
int base_blocks_power; // Indicates if amount of base blocks is power of 2
DICTImageType image_type;
void *image_buf; // Temporary buffer for images (specific-format)
unsigned char *image; // Reduced format
#ifdef DICT_HW_MEASURE_TIMINGS
// Present only when DICT_HW_MEASURE_TIMINGS is defined, so the struct size
// depends on that macro; keep this member last, as the original note demands.
int time[24]; // Timing counters (SHOULD BE LAST in struct!!!):
// 0- 3: hardware, init, set_base_points, set_cur_points
// 4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
// 16 : get_results
// 17-23: not described here
#endif /* DICT_HW_MEASURE_TIMINGS */
};
// Convenience alias so the state can be named without the "struct" keyword.
typedef struct STProcessingState TProcessingState;
#endif /* NORMXCORR_HW_H */
|