1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
|
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cufft.h>
#include <cudpp.h>
#include "dict_image.h"
// Build-time configuration switches and tuning constants for this header.
// When DICT_SUPPORT_THREADS is defined, "hw_sched.h" is included below and
// the DICT context gains an HWSched member.
#define DICT_SUPPORT_THREADS
#define TRANSPOSE_SIZE 64 // should be equal to the CPU cache line size
// NOTE(review): BLOCK_SIZE_1D / BLOCK_SIZE_2D are presumably CUDA thread-block
// dimensions for 1D and 2D kernels; confirm against the kernel launch sites.
#define BLOCK_SIZE_1D 64
#define BLOCK_SIZE_2D 16
#define CP_BLOCK 256 // should be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_xD
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D
#define MODE_COEFFICIENT 0 // 0 - always fragment, infinity - always image
// NOTE(review): the effect of USE_UNDOCUMENTED is not visible in this header;
// check the implementation files before removing or changing it.
#define USE_UNDOCUMENTED
// NOTE(review): presumably an amount of device memory held in reserve;
// confirm how the implementation uses it.
#define CUDA_EXTRA_MEMORY 67108864 // 64MB (64 * 1024 * 1024 bytes)
//#define USE_SSE
#ifdef DICT_SUPPORT_THREADS
# include "hw_sched.h"
#endif /* DICT_SUPPORT_THREADS */
// Processing state: image and control-point buffers, FFT / CUDPP plan handles
// and the derived size parameters used by the computation.
// NOTE(review): presumably one instance exists per device / worker (the DICT
// context holds an array of these handles); confirm in the implementation.
//
// Layout matters: per the comment in front of `matlab_mode`, the members are
// split into a zeroed prefix and a non-zeroed tail. Do not reorder members or
// insert new ones without checking the code that initializes this struct.
struct STProcessingState {
int stored; // flag indicating if we already have coordinates in coords stored
unsigned char *banlist; // control points banned from computations for various reasons
float *points; // various information on control points, reference to DICTContext
int points_alloc_size; // equal to ncp_alloc_size of DICTContext
float *cuda_points; // Various information on control points:
// 0: data_x
// 1: data_y
// 0: sum
// 1: denom
// NOTE(review): indices 0 and 1 are each listed twice above, so the actual
// layout of cuda_points is ambiguous from this header; confirm it against
// the kernels that read and write the buffer.
// NOTE(review): the next two are presumably device-side copies of the base
// and current images (by analogy with the cuda_ prefix used below); confirm.
unsigned char *cuda_base_image;
unsigned char *cuda_image;
unsigned char *input_buffer; // Input Image buffer / Host
unsigned char *cuda_input_buffer; // Input Image buffer / Device
cufftReal *cuda_base_buffer; // Temporary buffer for FFT inputs, pre-zeroed
cufftReal *cuda_data_buffer; // Temporary buffer for FFT inputs, pre-zeroed
void *cuda_temp_buffer; // Main computational buffer, temporary
cufftComplex *cuda_fft_cache; // Stored FFTs of the template image
float *cuda_lsum_cache; // Cache of local sums
float *cuda_denom_cache; // Cache of denoms
float *cuda_lsum_temp; // Temporary buffer for local sum computation, first two pre-zeroed
float *res_x; // External points buffer, reference to DICTContext
float *res_y; // External points buffer, reference to DICTContext
int fft_initialized; // Flag indicating if CUFFT plan is initialized
// NOTE(review): three separate plan handles are kept; the r2c / c2r names
// suggest real-to-complex and complex-to-real transforms. Confirm which
// plans fft_initialized actually covers.
cufftHandle cufft_plan;
cufftHandle cufft_r2c_plan;
cufftHandle cufft_c2r_plan;
int cudpp_initialized; // Flag indicating if CUDPP plan is initialized
CUDPPHandle cudpp_plan;
int status; // Error code, 0 if operable
int width; // Images width
int height; // Images height
// The non-zeroed parameters start here, so matlab_mode must stay first among them.
// NOTE(review): this implies everything above is zeroed in bulk up to the
// offset of matlab_mode; confirm in the initialization code before
// moving any member across this boundary.
int matlab_mode; // 2D stored transposed (i.e. by columns)
int use_cache; // 1 - base image caching is enabled or not
int mode; // 1 - image mode, 0 - fragment mode
int base_mode; // 1 - image mode, 0 - fragment mode
float minx,miny,maxx,maxy; // Coordinates of the actually used rectangle of the image
float base_minx, base_miny, base_maxx, base_maxy; // Coordinates of the actually used rectangle of the base image
int ncp; // Number of control points
int ncp_alloc_size;
int corr_size; // CORR_SIZE
int precision; // PRECISION
int side_alloc_size; // allocation size for 1 side of fft
int fft_size; // Matrix Size for FFT (base_size + input_size - 1)
int fft_real_size; // Rounded to next power of 2
int fft_alloc_size; // cuda optimized size2
int subimage_size; // Size of neighborhood (4*corr_size + 1)
int lsum_size; // Dimensions of local sums (2*corr_size + 1)
int lsum_temp_size; // Matrix Size for computing local sums
int lsum_alloc_size; // Size of allocated line to store lsum row
int lsum_aligned_size; // CUDA optimized lsum_temp_size
int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1
int side_blocks_power; // Indicates if amount of side blocks is power of 2
int base_blocks_power; // Indicates if amount of base blocks is power of 2
};
// Value-type alias for the processing state struct.
typedef struct STProcessingState TProcessingState;
// Handle type: NOTE that ProcessingState is itself a POINTER to the struct.
typedef struct STProcessingState *ProcessingState;
// Top-level DICT context: image geometry, host-side image and control-point
// buffers, the optional scheduler and the array of processing-state handles.
//
// Layout matters:
//  - per the comment in front of `matlab_mode`, the members are split into a
//    zeroed prefix and a non-zeroed tail;
//  - `sched` exists only when DICT_SUPPORT_THREADS is defined, and `time`
//    only when DICT_HW_MEASURE_TIMINGS is defined, so every translation unit
//    must see the same macro settings to agree on the struct layout;
//  - `time` must remain the last member (see its comment).
struct STDICTContext {
int ncp; // NOTE(review): presumably number of control points (cf. ncp in STProcessingState)
int ncp_alloc_size;
int ncp_per_device;
int width; // Images width
int height; // Images height
int use_threads; // Run in multithreaded mode
DICTImageType image_type;
void *image_buf; // Temporary buffer for images (specific-format)
unsigned char *image; // Reduced format
unsigned char *base_image; // Stored base image
float *points; // various information on control points
// base_x, base_y, data_x, data_y
// base_frac_x, base_frac_y
// move_x, move_y
float *res_x; // External points buffer
float *res_y; // External points buffer
// The non-zeroed parameters start here, so matlab_mode must stay first among them.
// NOTE(review): this implies everything above is zeroed in bulk up to the
// offset of matlab_mode; confirm in the initialization code before
// moving any member across this boundary.
int matlab_mode; // 2D stored transposed (i.e. by columns)
#ifdef DICT_SUPPORT_THREADS
HWSched sched;
#endif /* DICT_SUPPORT_THREADS */
// ProcessingState is a pointer typedef, so this is a pointer to an array of
// state handles (pointer-to-pointer).
// NOTE(review): presumably one entry per device / worker; confirm.
ProcessingState *pstates;
// Just parameters to be passed to thread functions
int res_do_copy;
const unsigned char *param_img;
#ifdef DICT_HW_MEASURE_TIMINGS
int time[24]; // Timing counters (SHOULD BE LAST in struct!!!):
// 0- 3: hardware, init, set_base_points, set_cur_points
// 4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
// 16: get_results
// Slots 17-23 are not described by the list above.
#endif /* DICT_HW_MEASURE_TIMINGS */
};
// Value-type alias for the DICT context struct.
typedef struct STDICTContext TDICTContext;
#endif /* NORMXCORR_HW_H */
|