bzr branch
http://suren.me/webbzr/normxcorr/trunk
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
1 |
#ifndef NORMXCORR_HW_H
|
2 |
#define NORMXCORR_HW_H
|
|
3 |
#include <cuda.h> |
|
4 |
#include <cuda_runtime.h> |
|
5 |
||
6 |
#include <cublas.h> |
|
7 |
#include <cufft.h> |
|
8 |
||
9 |
#include <cudpp.h> |
|
10 |
||
20
by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application |
11 |
#include "dict_image.h" |
12 |
||
22
by Suren A. Chilingaryan
Optimize image reduction |
13 |
#define TRANSPOSE_SIZE 64 // should be equal to the CPU cache line size |
14 |
||
10
by Suren A. Chilingaryan
Block computational kernels |
15 |
#define BLOCK_SIZE_1D 64
|
16 |
#define BLOCK_SIZE_2D 16
|
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
17 |
|
6
by Suren A. Chilingaryan
A little more computations are moved to CUDA |
18 |
#define CP_BLOCK 256 // should be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_xD |
19 |
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
|
|
20 |
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D
|
|
21 |
||
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
22 |
#define USE_UNDOCUMENTED
|
25
by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups |
23 |
//#define USE_SSE
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
24 |
|
25 |
struct STProcessingState { |
|
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
26 |
int stored; // flag indicating if we already have coordinates in coords stored |
11
by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches |
27 |
|
19
by Suren A. Chilingaryan
Provide stand-alone library |
28 |
unsigned char *banlist; // control points banned from computations for various reasons |
11
by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches |
29 |
|
30 |
||
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
31 |
float *points; // various information on control points |
32 |
// base_x, base_y, data_x, data_y
|
|
33 |
// base_frac_x, base_frac_y
|
|
34 |
// move_x, move_y
|
|
35 |
||
11
by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches |
36 |
|
37 |
float *cuda_points; // Various information on control points: |
|
38 |
// 0: data_x
|
|
39 |
// 1: data_y
|
|
40 |
// 2: sum
|
|
41 |
// 3: denom
|
|
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
42 |
|
19
by Suren A. Chilingaryan
Provide stand-alone library |
43 |
unsigned char *input_buffer; // Input Image buffer / Host |
44 |
unsigned char *cuda_input_buffer; // Input Image buffer / Device |
|
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
45 |
|
46 |
cufftReal *cuda_base_buffer; // Temporary buffer for FFT inputs, pre-zeroed |
|
47 |
cufftReal *cuda_data_buffer; // Temporary buffer for FFT inputs, pre-zeroed |
|
48 |
||
49 |
void *cuda_temp_buffer; // Main computational buffer, temporary |
|
50 |
||
11
by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches |
51 |
|
10
by Suren A. Chilingaryan
Block computational kernels |
52 |
cufftComplex *cuda_fft_cache; // Stored FFT's of the template image |
11
by Suren A. Chilingaryan
Enforce naming conventions for buffers and caches |
53 |
|
54 |
float *cuda_lsum_cache; // Cache of local sums |
|
55 |
float *cuda_denom_cache; // Cache of denoms |
|
56 |
||
57 |
float *cuda_lsum_temp; // Temporary buffer for local sum comp, first two pre-zeroed |
|
58 |
||
19
by Suren A. Chilingaryan
Provide stand-alone library |
59 |
|
60 |
float *res_x; // External points buffer |
|
61 |
float *res_y; // External points buffer |
|
62 |
||
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
63 |
|
64 |
int fft_initialized; // Flag indicating if CUFFT plan is initialized |
|
65 |
cufftHandle cufft_plan; |
|
66 |
cufftHandle cufft_r2c_plan; |
|
67 |
cufftHandle cufft_c2r_plan; |
|
68 |
||
69 |
int cudpp_initialized; // Flag indicating if CUDPP plan is initialized |
|
70 |
CUDPPHandle cudpp_plan; |
|
20
by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application |
71 |
|
72 |
int matlab_mode; // 2D stored transposed (i.e. by columns) |
|
73 |
||
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
74 |
int mode; // 1 - image mode, 0 - fragment mode |
75 |
int base_mode; // 1 - image mode, 0 - fragment mode |
|
76 |
||
77 |
float minx,miny,maxx,maxy; // Coordinates of the actually used rectangle of the image |
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
78 |
|
79 |
int ncp; // Number of control points |
|
7
by Suren A. Chilingaryan
FindPeak optimization |
80 |
int ncp_alloc_size; |
19
by Suren A. Chilingaryan
Provide stand-alone library |
81 |
|
82 |
int width; // Images width |
|
83 |
int height; // Images height |
|
7
by Suren A. Chilingaryan
FindPeak optimization |
84 |
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
85 |
int corr_size; // CORR_SIZE |
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
86 |
int precision; // PRECISION |
6
by Suren A. Chilingaryan
A little more computations are moved to CUDA |
87 |
|
88 |
int side_alloc_size; // allocation size for 1 side of fft |
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
89 |
|
90 |
int fft_size; // Matrix Size for FFT (base_size + input_size - 1) |
|
16
by Suren A. Chilingaryan
Optimize FFT size |
91 |
int fft_real_size; // Rounded to next power of 2 |
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
92 |
int fft_alloc_size; // cuda optimized size2 |
93 |
||
94 |
int subimage_size; // Size of neighborhood (4*corr_size + 1) |
|
95 |
int lsum_size; // Dimensions of local sums (2*corr_size + 1) |
|
96 |
int lsum_temp_size; // Matrix Size for computing local sums |
|
97 |
int lsum_alloc_size; // Size of allocated line to store lsum row |
|
98 |
int lsum_aligned_size; // CUDA optimized lsum_temp_size |
|
99 |
int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1 |
|
100 |
||
17
by Suren A. Chilingaryan
Precompute if side and base blocks amount is power of 2 |
101 |
int side_blocks_power; // Indicates if amount of side blocks is power of 2 |
102 |
int base_blocks_power; // Indicates if amount of base blocks is power of 2 |
|
20
by Suren A. Chilingaryan
Support for TIFF images in C code and stand-alone console application |
103 |
|
104 |
||
105 |
DICTImageType image_type; |
|
106 |
void *image_buf; // Temporary buffer for images (specific-format) |
|
107 |
unsigned char *image; // Reduced format |
|
21
by Suren A. Chilingaryan
Collection of timing information and fix for a crash in non-matlab mode |
108 |
|
109 |
#ifdef DICT_HW_MEASURE_TIMINGS
|
|
25
by Suren A. Chilingaryan
Count hardware initialization time, cmake scripts fixups |
110 |
int time[24]; // Timing counters (SHOULD BE LAST in struct!!!): |
21
by Suren A. Chilingaryan
Collection of timing information and fix for a crash in non-matlab mode |
111 |
// 0- 3: hardware, init, set_base_points, set_cur_points
|
112 |
// 4-11: comp_base, comp_image, (copy_fragment, load_fragment, preprocess, process, postprocess, reserve)
|
|
113 |
// 12-15: load_base_tiff, base_image_reduction, load_tiff, image_reduction
|
|
114 |
// 16: get_results
|
|
115 |
#endif /* DICT_HW_MEASURE_TIMINGS */ |
|
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
116 |
};
|
117 |
||
118 |
typedef struct STProcessingState TProcessingState; |
|
119 |
||
8
by Suren A. Chilingaryan
Complete elimination of cpcorr |
120 |
|
121 |
||
4
by Suren A. Chilingaryan
Instead of transfer compute local sums and denormals on board |
122 |
#endif /* NORMXCORR_HW_H */ |