/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#ifndef NORMXCORR_HW_H
#define NORMXCORR_HW_H

/*
 * Fixed-width integer types.
 * Non-Windows builds take them from <stdint.h>; Windows builds include
 * <windows.h> and alias its INTn/UINTn types to the C99 names.
 */
#if !defined(_WIN32) && !defined(_WIN64)
# include <stdint.h>
#else
# include <windows.h>
    /* signed */
    typedef INT8   int8_t;
    typedef INT16  int16_t;
    typedef INT32  int32_t;
    /* unsigned */
    typedef UINT8  uint8_t;
    typedef UINT16 uint16_t;
    typedef UINT32 uint32_t;
#endif

#include <cuda.h>
#include <cuda_runtime.h>

#include <cublas.h>
#include <cufft.h>

#include <cudpp.h>

/*
 * Block-size constants.
 * NOTE(review): the names suggest thread-block edge lengths for 1D and 2D
 * kernel launches - confirm against the kernels that use them.
 */
#define BLOCK_SIZE_1D 64
#define BLOCK_SIZE_2D 16

#define CP_BLOCK 256		// must be divisible by CP_BLOCK_SIZE and BLOCK_SIZE_xD
#define CP_BLOCK_SIZE BLOCK_SIZE_2D
#define SIDE_BLOCK_SIZE BLOCK_SIZE_2D

// Enforce the divisibility requirement stated above at compile time, so that
// retuning one of the constants cannot silently break it.
#if (CP_BLOCK % CP_BLOCK_SIZE) || (CP_BLOCK % BLOCK_SIZE_1D) || (CP_BLOCK % BLOCK_SIZE_2D)
# error "CP_BLOCK must be divisible by CP_BLOCK_SIZE, BLOCK_SIZE_1D and BLOCK_SIZE_2D"
#endif


//#define VALIDATE_LSUM
//#define VALIDATE_PEAK

/*
 * Action codes.
 * Values are explicit and non-contiguous (1-4, then 10-12); keep them stable,
 * since code outside this header may refer to them by number.
 * NOTE(review): presumably selects the operation requested from the
 * processing routine - confirm against the dispatcher.
 *
 * No trailing comma after the last enumerator: strict C89/C++03 compilers
 * reject it, and this matches TError.
 */
typedef enum {
    ACTION_SETUP = 1,
    ACTION_PREPARE = 2,
    ACTION_SET_POINTS = 3,
    ACTION_GET_POINTS = 4,
    ACTION_COMPUTE_BASE = 10,
    ACTION_COMPUTE_FRAGMENT = 11,
    ACTION_COMPUTE = 12
} TAction;

/*
 * Error codes (all non-zero).
 * NOTE(review): the names suggest the failing subsystem (CUFFT, CUDA
 * allocation, host allocation, CUDPP) - confirm against the code that
 * returns them.
 */
typedef enum {
    ERROR_CUFFT       = 1,
    ERROR_CUDA_MALLOC = 2,
    ERROR_MALLOC      = 3,
    ERROR_CUDPP       = 4
} TError;

/*
 * Processing state: buffer pointers, size parameters and CUFFT/CUDPP plan
 * handles kept together in one structure.
 *
 * NOTE(review): the cuda_ prefix presumably marks pointers to device memory,
 * and the unprefixed pointers (data_x, data_y, input_buffer) host memory -
 * confirm against the allocation code.
 * Field order defines the memory layout; do not reorder.
 */
struct STProcessingState {
    cufftComplex *cuda_base_buffer;	// Stored FFTs of the template image
    void *cuda_data_buffer;		// Main computational buffer, temporary
    cufftReal *cuda_temp_buffer;	// Temporary buffer for FFT inputs, pre-zeroed
    cufftReal *cuda_result_buffer;	// Temporary buffer for FFT outputs
    float *cuda_final_buffer;		// Ultimate output
    uint8_t *cuda_input_buffer;		// Input buffer
    
    float *cuda_lsum_temp;		// Temporary buffer for local sum computation
    
    float *cuda_lsum_buffer;		// presumably the computed local sums - TODO confirm
    float *cuda_denom_buffer;		// presumably the normalization denominators - TODO confirm
    
    float *cuda_cp;			// Various information on control points:
					// 0: data_x
					// 1: data_y
					// 2: sum
					// 3: denom
    
    float *data_x;			// x coordinates of control points
    float *data_y;			// y coordinates of control points
    uint8_t *input_buffer;		// presumably the host-side counterpart of cuda_input_buffer - TODO confirm

    int ncp;			// Number of control points
    int ncp_alloc_size;		// presumably ncp rounded up for allocation - TODO confirm

    int corr_size;		// CORR_SIZE 
    
    int side_alloc_size;	// Allocation size for 1 side of the FFT

    int fft_size;		// Matrix size for FFT (base_size + input_size - 1)
    int fft_alloc_size;		// CUDA optimized size2
    
    int subimage_size;		// Size of neighborhood (4*corr_size + 1)
    int lsum_size;		// Dimensions of local sums (2*corr_size + 1)
    int lsum_temp_size;		// Matrix size for computing local sums
    int lsum_alloc_size;	// Size of allocated line to store lsum row
    int lsum_aligned_size;	// CUDA optimized lsum_temp_size
    int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1

    int fft_initialized;	// Flag indicating if CUFFT plan is initialized
    cufftHandle cufft_plan;	// CUFFT plan handle
    cufftHandle cufft_r2c_plan;	// CUFFT plan handle; name suggests real-to-complex - TODO confirm
    cufftHandle cufft_c2r_plan;	// CUFFT plan handle; name suggests complex-to-real - TODO confirm
    
    int cudpp_initialized;	// Flag indicating if CUDPP plan is initialized
    CUDPPHandle cudpp_plan;	// CUDPP plan handle
};

// Convenience alias so the structure can be used without the struct keyword.
typedef struct STProcessingState TProcessingState;

#endif /* NORMXCORR_HW_H */