/normxcorr/trunk : revision 4

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.h

Committer: Suren A. Chilingaryan
Date: 2009-12-02 05:08:22 UTC
Revision ID: csa@dside.dyndns.org-20091202050822-n6ouznm1zp2n2i5l

Compute local sums and denormals on board instead of transferring them

files added:
cuda/INFO

cuda/local_sum.cu

cuda/local_sum.h

cuda/local_sum_kernel.cu

cuda/normxcorr_hw.h

files modified:
automate_image.m

cuda/Makefile

cuda/normxcorr_hw.cu

cuda/normxcorr_hw_kernel.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.h

#ifndef NORMXCORR_HW_H

#define NORMXCORR_HW_H

// Fixed-width integer types.
// On Windows builds the 8/16/32-bit names are typedef'd from the integer
// types provided by <windows.h>; on every other platform <stdint.h> is
// included instead. Only the 8-, 16- and 32-bit variants are defined on the
// Windows path (no 64-bit typedefs here).
#if defined(_WIN32) || defined(_WIN64)

# include <windows.h>

typedef UINT8 uint8_t;

typedef UINT16 uint16_t;

typedef UINT32 uint32_t;

typedef INT8 int8_t;

typedef INT16 int16_t;

typedef INT32 int32_t;

#else

# include <stdint.h>

#endif

#include <cuda.h>

#include <cuda_runtime.h>

#include <cublas.h>

#include <cufft.h>

#include <cudpp.h>

// Block-size constants.
// NOTE(review): presumably the CUDA thread-block edge lengths used for 1D and
// 2D kernel launches -- the uses are not visible in this header; confirm in
// the .cu sources.
#define BLOCK_SIZE_1D 64

#define BLOCK_SIZE_2D 16

// Uncomment the line below to define VALIDATE_LSUM.
// NOTE(review): presumably enables validation of the local-sum computation;
// confirm against the code that tests this macro.
//#define VALIDATE_LSUM

/*
 * Action codes (TAction).
 *
 * The numeric values are explicit and must stay stable: 1 and 2 for the
 * setup/prepare actions, 10 and 11 for the compute actions.
 * NOTE(review): presumably these codes select the operation requested by the
 * caller of the hardware module -- the dispatch code is not visible in this
 * header; confirm in normxcorr_hw.cu.
 *
 * The last enumerator intentionally has no trailing comma: a trailing comma
 * is only permitted from C99 / C++11 onwards, and omitting it keeps this
 * enum consistent with TError.
 */
typedef enum {
    ACTION_SETUP = 1,
    ACTION_PREPARE = 2,
    ACTION_COMPUTE_BASE = 10,
    ACTION_COMPUTE_FRAGMENT = 11
} TAction;

/*
 * Error codes (TError).
 *
 * Values are assigned explicitly (1..4) and no enumerator has the value 0.
 * NOTE(review): the per-value meanings below are inferred from the names
 * only; where the codes are produced and consumed is not visible in this
 * header.
 */
typedef enum {
    ERROR_CUFFT       = 1,  /* presumably: a cuFFT call failed */
    ERROR_CUDA_MALLOC = 2,  /* presumably: a device allocation failed */
    ERROR_MALLOC      = 3,  /* presumably: a host allocation failed */
    ERROR_CUDPP       = 4   /* presumably: a CUDPP call failed */
} TError;

// Processing state shared by the normxcorr hardware code: pointers to the
// working buffers, the size parameters they are dimensioned with, and the
// cuFFT / CUDPP plan handles together with their "initialized" flags.
// NOTE(review): the cuda_ prefix presumably marks device-memory pointers;
// the allocation code is not visible in this header -- confirm.
// Member order defines the struct layout; do not reorder.
struct STProcessingState {

cufftComplex *cuda_base_buffer; // Stored FFT's of the template image

cufftComplex *cuda_data_buffer; // Main computational buffer

cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs

cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs

float *cuda_final_buffer; // Ultimate output

uint8_t *cuda_input_buffer; // Input buffer

float *cuda_lsum_temp; // Temporary buffer for local sum comp.

float *cuda_lsum_buffer; // NOTE(review): presumably the computed local sums -- confirm

float *cuda_denom_buffer; // NOTE(review): presumably denominators for the normalization -- confirm

int *grid_size; // NOTE(review): purpose not evident from this header -- confirm

uint16_t *cuda_nonzero_items; // NOTE(review): purpose not evident from this header -- confirm

uint16_t *cuda_nonzero_buffer; // NOTE(review): purpose not evident from this header -- confirm

int ncp; // Number of control points

int corr_size; // CORR_SIZE

int fft_size; // Matrix Size for FFT (base_size + input_size - 1)

int fft_size2; // size * size

int fft_alloc_size; // cuda optimized size2

int fft_inner_size; // size * (size/2 + 1), R2C/C2R

int subimage_size; // Size of neighborhood (4*corr_size + 1)

int lsum_size; // Dimensions of local sums (2*corr_size + 1)

int lsum_temp_size; // Matrix Size for computing local sums

int lsum_alloc_size; // Size of allocated line to store lsum row

int lsum_aligned_size; // CUDA optimized lsum_temp_size

int lsum_short_aligned_size;// CUDA optimized lsum_temp_size - lsum_size - 1

int fft_initialized; // Flag indicating if CUFFT plan is initialized

cufftHandle cufft_plan; // cuFFT plan handle (NOTE(review): transform type not visible here)

cufftHandle cufft_r2c_plan; // cuFFT plan handle; presumably real-to-complex -- confirm

cufftHandle cufft_c2r_plan; // cuFFT plan handle; presumably complex-to-real -- confirm

int cudpp_initialized; // Flag indicating if CUDPP plan is initialized

CUDPPHandle cudpp_plan; // CUDPP plan handle guarded by cudpp_initialized

};

// Convenience alias so the state can be referred to as TProcessingState.
typedef struct STProcessingState TProcessingState;

#endif /* NORMXCORR_HW_H */

Older »