/normxcorr/trunk : contents of cuda/normxcorr

: (revision 15)

To get this branch, use:

bzr branch http://suren.me/webbzr/normxcorr/trunk

#include <stdio.h>
#include <stdlib.h>

#include "normxcorr_hw.h"
#include "local_sum.h"

#include "normxcorr_hw_msg.h"
#include "normxcorr_hw_kernel.cu"

/*
 * Maximum / minimum helpers built on the ternary operator.
 * Being macros, they evaluate their arguments more than once, so arguments
 * must be free of side effects.
 */
#define max4(a,b,c,d) max2(max2(a,b),max2(c,d))
#define max3(a,b,c) max2(max2(a,b),c)
#define max2(a,b) (((a)>(b))?(a):(b))
#define min2(a,b) (((a)<(b))?(a):(b))

/*
 * calc_blocks(size, rounding): number of chunks of `rounding` elements needed
 *   to cover `size` elements (integer division rounded up).
 * calc_alloc(size, rounding): `size` rounded up to the next multiple of
 *   `rounding`, i.e. calc_blocks(size, rounding) * rounding.
 */
#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))
#define calc_blocks(size,rounding) (((size)/(rounding)) + (((size)%(rounding))?1:0))

/*
 * De Bruijn lookup table. In this file it is indexed with
 * ((uint32_t)v * 0x077CB531) >> 27 after v has been checked to be a power of
 * two (v & (v - 1) == 0); the looked-up value is then passed to the "Fast"
 * kernel variants as the block-count exponent.
 */
static const char debruijn[32] = {
    0,  1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
    31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10, 9
};

/*
 * The single processing state shared by all mexFunction calls. It is created
 * by the initialization call and released by the clean request or selfClean().
 */
static TProcessingState *pstate = NULL;

/*
 * Releases every resource referenced by the processing state (Matlab matrix,
 * host buffers, device buffers, CUDPP and cuFFT plans) and then zeroes the
 * complete structure, including its configuration fields.
 *
 * Every member is tested before it is released, therefore the function may be
 * called on a partially initialized (or already zeroed) state.
 */
static void fftFree(TProcessingState *ps) {
    // Matlab matrix with the stored coordinates and the host-side ban list
    if (ps->coords) {
        mxDestroyArray(ps->coords);
    }
    if (ps->banlist) {
        free(ps->banlist);
    }

    // Local sum scratch space
    if (ps->cuda_lsum_temp) {
        cudaFree(ps->cuda_lsum_temp);
    }

    // Caches holding per control point data
    if (ps->cuda_lsum_cache) {
        cudaFree(ps->cuda_lsum_cache);
    }
    if (ps->cuda_denom_cache) {
        cudaFree(ps->cuda_denom_cache);
    }
    if (ps->cuda_fft_cache) {
        cudaFree(ps->cuda_fft_cache);
    }

    // Packed real-valued buffers
    if (ps->cuda_data_buffer) {
        cudaFree(ps->cuda_data_buffer);
    }
    if (ps->cuda_base_buffer) {
        cudaFree(ps->cuda_base_buffer);
    }

    // Temporary and input staging buffers (device and page-locked host)
    if (ps->cuda_temp_buffer) {
        cudaFree(ps->cuda_temp_buffer);
    }
    if (ps->cuda_input_buffer) {
        cudaFree(ps->cuda_input_buffer);
    }
    if (ps->input_buffer) {
        cudaFreeHost(ps->input_buffer);
    }

    // Control point coordinates (device and page-locked host)
    if (ps->cuda_points) {
        cudaFree(ps->cuda_points);
    }
    if (ps->points) {
        cudaFreeHost(ps->points);
    }

    // DS: Source of bug, that occasionally can corrupt something ...
    if (ps->cudpp_initialized) {
        cudppDestroyPlan(ps->cudpp_plan);
    }

    // Both cuFFT plans are created together, so they are destroyed together
    if (ps->fft_initialized) {
        cufftDestroy(ps->cufft_r2c_plan);
        cufftDestroy(ps->cufft_c2r_plan);
    }

    // Leave a clean structure behind: all pointers, flags and sizes are reset
    memset(ps, 0, sizeof(*ps));
}

#include <unistd.h>
/*
 * Creates the cuFFT / CUDPP plans and allocates all host and device buffers
 * for the configuration already stored in `ps` (ncp, fft_size and the various
 * *_alloc_size fields are expected to be set by the caller).
 *
 * Returns 0 on success, or one of ERROR_CUFFT, ERROR_CUDPP, ERROR_CUDA_MALLOC,
 * ERROR_MALLOC on failure. Except for a failure while creating the cuFFT
 * plans, fftFree() is called on failure, which also zeroes the configuration.
 */
static int fftInit(TProcessingState *ps) {
    CUDPPConfiguration cudpp_config;

    CUDPPResult cudpp_err;
    cufftResult cufft_err;
    cudaError cuda_err;

    int size;
    int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
    int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;

    // Byte size of one per-control-point cache of `fft_alloc_size` floats,
    // computed in size_t to avoid overflowing int arithmetic
    size_t cache_elements = (size_t)ps->ncp * ps->fft_alloc_size;

    cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);
    if (cufft_err) {
        reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);
        return ERROR_CUFFT;
    }

    cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_size, ps->fft_size, CUFFT_C2R);
    if (cufft_err) {
        reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);
        cufftDestroy(ps->cufft_r2c_plan);
        return ERROR_CUFFT;
    }

    ps->fft_initialized = true;

    cudpp_config.algorithm = CUDPP_SCAN;
    cudpp_config.options = CUDPP_OPTION_FORWARD |  CUDPP_OPTION_INCLUSIVE;
    cudpp_config.op = CUDPP_ADD;
    cudpp_config.datatype = CUDPP_FLOAT;

    cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
    if (cudpp_err != CUDPP_SUCCESS) {
        reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
        fftFree(ps);
        return ERROR_CUDPP;
    }

    ps->cudpp_initialized = true;

    cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, cache_elements * sizeof(cufftComplex));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    // The temporary buffer is shared by several processing stages; it has to
    // fit the largest of them. In the FFT stage fftLoadFragment() stores the
    // spectrum of control point i at offset (1 + i) * fft_alloc_size complex
    // elements, hence (1 + CP_BLOCK) slots of fft_alloc_size are required.
    size = max3(
        (1 + CP_BLOCK) * ps->fft_alloc_size * sizeof(cufftComplex),           /* FFT multiplication */
        2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t),                 /* Sum, Std computations */
        CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float))    /* Max of correlation */
    );

    cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);
    if (cuda_err) {
        reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));
    if (!ps->banlist) {
        reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);
        fftFree(ps);
        return ERROR_MALLOC;
    }
    // All control points start banned; they are enabled once a base fragment is loaded
    memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));

    // Host array of 8 rows of ncp_alloc_size floats: base x/y, data x/y,
    // fractional x/y and computed move x/y (see the users of ps->points)
    cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);
    if (cuda_err) {
        reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));
    if (cuda_err) {
        reportError("Device memory allocation of 2*%u*float bytes for cuda_points is failed", ps->ncp_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
    if (cuda_err) {
        reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_base_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

    cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, cache_elements * sizeof(float) + lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, cache_elements * sizeof(float) + lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
        reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);
        fftFree(ps);
        return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2  * sizeof(float));
    if (cuda_err) {
        reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);
        fftFree(ps);
        // Device allocation failure, reported like all other cudaMalloc failures
        return ERROR_CUDA_MALLOC;
    }
    // We need to zero temporary buffers as well, since we are not computing
    // cumsum of complete matrix, but non-zero part of it
    cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));

    // Result matrix (ncp x 2 singles); made persistent so that it survives
    // between MEX calls
    ps->coords = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);
    if (ps->coords) {
        mexMakeArrayPersistent(ps->coords);
    } else {
        reportError("Allocation of Matlab matrix of size %u*float bytes is failed", ps->ncp);
        fftFree(ps);
        return ERROR_MALLOC;
    }

    return 0;
}

/*
 * Handler of the ACTION_PREPARE request. Currently an empty placeholder:
 * it performs no work and leaves the processing state untouched.
 */
static void fftPrepare(TProcessingState *ps) {
}



/*
 * Destroys a processing state created by pstateInit(): releases all resources
 * it references and then the structure itself. A NULL pointer is ignored.
 */
void pstateFree(TProcessingState *ps) {
    if (!ps) return;

    fftFree(ps);
    free(ps);
}

/*
 * Allocates a new, zero-filled processing state.
 * Returns NULL if the host allocation fails.
 */
TProcessingState *pstateInit() {
    TProcessingState *state = (TProcessingState*)malloc(sizeof(TProcessingState));

    if (!state) {
        return NULL;
    }

    memset(state, 0, sizeof(TProcessingState));
    return state;
}

/*
 * Loads the base-image fragments of `ncp` control points starting at index
 * `icp` and fills the per-control-point caches (local sums, denominators and
 * the FFT of the packed fragment).
 *
 *   ps    - processing state configured by ACTION_SETUP
 *   icp   - index of the first control point of this block
 *   ncp   - number of control points in this block (callers pass <= CP_BLOCK)
 *   image - uint8 Matlab matrix with the base image (column-major)
 *
 * Fragments have a side of 4*corr_size + 1 pixels around the (1-based)
 * coordinates stored in rows 0/1 of ps->points. The fractional parts of the
 * coordinates, at the configured precision, are stored in rows 4/5.
 * Control points whose fragment does not fit into the image are marked in the
 * ban list; the others are enabled.
 *
 * Uploading of fragment i is overlapped with processing of fragment i - 1
 * using two CUDA streams, which is why the main loop runs ncp + 1 times.
 *
 * Always returns 0.
 */
static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
    int width = mxGetN(image);
    int height = mxGetM(image);

    int check_mode = ((ps->base_mode)&&(!ps->mode));
    float minx = 0, miny = 0, maxx = 0, maxy = 0;

    int precision = ps->precision;

    int half_size = 2 * ps->corr_size;
    int size = 2 * half_size + 1;

    int fft_size = ps->fft_size;

    int ncp_alloc = ps->ncp_alloc_size;
    int alloc_size = ps->fft_alloc_size;
    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

    uint8_t *banlist = ps->banlist + icp;

    float *data_x = ps->points + icp;
    float *data_y = data_x + ncp_alloc;

    float *frac_x = ps->points + 4 * ncp_alloc + icp;
    float *frac_y = frac_x + ncp_alloc;

    uint8_t *fullimg = ((uint8_t*)mxGetData(image));
    uint8_t *img = ps->input_buffer;

    float *lsum_temp = (float*)ps->cuda_lsum_temp;
    int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

    // The bounding box of all used fragments is tracked only in check mode
    if (check_mode) {
        minx = ps->minx;
        maxx = ps->maxx;
        miny = ps->miny;
        maxy = ps->maxy;
    }

    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
    cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
    cufftComplex *cache = ps->cuda_fft_cache +  icp * alloc_size;
    float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
    float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;

    int blocks = calc_blocks(size, BLOCK_SIZE_1D);
    int base_blocks = blocks * blocks * BLOCK_SIZE_1D;

    // If the number of blocks is a power of two, pass its exponent to the
    // "Fast" kernel variant; otherwise (-1) use the generic kernel
    char blocks_power;
    if ((blocks&(blocks-1))) {
        blocks_power = -1;
    } else {
        blocks_power = debruijn[((uint32_t)blocks * 0x077CB531) >> 27];
    }

    int lsum_size = ps->lsum_size;
    int lsum_alloc = ps->lsum_alloc_size;

    cudaStream_t stream[2];
    for (int i = 0; i < 2; ++i) {
        cudaStreamCreate(&stream[i]);
    }

    // NOTE(review): both streams share the same lsum_temp scratch buffers;
    // presumably consecutive control points could interfere if their kernels
    // really overlap - confirm against local_sum() and the kernels.
    for (int i = 0;i <= ncp;i++) {
      // Stage 1: upload fragment i (if there is one left)
      if (i < ncp) {
        float x = data_x[i] - 1;
        float y = data_y[i] - 1;

        frac_x[i] = x - round(x * precision) / precision;
        frac_y[i] = y - round(y * precision) / precision;

        int xstart = roundf(x) - half_size;
        int ystart = roundf(y) - half_size;

        int xend = xstart + size;
        int yend = ystart + size;

        if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
            // The fragment does not fit into the image: ban the control point,
            // but still fall through to stage 2 so that the previous control
            // point gets processed
            banlist[i] = 1;
        } else {
            if (check_mode) {
                if (xstart < minx) minx = xstart;
                if (ystart < miny) miny = ystart;
                if (xend > maxx) maxx = xend;
                if (yend > maxy) maxy = yend;
            }

            // Gather the fragment into the page-locked staging buffer ...
            cudaMemcpy2D(
                img + i * alloc_size,
                size * sizeof(uint8_t),
                fullimg + (xstart * height + ystart),
                height * sizeof(uint8_t),
                size * sizeof(uint8_t),
                size,
                cudaMemcpyHostToHost
            );

            // ... and upload it asynchronously in the stream of this control point
            cudaMemcpy2DAsync(
                cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
                img + i * alloc_size, size * sizeof(uint8_t),
                size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,
                stream[i%2]
            );

            banlist[i] = 0;
        }
      }

      // Stage 2: process fragment i - 1 in its own stream
      if (i > 0) {
        int j = i - 1;

        if (blocks_power < 0) {
            vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
                cuda_input_buffer + j * side_alloc2, side_alloc,
                cuda_base_buffer + j*alloc_size, fft_size,
                lsum_temp + lsum_size * (lsum_alloc + 1),
                lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
                lsum_alloc,
                size, blocks
            );
        } else {
            // Launch configuration: <<<grid, block, shared memory bytes, stream>>>
            vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
                cuda_input_buffer + j * side_alloc2, side_alloc,
                cuda_base_buffer + j*alloc_size, fft_size,
                lsum_temp + lsum_size * (lsum_alloc + 1),
                lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
                lsum_alloc,
                size, blocks_power
            );
        }

        // In general we should expect non-zero denominals, therefore the Nonzero array is not computed
        local_sum(ps,
            lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
            lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
            lsum_temp, lsum_temp + lsum_step,
            stream[j%2]);
      }
    }

    // Forward FFTs of all packed base fragments, results go to the FFT cache
    for (int j = 0;j < ncp;j++) {
        cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
    }

    for (int i = 0; i < 2; ++i) {
        cudaStreamDestroy(stream[i]);
    }

    if (check_mode) {
        ps->minx = minx;
        ps->maxx = maxx;
        ps->miny = miny;
        ps->maxy = maxy;
    }

    return 0;
}

/*
 * Correlates the data-image fragments of `ncp` control points, starting at
 * index `icp`, with the cached base fragments and stores the position found
 * for every control point in ps->cuda_points (row 0 and row 1).
 *
 *   ps    - processing state with base caches filled by fftLoadBaseFragment()
 *   icp   - index of the first control point of this block
 *   ncp   - number of control points in this block (callers pass <= CP_BLOCK)
 *   image - uint8 Matlab matrix with the data image (column-major)
 *
 * Fragments have a side of 2*corr_size + 1 pixels around the current (1-based)
 * coordinates, taken from ps->coords once results were stored, or from rows
 * 2/3 of ps->points otherwise. Control points which are already banned, or
 * whose fragment does not fit into the image, are (kept) banned and skipped
 * by the per-point FFT calls.
 *
 * Always returns 0.
 */
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
    int width = mxGetN(image);
    int height = mxGetM(image);

    int half_size = ps->corr_size;
    int size = 2 * half_size + 1;
    int size2 = size * size;

    int fft_size = ps->fft_size;
    int fft_size2 = fft_size * fft_size;

    int ncp_alloc = ps->ncp_alloc_size;
    int alloc_size = ps->fft_alloc_size;
    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

    float *data_x, *data_y;
    if (ps->stored) {
        data_x = ((float*)mxGetData(ps->coords)) + icp;
        data_y = data_x + ps->ncp;
    } else {
        data_x = ps->points + 2 * ncp_alloc + icp;
        data_y = data_x + ncp_alloc;
    }

    uint8_t *fullimg = ((uint8_t*)mxGetData(image));

    uint8_t *img = ps->input_buffer;
    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
    float *cuda_data_buffer = ps->cuda_data_buffer;

    uint8_t *banlist = ps->banlist + icp;

    // Gather the fragments into the staging buffer, densely packed
    // (size * size bytes per control point)
    for (int i = 0;i < ncp;i++) {
        float x = data_x[i] - 1;
        float y = data_y[i] - 1;

        int xstart = roundf(x) - half_size;
        int ystart = roundf(y) - half_size;

        int xend = xstart + size;
        int yend = ystart + size;

        if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
            banlist[i] = 1;
            continue;
        }

        cudaMemcpy2D(
            img + i * size2,
            size * sizeof(uint8_t),
            fullimg + (xstart * height + ystart),
            height * sizeof(uint8_t),
            size * sizeof(uint8_t),
            size,
            cudaMemcpyHostToHost
        );
    }

    // Upload all fragments at once; on the device every fragment occupies a
    // side_alloc * side_alloc slot
    cudaMemcpy3DParms copy_params = { 0 };
    copy_params.dstPtr   = make_cudaPitchedPtr(
        cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
    );
    copy_params.srcPtr   = make_cudaPitchedPtr(
        img, size * sizeof(uint8_t), size, size
    );
    copy_params.extent   = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
    copy_params.kind     = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&copy_params);

    dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

    int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
    int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
    int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
    int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
    int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;

    // Computing sum and std
    int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

    float *sumbuf = ps->cuda_points + icp;
    float *stdbuf = ps->cuda_points + ps->ncp_alloc_size + icp;

    dim3 stat_grid_dim(side_blocks, cp_blocks, 1);
    stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);
    stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

    // Packing input data for FFT
    dim3 input_grid_dim(input_blocks, cp_blocks, 1);

    // If the number of side blocks is a power of two, pass its exponent to
    // the "Fast" kernel variant; otherwise (-1) use the generic kernel
    char side_blocks_power;
    if ((side_blocks&(side_blocks-1))) {
        side_blocks_power = -1;
    } else {
        side_blocks_power = debruijn[((uint32_t)side_blocks * 0x077CB531) >> 27];
    }

    if (side_blocks_power < 0) {
        vecPack<<<input_grid_dim, block_side_cp>>>(
            cuda_input_buffer, side_alloc2, side_alloc,
            cuda_data_buffer, alloc_size, fft_size,
            size, side_blocks
        );
    } else {
        vecPackFast<<<input_grid_dim, block_side_cp>>>(
            cuda_input_buffer, side_alloc2, side_alloc,
            cuda_data_buffer, alloc_size, fft_size,
            size, side_blocks_power
        );
    }

    // Performing FFT's. The spectra start one alloc_size slot into the
    // temporary buffer (see the in-place remark below)
    cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

    for (int i = 0;i < ncp;i++) {
        if (banlist[i]) continue;
        cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);
    }

    // Multiplication with the cached spectra of the base fragments
    int complex_blocks = calc_blocks(fft_size * (fft_size / 2 + 1), SIDE_BLOCK_SIZE);
    dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);
    vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_size/2+1);

    // First in-place transform for some reason is failing, therefore we
    // have one alloc_size spacing between starts
    cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
    for (int i = 0;i < ncp;i++) {
        if (banlist[i]) continue;
        cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size,  cuda_result_buffer + i * alloc_size);
    }

    float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

    // The normalisation factors are computed in double precision: the integer
    // products fft_size2 * (size2 - 1) and size2 * (size2 - 1) can exceed the
    // range of int for larger correlation sizes
    int fft2_blocks = calc_blocks(fft_size*fft_size, SIDE_BLOCK_SIZE);
    dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);
    vecCompute<<<compute_grid_dim, block_side_cp>>>(
        cuda_final_buffer,
        cuda_result_buffer, 1./((double)fft_size2 * (size2 - 1)),
        ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / ((double)size2 * (size2 - 1)),
        ps->cuda_denom_cache + icp*alloc_size, stdbuf,
        alloc_size, fft_size
    );

    // Looking for maximum; the results overwrite the sum/std values in cuda_points
    float *xbuf = sumbuf;
    float *ybuf = stdbuf;

    int32_t *posbuf = (int*)ps->cuda_temp_buffer;
    float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

    dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
    find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);
    find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1,  ps->corr_size - 1);

    return 0;
}

/*
 * Downloads the results computed on the device, updates the coordinates of
 * all control points in ps->coords and returns a copy of that matrix.
 *
 * The current coordinates are read from ps->coords if results were stored
 * before (ps->stored), otherwise from rows 2/3 of ps->points. Banned control
 * points keep their current coordinates. For the others the downloaded value
 * is subtracted and the difference between the stored base fraction (rows 4/5
 * of ps->points) and the fraction of the current coordinate is added.
 *
 * Marks the state as stored. Returns a shared data copy of ps->coords if
 * USE_UNDOCUMENTED is defined, and a duplicate otherwise.
 */
static inline mxArray *fftGetPoints(TProcessingState *ps) {
    int ncp = ps->ncp;
    int ncp_alloc = ps->ncp_alloc_size;
    int precision = ps->precision;

    uint8_t *banlist = ps->banlist;

    // Output matrix: first column x, second column y
    float *res_x = (float*)mxGetData(ps->coords);
    float *res_y = res_x + ncp;

    // Source of the current coordinates (may alias the output matrix)
    float *data_x = res_x;
    float *data_y = res_y;
    if (!ps->stored) {
        data_x = ps->points + 2 * ncp_alloc;
        data_y = data_x + ncp_alloc;
    }

    // Rows 4/5: fractions stored together with the base points
    float *frac_x = ps->points + 4 * ncp_alloc;
    float *frac_y = frac_x + ncp_alloc;

    // Rows 6/7: destination for the values downloaded from the device
    float *move_x = ps->points + 6 * ncp_alloc;
    float *move_y = move_x + ncp_alloc;

    // Two rows of ncp floats, both on host and device ncp_alloc floats apart
    cudaMemcpy2D(
        move_x, ncp_alloc * sizeof(float),
        ps->cuda_points, ncp_alloc * sizeof(float),
        ps->ncp * sizeof(float), 2,
        cudaMemcpyDeviceToHost
    );

    for (int i = 0; i < ncp; i++) {
        if (!banlist[i]) {
            float frac;

            frac = data_x[i] - round(data_x[i]*precision)/precision;
            res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

            frac = data_y[i] - round(data_y[i]*precision)/precision;
            res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);
        } else {
            // Banned points are passed through unchanged
            res_x[i] = data_x[i];
            res_y[i] = data_y[i];
        }
    }

    ps->stored = 1;

#ifdef USE_UNDOCUMENTED
    return mxCreateSharedDataCopy(ps->coords);
//    mxArray *mxCreateSharedDataCopy(const mxArray *pr);
//    bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy);    // true if not successful
//    mxArray *mxUnreference(const mxArray *pr);
#else /* USE_UNDOCUMENTED */
    return mxDuplicateArray(ps->coords);
#endif /* USE_UNDOCUMENTED */
}


#ifdef VALIDATE_LSUM
/*
 * Validation helper (VALIDATE_LSUM builds only): uploads a ready-made base
 * fragment for control point `icp` and computes its local sum / denominator
 * caches.
 *
 *   ps   - processing state
 *   icp  - index of the control point
 *   data - square uint8 Matlab matrix with the fragment (side taken from mxGetM)
 *
 * Returns 0 on success and ERROR_CUFFT if the engine is not initialized yet.
 */
static inline int fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
    uint8_t *dataPtr;

    if (!ps->fft_initialized) {
        reportError("cuFFT engine is not initialized yet");
        // Report a real error code; NULL would be converted to 0, i.e. success
        return ERROR_CUFFT;
    }

    int size = ps->fft_size;
    int alloc_size = ps->fft_alloc_size;

    int N = mxGetM(data);
    int N2 = N * N;

    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

    dim3 input_block_dim(N, 1, 1);
    dim3 input_grid_dim(N, 1, 1);

    uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;
    cufftReal *cudaRealPtr = ps->cuda_base_buffer;

    dataPtr = (uint8_t*)mxGetData(data);
    cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

    float *lsum_temp = ps->cuda_lsum_temp;
    int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

    // Clear the leading rows of the third and fourth scratch planes
    cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
    cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

    vecPackBase<<<input_grid_dim, input_block_dim>>>(
        cudaInputPtr, N,
        cudaRealPtr, size,
        lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
    );

    // In general we should expect non-zero denominals, therefore the Nonzero array is not computed
    local_sum(ps,
        ps->cuda_lsum_cache + icp * alloc_size, ps->cuda_denom_cache + icp * alloc_size,
        lsum_temp + (2 * step), lsum_temp + (3 * step),
        lsum_temp, lsum_temp + step);

/*
    We don't really want to compute here
    cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_cache + icp * alloc_size);
*/

    return 0;
}
#endif /* VALIDATE_LSUM */

#ifdef VALIDATE_PEAK
/*
 * Validation helper (VALIDATE_PEAK builds only): runs fftLoadFragment() for
 * the single control point `icp` and returns the first fft_size x fft_size
 * floats of the final device buffer as a new single-precision Matlab matrix.
 */
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {
    const int side = ps->fft_size;
    const int elements = side * side;
    const int alloc_size = ps->fft_alloc_size;

    fftLoadFragment(ps, icp, 1, image);

    mxArray *result = mxCreateNumericMatrix(side, side, mxSINGLE_CLASS, mxREAL);
    float *host_data = (float*)mxGetPr(result);

    // The final values are placed CP_BLOCK * alloc_size floats after the start
    // of the temporary buffer
    float *device_data = ((cufftReal*)ps->cuda_temp_buffer) + CP_BLOCK * alloc_size;
    cudaMemcpy(host_data, device_data, elements*sizeof(cufftReal), cudaMemcpyDeviceToHost);

    return result;
}

/*
 * Validation helper (VALIDATE_PEAK builds only): downloads the two rows of
 * per-control-point results from ps->cuda_points into rows 6/7 of ps->points
 * and returns them as a new ncp x 2 single-precision Matlab matrix.
 */
static inline mxArray *fftGetCorrections(TProcessingState *ps) {
    const int count = ps->ncp;
    const int row_stride = ps->ncp_alloc_size;

    // Host destination: rows 6 and 7 of the points array
    float *host_rows = ps->points + 6 * row_stride;

    cudaMemcpy2D(
        host_rows, row_stride * sizeof(float),
        ps->cuda_points, row_stride * sizeof(float),
        ps->ncp * sizeof(float), 2,
        cudaMemcpyDeviceToHost
    );

    mxArray *result = mxCreateNumericMatrix(count, 2, mxSINGLE_CLASS, mxREAL);
    float *columns = (float*)mxGetData(result);

    // Column 0 receives the first row, column 1 the second one
    for (int row = 0; row < 2; row++) {
        memcpy(columns + row * count, host_rows + row * row_stride, count * sizeof(float));
    }

    return result;
}
#endif /* VALIDATE_PEAK */


/*
 * Exit handler (registered with mexAtExit): releases the global processing
 * state if one still exists.
 */
static void selfClean() {
    if (!pstate) return;

    reportMessage("Self-cleaning normxcorr_hw instance");

    pstateFree(pstate);
    pstate = NULL;
}


/*
 * Matlab MEX entry point of the normxcorr_hw module.
 *
 * The request is selected by the number of right-hand side arguments:
 *   0 arguments  - initialization. Probes the CUDA devices and returns an
 *                  int32 id: 1 if device 0 has compute capability 1.3 or
 *                  above (a processing state is created), 0 if the
 *                  capabilities are below 1.3, -1 if there is no CUDA device.
 *   1 argument   - clean request, the processing state is released.
 *   2+ arguments - prhs[1] holds the action code (TAction), the remaining
 *                  arguments are action specific. prhs[0] is not inspected
 *                  at the moment (its validation is commented out).
 *
 * Errors are reported with reportError() and the call returns without
 * setting outputs.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    int err;
    int64_t *errPtr;

    int deviceCount;
    cudaDeviceProp deviceProp;

    mxArray *idMatrix;
    int32_t id, *idPtr;

    TProcessingState *ps;

    TAction action;

    int iprop;

    const mxArray *input;
    const mxArray *base;

#ifdef VALIDATE_LSUM
    const mxArray *lsum;
    const mxArray *denom;
    const mxArray *nonzero;
#endif /* VALIDATE_LSUM */

    const mxArray *x, *y;

    unsigned int icp;

    int width, height;
    int size, size2;
    int base_size, base_size2;

    if (!nrhs) {
        reportMessage("Initializing normxcorr_hw instance");

        if (nlhs != 1) {
            reportError("You should accept a single result from initialization call");
            return;
        }

        if (pstate) {
            reportError("Only a single calculation process is supported at the moment");
            return;
        }

        // Initialising, for now a single client is supported only

        idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);
        if (!idMatrix) {
            reportError("Initialization is failed");
            return;
        }

        // Detecting cuda devices

        cudaGetDeviceCount(&deviceCount);
        if (deviceCount) {
            cudaGetDeviceProperties(&deviceProp, 0);
            if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {
                id = 1;
            } else { // Hardware capabilities are below 1.3
                id = 0;
            }
        } else { // No cuda device, using software
            id = -1;
        }

        // The processing state is only created if a usable device was found
        if (id > 0) {
            pstate = pstateInit();
            if (!pstate) {
                mxDestroyArray(idMatrix);
                reportError("State structure initialization is failed");
                return;
            }
        } else {
            pstate = NULL;
        }

        idPtr = (int32_t*)mxGetData(idMatrix);
        idPtr[0] = id;

        plhs[0] = idMatrix;

        mexAtExit(selfClean);

        return;
    } else {
        if (!pstate) {
            reportError("Normxcorr_hw should be initialized first");
            return;
        }

/*
        idMatrix = (mxArray*)prhs[0];
        if ((mxGetClassID(idMatrix) != mxINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {
            reportError("Invalid parameter is supplied in place of process identificator");
            return;
        }

        idPtr = (int32_t*)mxGetData(idMatrix);
        if (!idPtr) {
            reportError("Mex is not able to obtain process identificator");
            return;
        }

        id = *idPtr;
        if (id != 1) {
            reportError("Invalid process identificator is supplied");
            return;
        }

        if (!pstate) {
            reportError("The interface is not initialized");
            return;
        }
*/
    }

    // Clean request
    if (nrhs == 1) {
        reportMessage("Cleaning normxcorr_hw instance");

        pstateFree(pstate);
        pstate = NULL;

        return;
    }

    ps = pstate;

    action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));

//    reportMessage("Executing normxcorr_hw action: %u", action);

    switch (action) {
#ifdef VALIDATE_PEAK
     case ACTION_COMPUTE_FRAGMENT:
        // prhs[2] is the 1-based control point index, prhs[3] the image
        icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
        plhs[0] = fftCompute(ps, icp, prhs[3]);
     break;
     case ACTION_GET_CORRECTIONS:
        plhs[0] = fftGetCorrections(ps);
     break;
#endif /* VALIDATE_PEAK */
#ifdef VALIDATE_LSUM
     case ACTION_COMPUTE_BASE_FRAGMENT:
        if ((nrhs != 4)&&(nrhs != 7)) {
            reportError("ComputeBaseFragment action expects 2 arguments, but %i is passed", nrhs - 2);
            return;
        }

        // 1-based control point index; 0 wraps around and fails the range check
        icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
        if (icp >= ps->ncp) {
            reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
            return;
        }

        base = prhs[3];

        if (mxGetNumberOfDimensions(base) != 2) {
            reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
            return;
        }

        if (mxGetClassID(base) != mxUINT8_CLASS) {
            reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
            return;
        }

        if (nrhs == 7) {
            iprop = ps->fft_size;

            // Optional reference data used to validate the computed local sums
            lsum = prhs[4];
            denom = prhs[5];
            nonzero = prhs[6];
            if (
                (mxGetNumberOfDimensions(lsum) != 2)||
                (mxGetNumberOfDimensions(denom) != 2)||
                (mxGetClassID(lsum) != mxSINGLE_CLASS)||
                (mxGetClassID(denom) != mxSINGLE_CLASS)||
                (mxGetClassID(nonzero) != mxUINT16_CLASS)||
                (mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
                (mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)

            ) {
                reportError("Invalid properties for base initialization are specified");
                return;
            }
        } else {
            lsum = NULL;
            denom = NULL;
            nonzero = NULL;
        }

        fftUploadBaseData(ps, icp, base);
        local_sum_validate(ps, icp, lsum, denom);
     break;
#endif /* VALIDATE_LSUM */
     case ACTION_COMPUTE:
        if (nrhs != 3) {
            reportError("Compute action expects 1 argument, but %i is passed", nrhs - 2);
            return;
        }

        input = prhs[2];

        if (mxGetClassID(input) != mxUINT8_CLASS) {
            reportError("Invalid type of image data, should be 8bit integers");
            return;
        }

        // Control points are processed in blocks of at most CP_BLOCK
        for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
            err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);
            if (err) break;
        }

     break;
     case ACTION_COMPUTE_BASE:
        if (nrhs != 3) {
            reportError("ComputeBase action expects 1 argument, but %i is passed", nrhs - 2);
            return;
        }

        // The only argument is the base image itself. It must not be parsed
        // as a control point index: all control points are processed below.
        base = prhs[2];

        if (mxGetNumberOfDimensions(base) != 2) {
            reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
            return;
        }

        if (mxGetClassID(base) != mxUINT8_CLASS) {
            reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
            return;
        }

        width = mxGetN(base);
        height = mxGetM(base);

        size = 2 * ps->corr_size + 1;
        size2 = size * size;

        base_size = 4 * ps->corr_size + 1;
        base_size2 = base_size * base_size;

        // Image mode (1) is selected if the image is not larger than all
        // data fragments together, fragment mode (0) otherwise
        if (width * height > ps->ncp * size2) {
            ps->mode = 0;
        } else {
            ps->mode = 1;
        }

        // if not enough space for caching enable anyway ?
        if (width * height > ps->ncp * base_size2) {
            ps->base_mode = 0;
        } else {
            ps->base_mode = 1;
            if (!ps->mode) {
                // Starting values for the bounding box tracked by fftLoadBaseFragment()
                ps->minx = 0;
                ps->maxx = width - 1;
                ps->miny = 0;
                ps->maxy = height - 1;
            }
        }

        for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
            err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), base);
            if (err) break;
        }

        if ((ps->base_mode)&&(!ps->mode)) {
            // Correcting difference of area size between base and data images
            ps->minx += ps->corr_size;
            ps->miny += ps->corr_size;
            ps->maxx -= ps->corr_size;
            ps->maxy -= ps->corr_size;

            width = ceil(ps->maxx) - floor(ps->minx);
            height = ceil(ps->maxy) - floor(ps->miny);

            if (width * height < ps->ncp * size2) {
                ps->mode = 1;
            }
        }

        if (ps->mode) {
            reportMessage("Running in the image mode");
        } else {
            reportMessage("Running in the fragment mode");
        }
     break;
     case ACTION_SET_BASE_POINTS:
        if (nrhs != 4) {
            reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
            return;
        }

        x = prhs[2];
        y = prhs[3];

        if (    (mxGetClassID(x) != mxSINGLE_CLASS)||
                (mxGetClassID(y) != mxSINGLE_CLASS)||
                (mxGetN(x)*mxGetM(x) != ps->ncp)||
                (mxGetN(y)*mxGetM(y) != ps->ncp)
        ) {
            reportError("Invalid control points are specified");
            return;
        }

        // Base coordinates occupy rows 0 (x) and 1 (y) of the points array
        memcpy(ps->points,                      mxGetData(x), ps->ncp * sizeof(float));
        memcpy(ps->points + ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));
     break;
     case ACTION_SET_POINTS:
        if (nrhs != 4) {
            reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
            return;
        }

        x = prhs[2];
        y = prhs[3];

        if (    (mxGetClassID(x) != mxSINGLE_CLASS)||
                (mxGetClassID(y) != mxSINGLE_CLASS)||
                (mxGetN(x)*mxGetM(x) != ps->ncp)||
                (mxGetN(y)*mxGetM(y) != ps->ncp)
        ) {
            reportError("Invalid control points are specified");
            return;
        }

        // Data coordinates occupy rows 2 (x) and 3 (y) of the points array
        memcpy(ps->points + 2 * ps->ncp_alloc_size, mxGetData(x), ps->ncp * sizeof(float));
        memcpy(ps->points + 3 * ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));

        // Newly supplied points take precedence over previously stored results
        ps->stored = 0;
     break;
     case ACTION_GET_POINTS:
        if (nrhs != 2) {
            reportError("GetPoints action do not expect any arguments");
            return;
        }
        if (nlhs != 1) {
            reportError("GetPoints action returns a single matrix");
            return;
        }

        // Without a successful SETUP there is no result matrix to read from
        if (!ps->coords) {
            reportError("GetPoints action requires a successful SETUP action first");
            return;
        }

        plhs[0] = fftGetPoints(ps);
     break;
     case ACTION_SETUP:
        if (nrhs != 5) {
            reportError("SETUP action expects 'ncp', 'corrsize', and 'precision' parameters");
            return;
        }

        // Drop everything allocated for a previous configuration
        fftFree(ps);

        ps->ncp = (int)mxGetScalar(prhs[2]);

        ps->precision = (int)mxGetScalar(prhs[4]);

        iprop = (int)mxGetScalar(prhs[3]);
        ps->corr_size = iprop;
        ps->fft_size = 6 * iprop + 1;
        ps->subimage_size = ps->corr_size * 4 + 1;

        // Allocation sizes are rounded up to multiples of the block sizes
        ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
        ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
        ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

        ps->lsum_size = ps->corr_size * 2 + 1;
        ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;

        ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
        ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
        ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);

        err = fftInit(ps);

        // The error code of the initialization is returned if requested
        if (nlhs == 1) {
            idMatrix = mxCreateNumericMatrix(1, 1, mxINT64_CLASS, mxREAL);
            if (idMatrix) {
                errPtr = (int64_t*)mxGetData(idMatrix);
                errPtr[0] = err;
                plhs[0] = idMatrix;
            } else {
                reportError("Initialization of result matrix is failed");
                return;
            }
        }
     break;
     case ACTION_PREPARE:
        fftPrepare(ps);
     break;
     default:
        reportError("Unknown request %i", action);
    }
}