/*
 * /normxcorr/trunk : contents of dict_hw/src/normxcorr
 * (revision 25)
 *
 * To get this branch, use:
 *
 *     bzr branch http://suren.me/webbzr/normxcorr/trunk
 */
#include "normxcorr_hw.h"
#include "normxcorr_hw_msg.h"
#include "normxcorr_hw_kernel.cu.h"


/*
 * Releases every buffer and library plan referenced by the processing state
 * and then zeroes the state structure.
 *
 * Each pointer is only released when it is non-null, and the CUDPP / cuFFT
 * plans are only destroyed when the matching *_initialized flag is set, so
 * the function can be called on a partially initialized state.
 */
static void fftFree(TProcessingState *ps) {
    /* Plain host allocation (malloc). */
    if (ps->banlist) {
        free(ps->banlist);
    }

    /* Local-sum scratch space and the per-point caches on the device. */
    if (ps->cuda_lsum_temp) {
        cudaFree(ps->cuda_lsum_temp);
    }
    if (ps->cuda_lsum_cache) {
        cudaFree(ps->cuda_lsum_cache);
    }
    if (ps->cuda_denom_cache) {
        cudaFree(ps->cuda_denom_cache);
    }
    if (ps->cuda_fft_cache) {
        cudaFree(ps->cuda_fft_cache);
    }

    /* Device working buffers. */
    if (ps->cuda_data_buffer) {
        cudaFree(ps->cuda_data_buffer);
    }
    if (ps->cuda_base_buffer) {
        cudaFree(ps->cuda_base_buffer);
    }
    if (ps->cuda_temp_buffer) {
        cudaFree(ps->cuda_temp_buffer);
    }
    if (ps->cuda_input_buffer) {
        cudaFree(ps->cuda_input_buffer);
    }

    /* Host staging buffer obtained from cudaHostAlloc. */
    if (ps->input_buffer) {
        cudaFreeHost(ps->input_buffer);
    }

    /* Control point storage: device copy and cudaHostAlloc'ed host copy. */
    if (ps->cuda_points) {
        cudaFree(ps->cuda_points);
    }
    if (ps->points) {
        cudaFreeHost(ps->points);
    }

    /* Library plans. */
    if (ps->cudpp_initialized) {
        cudppDestroyPlan(ps->cudpp_plan);
    }
    if (ps->fft_initialized) {
        cufftDestroy(ps->cufft_r2c_plan);
        cufftDestroy(ps->cufft_c2r_plan);
    }

    if (ps->image_buf) {
        dictImageFree(ps);
    }

    /*
     * Reset the state. With timing measurements enabled the last
     * sizeof(ps->time) bytes of the structure are left untouched
     * (presumably `time` is the final member, so that collected timings
     * survive a re-initialization -- verify against the struct definition).
     */
    size_t reset_bytes = sizeof(TProcessingState);
#ifdef DICT_HW_MEASURE_TIMINGS
    reset_bytes -= sizeof(ps->time);
#endif /* DICT_HW_MEASURE_TIMINGS */
    memset(ps, 0, reset_bytes);
}

/*
 * Creates the cuFFT and CUDPP plans and allocates all host and device
 * buffers of the processing state, using the sizes already stored in `ps`
 * (fft_real_size, fft_alloc_size, side_alloc_size, lsum_alloc_size, ncp,
 * ncp_alloc_size).
 *
 * Returns 0 on success, or a DICT_ERROR_* code on failure. Once both cuFFT
 * plans exist, any later failure releases everything through fftFree(),
 * which also zeroes *ps.
 */
static int fftInit(TProcessingState *ps) {
    CUDPPConfiguration cudpp_config;
    
    CUDPPResult cudpp_err;
    cufftResult cufft_err;
    cudaError cuda_err;

    int size;
    int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
    int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;
    

    cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);
    if (cufft_err) {
	reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);
	return DICT_ERROR_CUFFT;
    }	
    
    cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_C2R);
    if (cufft_err) {
	reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);
	    // fft_initialized is not set yet, so fftFree() would not destroy
	    // the first plan; release it explicitly.
	cufftDestroy(ps->cufft_r2c_plan);
	return DICT_ERROR_CUFFT;
    }

    ps->fft_initialized = true;

	// Forward inclusive float prefix-sum (scan) plan
    cudpp_config.algorithm = CUDPP_SCAN;
    cudpp_config.options = CUDPP_OPTION_FORWARD |  CUDPP_OPTION_INCLUSIVE;
    cudpp_config.op = CUDPP_ADD;
    cudpp_config.datatype = CUDPP_FLOAT;

    cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
    if (cudpp_err != CUDPP_SUCCESS) {
	reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
	fftFree(ps);
	return DICT_ERROR_CUDPP;
    }
    
    ps->cudpp_initialized = true;

    cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }


	// The temporary buffer is shared by several processing stages and is
	// sized for the largest one. The FFT stage (fftProcessFragment) places
	// its complex buffer one fft_alloc_size element after the start of the
	// buffer and stores up to CP_BLOCK fragments with an fft_alloc_size
	// stride, hence (1 + CP_BLOCK) fragments are required.
    size = max3(
	(1 + CP_BLOCK) * ps->fft_alloc_size * sizeof(cufftComplex),		/* FFT multiplication */
	2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t),			/* Sum, Std computations */
	CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float))	/* Max of correlation */
    );

    cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);
    if (cuda_err) {
	reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }
    
	// All control points start as banned (1); fftLoadBaseFragment clears
	// the flag of every point whose base fragment was loaded.
    ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));
    if (!ps->banlist) {
	reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);
	fftFree(ps);
	return DICT_ERROR_MALLOC;
    }
    memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));
    
	// 8 float arrays of ncp_alloc_size elements each (page-locked host memory)
    cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);
    if (cuda_err) {
	reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of 2*%u*float bytes for cuda_points is failed", ps->ncp_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
    if (cuda_err) {
	reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }
	
	// DS: We don't actually need that to be CP_BLOCK, just unblock computations in loadbase and set to single
    cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_base_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

    cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return DICT_ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2  * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);
	fftFree(ps);
	    // Device allocation failure: report it like every other cudaMalloc failure
	return DICT_ERROR_CUDA_MALLOC;
    }
	// We need to zero temporary buffers as well, since we are not computing
	// cumsum of complete matrix, but non-zero part of it
    cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));
        
    return 0;
}


/*
 * Destroys a processing state created by pstateInit(): releases all
 * resources it references and then the structure itself.
 * A null pointer is accepted and ignored.
 */
void pstateFree(TProcessingState *ps) {
    if (!ps) {
        return;
    }

    fftFree(ps);
    free(ps);
}

/*
 * Allocates a new, zero-filled processing state on the host heap.
 * Returns the new state, or a null pointer when the allocation fails.
 */
TProcessingState *pstateInit() {
    TProcessingState *state = (TProcessingState*)malloc(sizeof(TProcessingState));

    if (!state) {
        /* Allocation failed: hand the null pointer back to the caller. */
        return state;
    }

    memset(state, 0, sizeof(TProcessingState));
    return state;
}

/*
 * Loads the base-image fragments of control points [icp, icp + ncp) and
 * fills the per-point caches (FFT of the packed fragment, local sums and
 * denominators).
 *
 *   ps      - processing state; buffers must have been set up by fftInit()
 *   icp     - index of the first control point of this batch
 *   ncp     - number of control points in the batch
 *   fullimg - complete base image on the host (8 bit per pixel)
 *
 * For every point a (4 * corr_size + 1)-pixel square around the rounded
 * coordinates is staged in the host input buffer and uploaded on one of two
 * streams. The kernels for point i-1 are issued in the loop iteration that
 * uploads point i (hence the loop runs ncp + 1 times). Points whose fragment
 * does not fit into the image are skipped and keep their banlist entry
 * unchanged; loaded points get banlist[i] = 0.
 *
 * Always returns 0.
 */
static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
    int width = ps->width;
    int height = ps->height;

    int check_mode = ((ps->base_mode)&&(!ps->mode));
    float minx = 0, miny = 0, maxx = 0, maxy = 0;

    int precision = ps->precision;

    int half_size = 2 * ps->corr_size;
    int size = 2 * half_size + 1;

    int fft_real_size = ps->fft_real_size;
    
    int ncp_alloc = ps->ncp_alloc_size;
    int alloc_size = ps->fft_alloc_size;
    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

    uint8_t *banlist = ps->banlist + icp;
    
	// Layout of ps->points: arrays of ncp_alloc floats; 0/1 hold the base
	// coordinates and 4/5 the fractional parts computed below
    float *data_x = ps->points + icp;
    float *data_y = data_x + ncp_alloc;

    float *frac_x = ps->points + 4 * ncp_alloc + icp;
    float *frac_y = frac_x + ncp_alloc;

    uint8_t *img = ps->input_buffer;

    float *lsum_temp = (float*)ps->cuda_lsum_temp;
    int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

    if (check_mode) {
	minx = ps->minx;
	maxx = ps->maxx;
	miny = ps->miny;
	maxy = ps->maxy;
    }
    
    uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
    cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
    cufftComplex *cache = ps->cuda_fft_cache +  icp * alloc_size;
    float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
    float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;

    int blocks = calc_blocks(size, BLOCK_SIZE_1D);
    int base_blocks = blocks * blocks * BLOCK_SIZE_1D;
    
    int lsum_size = ps->lsum_size;
    int lsum_alloc = ps->lsum_alloc_size;

    cudaStream_t stream[2];
    for (int i = 0; i < 2; ++i) {
	cudaStreamCreate(&stream[i]);
    }

	// Set when the fragment of the previous point has been uploaded and
	// still has to be processed on the device
    int prev_loaded = 0;

    for (int i = 0;i <= ncp;i++) {
      int loaded = 0;

      if (i < ncp) {
	    // Coordinates are 1-based on input
	float x = data_x[i] - 1;
	float y = data_y[i] - 1;

	frac_x[i] = x - round(x * precision) / precision;
	frac_y[i] = y - round(y * precision) / precision;
    
	int xstart = roundf(x) - half_size;
	int ystart = roundf(y) - half_size;
    
	int xend = xstart + size;
	int yend = ystart + size;

	    // Out-of-image points are not loaded, but we must still fall
	    // through to the processing of the previous point below
	if ((xstart >= 0)&&(ystart >= 0)&&(xend < width)&&(yend < height)) {
	    if (check_mode) {
		if (xstart < minx) minx = xstart;
		if (ystart < miny) miny = ystart;
		if (xend > maxx) maxx = xend;
		if (yend > maxy) maxy = yend;
	    }

		// Cut the fragment out of the full image into the staging
		// buffer; in matlab mode x selects the line (stride height),
		// otherwise y does (stride width)
	    if (ps->matlab_mode) {
		cudaMemcpy2D(
		    img + i * alloc_size,
		    size * sizeof(uint8_t),
		    fullimg + (xstart * height + ystart),
		    height * sizeof(uint8_t),
		    size * sizeof(uint8_t),
		    size,
		    cudaMemcpyHostToHost
		);
	    } else {
		cudaMemcpy2D(
		    img + i * alloc_size,
		    size * sizeof(uint8_t),
		    fullimg + (ystart * width + xstart),
		    width * sizeof(uint8_t),
		    size * sizeof(uint8_t),
		    size,
		    cudaMemcpyHostToHost
		);
	    }
	
	    cudaMemcpy2DAsync(
		cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
		img + i * alloc_size, size * sizeof(uint8_t),
		size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,
		stream[i%2]
	    );

	    banlist[i] = 0;
	    loaded = 1;
	}
      }

      if ((i > 0)&&(prev_loaded)) {
        int j = i - 1;
	
	    // Issued on the stream that uploaded fragment j, so the kernels
	    // are ordered after that copy.
	    // NOTE(review): consecutive points run on different streams but
	    // share lsum_temp; confirm that the kernels can not overlap.
	if (ps->base_blocks_power < 0) {
	    vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
		cuda_input_buffer + j * side_alloc2, side_alloc, 
	        cuda_base_buffer + j*alloc_size, fft_real_size, 
		lsum_temp + lsum_size * (lsum_alloc + 1), 
	        lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1), 
		lsum_alloc,
	        size, blocks
	    );
	} else {
		// Third launch parameter is the dynamic shared memory size,
		// the stream is the fourth one
	    vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
		cuda_input_buffer + j * side_alloc2, side_alloc, 
	        cuda_base_buffer + j*alloc_size, fft_real_size, 
		lsum_temp + lsum_size * (lsum_alloc + 1), 
	        lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1), 
		lsum_alloc,
	        size, ps->base_blocks_power
	    );
	}

	// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
	local_sum(ps, 
	    lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
	    lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
	    lsum_temp, lsum_temp + lsum_step,
	    stream[j%2]);
      }

      prev_loaded = loaded;
    }

	// The FFTs run on whatever stream is attached to the plan, so make
	// sure the packed fragments are complete before starting them
    for (int i = 0; i < 2; ++i) {
	cudaStreamSynchronize(stream[i]);
    }

    for (int j = 0;j < ncp;j++) {
	cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
    }

    for (int i = 0; i < 2; ++i) {
	cudaStreamDestroy(stream[i]);
    }

    if (check_mode) {
	ps->minx = minx;
	ps->maxx = maxx;
	ps->miny = miny;
	ps->maxy = maxy;
    }
    

    return 0;
}


/*
 * Copies the current-image fragments of control points [icp, icp + ncp)
 * from the full host image into the host staging buffer (ps->input_buffer).
 *
 *   ps      - processing state
 *   icp     - index of the first control point of this batch
 *   ncp     - number of control points in the batch
 *   fullimg - complete image on the host (8 bit per pixel)
 *
 * The fragment of point i is a (2 * corr_size + 1)-pixel square stored
 * densely at offset i * size * size. Coordinates are taken from
 * ps->res_x / ps->res_y when ps->stored is set, and from arrays 2/3 of
 * ps->points otherwise. Points that are already banned, or whose fragment
 * does not fit into the image, are marked in the banlist and skipped.
 *
 * Always returns 0.
 */
static inline int fftCopyFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {
    int width = ps->width;
    int height = ps->height;

    int half_size = ps->corr_size;
    int size = 2 * half_size + 1;
    int size2 = size * size;
    int ncp_alloc = ps->ncp_alloc_size;

    float *data_x, *data_y;
    if (ps->stored) {
	data_x = ps->res_x + icp;
	data_y = ps->res_y + icp;
    } else {
	data_x = ps->points + 2 * ncp_alloc + icp;
	data_y = data_x + ncp_alloc;
    }

    uint8_t *img = ps->input_buffer;
    uint8_t *banlist = ps->banlist + icp;

    for (int i = 0;i < ncp;i++) {
	    // Coordinates are 1-based on input
	float x = data_x[i] - 1;
	float y = data_y[i] - 1;
    
	int xstart = roundf(x) - half_size;
	int ystart = roundf(y) - half_size;
    
	int xend = xstart + size;
	int yend = ystart + size;

	if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
	    banlist[i] = 1;
	    continue;
	}

	    // In matlab mode x selects the line (stride height), otherwise
	    // y does (stride width)
	if (ps->matlab_mode) {
	    cudaMemcpy2D(
		img + i * size2,//alloc_size,
		size * sizeof(uint8_t),
	    	fullimg + (xstart * height + ystart),
	    	height * sizeof(uint8_t),
		size * sizeof(uint8_t),
		size,
		cudaMemcpyHostToHost
	    );
	} else {
	    cudaMemcpy2D(
		img + i * size2,//alloc_size,
		size * sizeof(uint8_t),
		fullimg + (ystart * width + xstart),
		width * sizeof(uint8_t),
		size * sizeof(uint8_t),
		size,
		cudaMemcpyHostToHost
	    );
	}
    }
    return 0;
}

/*
 * Uploads the fragments staged in ps->input_buffer to ps->cuda_input_buffer
 * with a single asynchronous 3D copy issued on `stream0`.
 *
 * On the host the `ncp` fragments are dense size x size byte squares
 * (size = 2 * corr_size + 1); on the device every fragment occupies a
 * side_alloc x side_alloc slot. The 3D copy is the batched form of one
 * pitched 2D host-to-device copy per fragment.
 *
 * `icp` and `image` are not used by this function. Always returns 0.
 */
static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {
    int side = 2 * ps->corr_size + 1;
    int dev_side = ps->side_alloc_size;

    uint8_t *host_src = ps->input_buffer;
    uint8_t *dev_dst = ps->cuda_input_buffer;

    cudaPitchedPtr dst_desc = make_cudaPitchedPtr(
        dev_dst, dev_side * sizeof(uint8_t), dev_side, dev_side
    );
    cudaPitchedPtr src_desc = make_cudaPitchedPtr(
        host_src, side * sizeof(uint8_t), side, side
    );

    cudaMemcpy3DParms transfer = { 0 };
    transfer.srcPtr = src_desc;
    transfer.dstPtr = dst_desc;
    /* width in bytes, height in rows, depth = number of fragments */
    transfer.extent = make_cudaExtent(side * sizeof(uint8_t), side, ncp);
    transfer.kind = cudaMemcpyHostToDevice;

    cudaMemcpy3DAsync(&transfer, stream0);

    return 0;
}

/* Thread-block shapes for kernel launches in this file. */

/* BLOCK_SIZE_2D x BLOCK_SIZE_2D threads (not referenced by the functions visible in this part of the file). */
static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
/* SIDE_BLOCK_SIZE x CP_BLOCK_SIZE threads; block shape of the stat1, vecPack*, vecMul, vecCompute and find_max1 launches. */
static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

/*
 * Queues the device-side preprocessing of the uploaded fragments on
 * `stream0`:
 *   1. stat1 / stat2 write two per-point values into the two arrays of
 *      ps->cuda_points (starting at index icp), using ps->cuda_temp_buffer
 *      as scratch space;
 *   2. vecPack (or vecPackFast when ps->side_blocks_power >= 0) packs the
 *      byte fragments from cuda_input_buffer into cuda_data_buffer.
 *
 * Nothing is synchronized here. Always returns 0.
 */
static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
    int side = 2 * ps->corr_size + 1;
    int fft_side = ps->fft_real_size;

    int points_pitch = ps->ncp_alloc_size;
    int fft_pitch = ps->fft_alloc_size;
    int input_side = ps->side_alloc_size;
    int input_pitch = input_side * input_side;

    uint8_t *dev_input = ps->cuda_input_buffer;
    float *dev_data = ps->cuda_data_buffer;

    /* Grid geometry */
    int grid_cp = calc_blocks(ncp, CP_BLOCK_SIZE);
    int grid_cp_1d = calc_blocks(ncp, BLOCK_SIZE_1D);
    int grid_side = calc_blocks(side, SIDE_BLOCK_SIZE);
    int grid_pack = grid_side * grid_side * SIDE_BLOCK_SIZE;

    /* Outputs of the statistics pass */
    float *sum_out = ps->cuda_points + icp;
    float *std_out = ps->cuda_points + points_pitch + icp;

    /* Scratch: two consecutive int arrays of input_side * CP_BLOCK elements */
    int32_t *stat_first = (int*)ps->cuda_temp_buffer;
    int32_t *stat_second = stat_first + input_side * CP_BLOCK;

    dim3 stat_grid(grid_side, grid_cp, 1);
    stat1<<<stat_grid, block_side_cp, 0, stream0>>>(stat_first, stat_second, dev_input, input_pitch, input_side, side);
    stat2<<<grid_cp_1d, BLOCK_SIZE_1D, 0, stream0>>>(sum_out, std_out, stat_first, stat_second, side);

    /* Pack the input data for the FFT */
    dim3 pack_grid(grid_pack, grid_cp, 1);

    if (ps->side_blocks_power >= 0) {
        vecPackFast<<<pack_grid, block_side_cp, 0, stream0>>>(
            dev_input, input_pitch, input_side,
            dev_data, fft_pitch, fft_side,
            side, ps->side_blocks_power
        );
    } else {
        vecPack<<<pack_grid, block_side_cp, 0, stream0>>>(
            dev_input, input_pitch, input_side,
            dev_data, fft_pitch, fft_side,
            side, grid_side
        );
    }

    return 0;
}

/*
 * Queues the device-side postprocessing of the inverse FFT results on
 * `stream0`:
 *   1. vecCompute combines the results at the start of cuda_temp_buffer
 *      with the cached local sums / denominators of the batch and the
 *      per-point values in ps->cuda_points, writing into the region of
 *      cuda_temp_buffer that starts CP_BLOCK * fft_alloc_size floats in;
 *   2. find_max1 / find_max2 search that region and store their two
 *      per-point outputs in ps->cuda_points, overwriting the values read
 *      by vecCompute.
 *
 * Nothing is synchronized here. Always returns 0.
 */
static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
    int side = 2 * ps->corr_size + 1;
    int side2 = side * side;

    int fft_size = ps->fft_size;
    int fft_real_size = ps->fft_real_size;

    int points_pitch = ps->ncp_alloc_size;
    int fft_pitch = ps->fft_alloc_size;
    int input_side = ps->side_alloc_size;

    /* Grid geometry */
    int grid_cp = calc_blocks(ncp, CP_BLOCK_SIZE);
    int grid_cp_1d = calc_blocks(ncp, BLOCK_SIZE_1D);
    int grid_fft = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
    int grid_fft2 = grid_fft * grid_fft * SIDE_BLOCK_SIZE;

    /* Input and output regions inside the temporary buffer */
    cufftReal *dev_result = (cufftReal*)ps->cuda_temp_buffer;
    float *dev_final = dev_result + CP_BLOCK * fft_pitch;

    /* Per-point values of this batch */
    float *sumbuf = ps->cuda_points + icp;
    float *stdbuf = ps->cuda_points + points_pitch + icp;

    /* Scaling factors handed to vecCompute (integer products, then a double division) */
    double result_scale = 1./(fft_real_size * fft_real_size * (side2 - 1));
    double sum_scale = 1. / (side2 * (side2 - 1));

    dim3 compute_grid(grid_fft2, grid_cp, 1);
    vecCompute<<<compute_grid, block_side_cp, 0, stream0>>>(
        dev_final, fft_size,
        dev_result, fft_real_size, result_scale,
        ps->cuda_lsum_cache + icp*fft_pitch, sumbuf, sum_scale,
        ps->cuda_denom_cache + icp*fft_pitch, stdbuf,
        fft_pitch, grid_fft
    );

    /*
     * Looking for the maximum. The position / maximum scratch arrays reuse
     * the start of the temporary buffer, and the outputs reuse the
     * per-point arrays. Both kernels receive fft_size (not fft_real_size)
     * for their two size arguments.
     */
    float *xbuf = sumbuf;
    float *ybuf = stdbuf;

    int32_t *posbuf = (int*)ps->cuda_temp_buffer;
    float *maxbuf = (float*)(posbuf + CP_BLOCK*input_side);

    dim3 search_grid(grid_fft, grid_cp, 1);
    find_max1<<<search_grid, block_side_cp, 0, stream0>>>(maxbuf, posbuf, dev_final, fft_pitch, fft_size, fft_size);
    find_max2<<<grid_cp_1d, BLOCK_SIZE_1D, 0, stream0>>>(xbuf, ybuf, maxbuf, posbuf, dev_final, fft_pitch, fft_size, fft_size, 3 * ps->corr_size + 1,  ps->corr_size - 1);

    return 0;
}

/*
 * Queues the frequency-domain part of the processing on `stream0` (both
 * cuFFT plans are attached to that stream):
 *   1. forward R2C transform of every non-banned fragment in
 *      cuda_data_buffer;
 *   2. vecMul with the cached FFTs of the batch (ps->cuda_fft_cache
 *      starting at control point icp);
 *   3. inverse C2R transform of every non-banned fragment into the start
 *      of cuda_temp_buffer.
 *
 * Nothing is synchronized here. Always returns 0.
 */
static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {
    int fft_side = ps->fft_real_size;
    int fft_pitch = ps->fft_alloc_size;

    uint8_t *banned = ps->banlist + icp;
    float *dev_data = ps->cuda_data_buffer;

    /*
     * The first in-place transform fails for some reason, therefore the
     * complex buffer starts one fft_alloc_size element after the real
     * result buffer, which sits at the start of the temporary buffer.
     */
    cufftReal *dev_result = (cufftReal*)ps->cuda_temp_buffer;
    cufftComplex *dev_spectrum = ((cufftComplex*)ps->cuda_temp_buffer) + fft_pitch;

    cufftSetStream(ps->cufft_r2c_plan, stream0);
    cufftSetStream(ps->cufft_c2r_plan, stream0);

    /* Forward transforms */
    for (int i = 0; i < ncp; i++) {
        if (!banned[i]) {
            cufftExecR2C(ps->cufft_r2c_plan, dev_data + i * fft_pitch, dev_spectrum + i * fft_pitch);
        }
    }

    /* Multiplication with the cached spectra */
    int half_width = fft_side / 2 + 1;
    int grid_cp = calc_blocks(ncp, CP_BLOCK_SIZE);
    int grid_complex = calc_blocks(fft_side * half_width, SIDE_BLOCK_SIZE);

    dim3 mul_grid(grid_complex, grid_cp, 1);
    vecMul<<<mul_grid, block_side_cp, 0, stream0>>>(dev_spectrum, ps->cuda_fft_cache + icp*fft_pitch, fft_pitch, half_width);

    /* Inverse transforms */
    for (int i = 0; i < ncp; i++) {
        if (!banned[i]) {
            cufftExecC2R(ps->cufft_c2r_plan, dev_spectrum + i * fft_pitch, dev_result + i * fft_pitch);
        }
    }

    return 0;
}


/*
 * Downloads the two per-point result arrays from the device and turns them
 * into new control point coordinates.
 *
 * The device values are copied into arrays 6/7 of ps->points. Source
 * coordinates come from ps->res_x / ps->res_y when ps->stored is set and
 * from arrays 2/3 of ps->points otherwise. Results go to ps->res_x /
 * ps->res_y when both are set (ps->stored is then raised), otherwise they
 * overwrite the source arrays. Banned points keep their coordinates.
 *
 * Always returns 0.
 */
static inline int fftGetCurrentPoints(DICTContext ps) {
    int ncp = ps->ncp;
    int pitch = ps->ncp_alloc_size;
    int precision = ps->precision;

    /* Blocking device-to-host copy of two rows of ncp floats each */
    float *row0 = ps->points + 6 * pitch;
    float *row1 = row0 + pitch;

    cudaMemcpy2D(
        row0, pitch * sizeof(float),
        ps->cuda_points, pitch * sizeof(float),
        ps->ncp * sizeof(float), 2,
        cudaMemcpyDeviceToHost
    );

    /*
     * In non-matlab mode the image data is copied non-transposed, while the
     * processing code only supports matlab mode and treats it as transposed
     * data. Therefore the X and Y shifts are swapped back here, which costs
     * some extra precision. It is better to use matlab mode until the
     * computation code is changed (non-matlab mode only exists to accept
     * images from user applications without transposing).
     */
    float *move_x = ps->matlab_mode ? row0 : row1;
    float *move_y = ps->matlab_mode ? row1 : row0;

    /* Source coordinates; must be selected before ps->stored is updated */
    float *data_x = ps->stored ? ps->res_x : (ps->points + 2 * pitch);
    float *data_y = ps->stored ? ps->res_y : (ps->points + 3 * pitch);

    /* Destination of the new coordinates */
    float *res_x = data_x;
    float *res_y = data_y;
    if ((ps->res_x)&&(ps->res_y)) {
        res_x = ps->res_x;
        res_y = ps->res_y;

        ps->stored = 1;
    }

    float *frac_x = ps->points + 4 * pitch;
    float *frac_y = frac_x + pitch;
    uint8_t *banlist = ps->banlist;

    for (int i = 0; i < ncp; i++) {
        if (banlist[i]) {
            /* Banned point: pass the coordinates through unchanged */
            res_x[i] = data_x[i];
            res_y[i] = data_y[i];
        } else {
            /* X is read and written before Y is touched, as the arrays may alias */
            float cur = data_x[i];
            float frac = cur - round(cur*precision)/precision;
            res_x[i] = (cur - move_x[i]) + (frac_x[i] - frac);

            cur = data_y[i];
            frac = cur - round(cur*precision)/precision;
            res_y[i] = (cur - move_y[i]) + (frac_y[i] - frac);
        }
    }

    return 0;
}