/*
 * normxcorr/trunk : contents of cuda/normxcorr (revision 7)
 *
 * To get this branch, use:
 *
 *     bzr branch http://suren.me/webbzr/normxcorr/trunk
 */
#include <stdio.h>
#include <stdlib.h>

#include <mex.h>

#include "normxcorr_hw.h"
#include "local_sum.h"

#include "normxcorr_hw_msg.h"
#include "normxcorr_hw_kernel.cu"

/*
 * Small arithmetic helpers. They are plain macros, so every argument may be
 * evaluated more than once: pass side-effect free expressions only.
 */
/* Largest of three values (expands to two nested max2 invocations). */
#define max3(a,b,c) max2(max2(a,b),c)
/* Larger of two values. */
#define max2(a,b) (((a)>(b))?(a):(b))
/* Smaller of two values. */
#define min2(a,b) (((a)<(b))?(a):(b))

/*
 * Rounds `size` up to the nearest multiple of `rounding`; a value that is
 * already a multiple is returned unchanged. Note that the result is a size,
 * not a number of blocks.
 */
#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))

/*
 * The single processing state shared by all mexFunction() calls. It is
 * created by the initialization call and released by the clean request or
 * by selfClean(); NULL means that no instance is currently active.
 */
static TProcessingState *pstate = NULL;

/*
 * Releases everything fftInit() has acquired: the device buffers, the pinned
 * host buffers, the CUDPP scan plan and both cuFFT plans.
 *
 * The buffers are released only if cuda_base_buffer is set, since it is the
 * first buffer allocated by fftInit(). Every pointer is reset to NULL right
 * after it is released, so calling the function again (for example from a
 * later fftInit() that fails half-way) never frees the same memory twice.
 * The plans are guarded by their own `*_initialized` flags.
 */
static void fftFree(TProcessingState *ps) {
    if (ps->cuda_base_buffer) {
	cudaFree(ps->cuda_lsum_temp);
	ps->cuda_lsum_temp = NULL;

	cudaFree(ps->cuda_lsum_buffer);
	ps->cuda_lsum_buffer = NULL;
	cudaFree(ps->cuda_denom_buffer);
	ps->cuda_denom_buffer = NULL;

	cudaFree(ps->cuda_temp_buffer);
	ps->cuda_temp_buffer = NULL;
	cudaFree(ps->cuda_final_buffer);
	ps->cuda_final_buffer = NULL;
	cudaFree(ps->cuda_result_buffer);
	ps->cuda_result_buffer = NULL;
	cudaFree(ps->cuda_data_buffer);
	ps->cuda_data_buffer = NULL;
	cudaFree(ps->cuda_base_buffer);
	ps->cuda_base_buffer = NULL;
	cudaFree(ps->cuda_input_buffer);
	ps->cuda_input_buffer = NULL;

	    // input_buffer, data_x and data_y are pinned host memory
	cudaFreeHost(ps->input_buffer);
	ps->input_buffer = NULL;

	cudaFree(ps->cuda_cp);
	ps->cuda_cp = NULL;

	cudaFreeHost(ps->data_x);
	ps->data_x = NULL;
	cudaFreeHost(ps->data_y);
	ps->data_y = NULL;
    }

	// DS: Source of bug, that occasionally can corrupt something ...
    if (ps->cudpp_initialized) {
	cudppDestroyPlan(ps->cudpp_plan);
	ps->cudpp_initialized = false;
    }

    if (ps->fft_initialized) {
	cufftDestroy(ps->cufft_r2c_plan);
	cufftDestroy(ps->cufft_c2r_plan);
//	cufftDestroy(ps->cufft_plan);
//	cublasShutdown();
	ps->fft_initialized = false;
    }

}

#include <unistd.h>
/*
 * (Re)creates all GPU resources for the geometry currently stored in `ps`
 * (ncp, fft_size, the *_alloc_size fields): both cuFFT plans, the CUDPP scan
 * plan, the device buffers and the pinned host buffers.
 *
 * Anything left from a previous configuration is released first. On any
 * failure the error is reported, everything acquired so far is released by
 * fftFree() and an error code is returned.
 *
 * Returns 0 on success, ERROR_CUFFT, ERROR_CUDPP, ERROR_CUDA_MALLOC or
 * ERROR_MALLOC otherwise.
 */
static int fftInit(TProcessingState *ps) {
    CUDPPConfiguration cudpp_config;
    
    CUDPPResult cudpp_err;
    cufftResult cufft_err;
    cudaError cuda_err;

    size_t size;
    int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;
    int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;

	// Widened once, so that the byte counts below are computed in size_t
	// and large configurations do not overflow the int arithmetic
    size_t ncp = ps->ncp;
    size_t fft_alloc_size = ps->fft_alloc_size;
    
    fftFree(ps);

//    cublasInit();
//    cufftPlan2d(&ps->cufft_plan, ps->fft_size, ps->fft_size, CUFFT_C2C);

    cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);
    if (cufft_err) {
	reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);
	return ERROR_CUFFT;
    }	
    
    cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_size, ps->fft_size, CUFFT_C2R);
    if (cufft_err) {
	reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);
	    // fft_initialized is not set yet, so fftFree() would not destroy it
	cufftDestroy(ps->cufft_r2c_plan);
	return ERROR_CUFFT;
    }

    ps->fft_initialized = true;

    cudpp_config.algorithm = CUDPP_SCAN;
    cudpp_config.options = CUDPP_OPTION_FORWARD |  CUDPP_OPTION_INCLUSIVE;
    cudpp_config.op = CUDPP_ADD;
    cudpp_config.datatype = CUDPP_FLOAT;

    cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);
    if (cudpp_err != CUDPP_SUCCESS) {
	reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);
	fftFree(ps);
	return ERROR_CUDPP;
    }
    
    ps->cudpp_initialized = true;

	// cuda_base_buffer has to stay the first allocation: fftFree() uses it
	// to decide if there are buffers to release
    cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ncp * fft_alloc_size * sizeof(cufftComplex));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_base_buffer is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }


	// The data buffer is shared by several processing stages, so it is sized
	// for the most demanding one
    size = max3(
	fft_alloc_size * sizeof(cufftComplex),					/* FFT multiplication */
	2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t),			/* Sum, Std computations */
	CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float))	/* Max of correlation */
    );

    cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, size);
    if (cuda_err) {
	reportError("Device memory allocation of %u*cufftComplex bytes for cuda_data_buffer is failed", ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaHostAlloc((void**)&ps->data_x, ncp * sizeof(float), 0);
    if (!cuda_err) cuda_err = cudaHostAlloc((void**)&ps->data_y, ncp * sizeof(float), 0);
    if (cuda_err) {
	reportError("Host memory allocation of 2*%u*float bytes for control points is failed", ps->ncp);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

	// Two rows (x and y), each padded to a multiple of CP_BLOCK
    cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * calc_alloc(ps->ncp, CP_BLOCK) * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of 2*%u*float bytes for cuda_cp is failed", calc_alloc(ps->ncp, CP_BLOCK));
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, ncp * side_alloc_size2 * sizeof(uint8_t));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaHostAlloc((void**)&ps->input_buffer, ncp * fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
    if (cuda_err) {
	reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_final_buffer, ncp * fft_alloc_size * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*float bytes for cuda_final_buffer is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_final_buffer, 0, ncp * fft_alloc_size * sizeof(float));

    cuda_err = cudaMalloc((void**)&ps->cuda_result_buffer, ncp * fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
	reportError("Device memory allocation of %u*cufftReal bytes for cuda_result_buffer is failed", ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, fft_alloc_size * sizeof(cufftReal));
    if (cuda_err) {
	reportError("Device memory allocation of %u*cufftReal bytes for cuda_temp_buffer is failed", ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }
    cudaMemset((void*)ps->cuda_temp_buffer, 0, fft_alloc_size * sizeof(cufftReal));

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ncp * fft_alloc_size * sizeof(float) + (size_t)lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_buffer is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ncp * fft_alloc_size * sizeof(float) + (size_t)lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_buffer is failed", ps->ncp, ps->fft_alloc_size);
	fftFree(ps);
	return ERROR_CUDA_MALLOC;
    }

    cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * (size_t)lsum_alloc_size2 * sizeof(float));
    if (cuda_err) {
	reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);
	fftFree(ps);
	    // NOTE(review): the other device allocations report ERROR_CUDA_MALLOC;
	    // the code is kept as is, since callers may depend on it
	return ERROR_MALLOC;
    }
	// We need to zero temporary buffers as well, since we are not computing
	// cumsum of complete matrix, but non-zero part of it
    cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * (size_t)lsum_alloc_size2 * sizeof(float));

    
    return 0;
}

/*
 * Zeroes the real-valued temporary device buffer (fft_alloc_size elements).
 * Does nothing while the FFT engine is not initialized.
 */
static void fftPrepare(TProcessingState *ps) {
    if (!ps->fft_initialized) return;

	// The template and the current image have different neighbourhood
	// sizes, so the buffer is cleared completely
    const size_t temp_bytes = ps->fft_alloc_size * sizeof(cufftReal);
    cudaMemset((void*)ps->cuda_temp_buffer, 0, temp_bytes);
}



/*
 * Destroys a processing state: releases its GPU resources and then the
 * structure itself. A NULL pointer is accepted and ignored.
 */
void pstateFree(TProcessingState *ps) {
    if (!ps) return;

    fftFree(ps);
    free(ps);
}

/*
 * Allocates an empty processing state: no control points, no buffers and
 * no plans.
 *
 * The structure is zero-filled, so every buffer pointer starts as NULL.
 * This matters because fftFree() may be called after a partially failed
 * fftInit() and then passes the not yet allocated pointers to cudaFree().
 *
 * Returns the new state or NULL if the host allocation fails.
 */
TProcessingState *pstateInit() {
    TProcessingState *ps;
    
    ps = (TProcessingState*)calloc(1, sizeof(TProcessingState));
    if (ps) {
	ps->ncp = 0;
	ps->cuda_base_buffer = NULL;
	ps->fft_initialized = false;
	ps->cudpp_initialized = false;
    }

    return ps;
}


/*
 * Uploads the base (template) fragment of control point `icp` and
 * precomputes everything that depends on the template only: the packed
 * real image, the local sums / denominators (local_sum) and the forward FFT.
 *
 *  ps   - processing state, has to be configured by fftInit()
 *  icp  - zero-based index of the control point
 *  data - square matrix of 8 bit values (the data is read as uint8_t)
 *
 * Returns the device pointer to the FFT of the template or NULL if the
 * engine is not initialized, the matrix has an unsupported size or a CUDA /
 * cuFFT call fails (the error is reported).
 */
static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {
    uint8_t *dataPtr;
    cudaError cuda_err;
    cufftResult cufft_err;

    if (!ps->fft_initialized) {
	reportError("cuFFT engine is not initialized yet");
	return NULL;
    }
    
    int size = ps->fft_size;
    int alloc_size = ps->fft_alloc_size;

    int N = mxGetM(data);
    int N2 = N * N;

    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

	// N*N bytes are read from the matrix and written into a slot of
	// side_alloc*side_alloc bytes, so anything else would read past the
	// host array or overflow the device buffer.
	// NOTE(review): the packing kernel may require an even tighter bound
	// (N <= fft_size) - confirm against vecPackBase.
    if ((N < 1)||(mxGetN(data) != mxGetM(data))||(N > side_alloc)) {
	reportError("Invalid size of base matrix (%ix%i), a square matrix with side between 1 and %i is expected", (int)mxGetM(data), (int)mxGetN(data), side_alloc);
	return NULL;
    }

    dim3 input_block_dim(N, 1, 1);
    dim3 input_grid_dim(N, 1, 1);

    uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;
    cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;
    cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

    dataPtr = (uint8_t*)mxGetData(data);
    cuda_err = cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
    if (cuda_err) {
	reportError("Uploading of base data to the device is failed: %s", cudaGetErrorString(cuda_err));
	return NULL;
    }

	// The temporary area consists of four planes of lsum_alloc_size^2 floats:
	// planes 0 and 1 are filled by vecPackBase, planes 2 and 3 are cleared
	// here and used as scratch space by local_sum
    float *lsum_temp = ps->cuda_lsum_temp;
    int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

    cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));
    cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

    vecPackBase<<<input_grid_dim, input_block_dim>>>(
	cudaInputPtr, N, 
	cudaRealPtr, size, 
	lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size
    );

	// A kernel launch does not return a status, it has to be queried
    cuda_err = cudaGetLastError();
    if (cuda_err) {
	reportError("Packing of base data is failed: %s", cudaGetErrorString(cuda_err));
	return NULL;
    }

	// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
    local_sum(ps, 
	ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,
	lsum_temp + (2 * step), lsum_temp + (3 * step),
	lsum_temp, lsum_temp + step);

    cufft_err = cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
    if (cufft_err) {
	reportError("FFT of base data is failed, cufft code: %i", cufft_err);
	return NULL;
    }

    return cudaPtr;
}


/*
 * Downloads the control point data from the device into `result`.
 * Two rows are copied: on the device they are ncp_alloc_size floats apart,
 * on the host they are stored back to back with ncp floats each.
 */
static inline void fftGetPoints(TProcessingState *ps, mxArray *result) {
    const size_t host_row_bytes = ps->ncp * sizeof(float);
    const size_t device_row_bytes = ps->ncp_alloc_size * sizeof(float);

    cudaMemcpy2D(
	mxGetData(result), host_row_bytes,
	ps->cuda_cp, device_row_bytes,
	host_row_bytes, 2,
	cudaMemcpyDeviceToHost
    );
}

/*
static inline mxArray *fftSetPoints(TProcessingState *ps, mxArray *result) {
}
*/

/*
 * Processes a block of `ncp` control points starting at index `icp`.
 *
 * For every point the (2*corr_size+1)^2 fragment around its current
 * position is cut out of `image`, uploaded, transformed and multiplied with
 * the precomputed FFT of the template. Afterwards the statistics of the
 * fragments are computed (stat1/stat2), the correlation is normalized
 * (vecCompute) and its maximum is searched (find_max1/find_max2); the
 * search writes into the cuda_cp rows at offset `icp`.
 *
 *  ps    - configured processing state; data_x / data_y hold the one-based
 *          positions of the control points
 *  icp   - index of the first control point of the block
 *  ncp   - number of control points in the block
 *  image - image with 8 bit values (the data is read as uint8_t)
 *
 * Control points whose fragment does not fit into the image are skipped.
 * Always returns 0.
 */
static int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {
    int width = mxGetN(image);
    int height = mxGetM(image);

    int half_size = ps->corr_size;
    int size = 2 * half_size + 1;
    int size2 = size * size;

    int fft_size = ps->fft_size;
    int fft_size2 = fft_size * fft_size;
    int alloc_size = ps->fft_alloc_size;
    int side_alloc = ps->side_alloc_size;
    int side_alloc2 = side_alloc * side_alloc;

    uint8_t *fullimg = ((uint8_t*)mxGetData(image));
    uint8_t *img = ps->input_buffer;

    cufftComplex *cudaDataPtr = (cufftComplex*)ps->cuda_data_buffer;
    cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

    dim3 input_block_dim(size, 1, 1);
    dim3 input_grid_dim(size, 1, 1);
    dim3 block_dim(fft_size / 2 + 1, 1, 1);
    dim3 grid_dim(fft_size, 1, 1);

    for (int i = 0;i < ncp;i++) {
	    // Positions are one-based (Matlab convention)
	float x = ps->data_x[i+icp] - 1;
	float y = ps->data_y[i+icp] - 1;
    
	int xstart = roundf(x) - half_size;
	int ystart = roundf(y) - half_size;
    
	int xend = xstart + size;
	    // Has to be derived from ystart: with xstart the vertical bound
	    // was checked against the horizontal position
	int yend = ystart + size;

	    // NOTE(review): xend / yend are exclusive bounds, so '>=' also
	    // rejects fragments touching the last column / row - confirm intent
	if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
		// Somehow mark we have skipped it
	    continue;
	}

	    // The image is stored column by column: 'height' is the distance
	    // between columns, the fragment is gathered into a dense block
	cudaMemcpy2D(
	    img,
	    size * sizeof(uint8_t),
	    fullimg + (xstart * height + ystart),
	    height * sizeof(uint8_t),
	    size * sizeof(uint8_t),
	    size,
	    cudaMemcpyHostToHost
	);


        cufftComplex *cudaBasePtr = ps->cuda_base_buffer + (i+icp) * alloc_size;
	cufftReal *cudaResultPtr = ps->cuda_result_buffer + (i+icp) * alloc_size;
        
	    // The input slots are indexed within the block (i, not i+icp):
	    // stat1 below reads them starting from the beginning of the buffer
	uint8_t *cudaInputPtr = ps->cuda_input_buffer + i*side_alloc2;

	cudaMemcpy2D(
	    cudaInputPtr, side_alloc * sizeof(uint8_t),
	    img, size * sizeof(uint8_t),
	    size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
	);

	vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, fft_size, cudaInputPtr, side_alloc, size);

	cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);

	vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaBasePtr, fft_size/2+1);

	cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaResultPtr);
    }


	// Numbers of blocks, rounded up
    int cp_blocks1, cp_blocks, side_blocks;
    if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;
    else cp_blocks = ncp / CP_BLOCK_SIZE;

    if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;
    else side_blocks = size / SIDE_BLOCK_SIZE;

	// The data buffer is not needed for the FFT any more and is reused
	// for the intermediate statistics
    int32_t *stat_buf = (int*)ps->cuda_data_buffer;

    float *sumbuf = ps->cuda_cp + icp;
    float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;

    dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
    dim3 joint_grid_dim(side_blocks, cp_blocks, 1);
    
    stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

    if (ncp%BLOCK_SIZE_1D) cp_blocks1 = (ncp / BLOCK_SIZE_1D) + 1;
    else cp_blocks1 = ncp / BLOCK_SIZE_1D;

    stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);


    dim3 output_block_dim(fft_size, 1, 1);
    dim3 output_grid_dim(fft_size, 1, 1);

    for (int i = 0;i < ncp;i++) {
	float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;
        float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;
	cufftReal *cudaRealPtr = ps->cuda_result_buffer + (i+icp)*alloc_size;
	float *cudaResultPtr = ps->cuda_final_buffer + (i+icp)*alloc_size;

	vecCompute<<<output_grid_dim, output_block_dim>>>(
	    cudaResultPtr,
	    cudaRealPtr, 1./(fft_size2 * (size2 - 1)),
	    cudaLSum, sumbuf+i, 1. / (size2 * (size2 - 1)),
	    cudaDenom, stdbuf+i,
	    fft_size
	);
    }


	// The statistics are consumed, the same places of cuda_cp now
	// receive the results of the maximum search
    float *xbuf = sumbuf;
    float *ybuf = stdbuf;

    int32_t *posbuf = (int*)ps->cuda_data_buffer;
    float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

	// NOTE(review): calc_alloc gives fft_size rounded up to a multiple of
	// BLOCK_SIZE_2D, not a number of blocks - verify against find_max1
    int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);

    dim3 result_block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
    dim3 result_grid_dim(fft_blocks, cp_blocks, 1);
    find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);
    find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1,  ps->corr_size - 1);


    return 0;
}

#ifdef VALIDATE_PEAK
/*
 * Validation helper: processes the single control point `icp` and returns
 * its correlation matrix as a fft_size x fft_size single precision Matlab
 * matrix (the first fft_size^2 values of the point's slot in the final
 * buffer).
 */
static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {
    int size = ps->fft_size;
    int size2 = size * size;
    int alloc_size = ps->fft_alloc_size;
    float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

    fftLoadFragment(ps, icp, 1, image);

    mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);
	// mxGetPr is meant for double matrices, single precision data has to
	// be accessed with mxGetData (as it is done everywhere else here)
    float *ar = (float*)mxGetData(res);

    cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

    return res;
}
#endif /* VALIDATE_PEAK */

/*
static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {
  cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);
}    
*/


/*
 * Exit handler (registered with mexAtExit): destroys the processing state
 * if one is still alive.
 */
static void selfClean() {
    if (!pstate) return;

    reportMessage("Self-cleaning normxcorr_hw instance");

    pstateFree(pstate);
    pstate = NULL;
}


/*
 * Matlab entry point. The number of arguments selects the operation:
 *
 *  no arguments  - initialization. Returns an int32 identifier: 1 if a CUDA
 *                  device with compute capability 1.3 or above is found (the
 *                  processing state is created), 0 if the capabilities of the
 *                  device are below 1.3, -1 if there is no usable device.
 *  one argument  - clean request, the processing state is destroyed.
 *  two and more  - prhs[1] holds the action code (TAction), the remaining
 *                  arguments depend on the action:
 *      ACTION_SETUP         (ncp, corrsize); optionally returns the error code
 *      ACTION_SET_POINTS    (x, y) single precision arrays with ncp elements
 *      ACTION_COMPUTE_BASE  (icp, base) one-based index and uint8 matrix
 *      ACTION_PREPARE       no arguments
 *      ACTION_COMPUTE       (image) uint8 matrix
 *      ACTION_GET_POINTS    no arguments, returns ncp x 2 single matrix
 *
 * All failures are reported with reportError() and the call is abandoned.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    int err;
    int64_t *errPtr;
    
	// Stays 0 if the query fails, i.e. no device is assumed
    int deviceCount = 0;
    cudaDeviceProp deviceProp;
    
    mxArray *idMatrix;
    int32_t id, *idPtr;

    TProcessingState *ps;

    TAction action;

    int iprop;
    
    const mxArray *input;
    const mxArray *base;

#ifdef VALIDATE_LSUM
    const mxArray *lsum;
    const mxArray *denom;
    const mxArray *nonzero;
#endif /* VALIDATE_LSUM */

    const mxArray *x, *y;
    mxArray *points;

    unsigned int icp;

    if (!nrhs) {
	reportMessage("Initializing normxcorr_hw instance");

	if (nlhs != 1) {
	    reportError("You should accept a single result from initialization call");
	    return;
	}
	
	if (pstate) {
	    reportError("Only a single calculation process is supported at the moment");
	    return;
	}

	// Initialising, for now a single client is supported only

	idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);
	if (!idMatrix) {
	    reportError("Initialization is failed");
	    return;
	}


	// Detecting cuda devices

	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) deviceCount = 0;

	if (deviceCount > 0) {
	    if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess) {
		    // The device can't be queried, so it can't be used either
		id = -1;
	    } else if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {
		id = 1;
	    } else { // Hardware capabilities are below 1.3
		id = 0;
	    }
	} else { // No cuda device, using software
	    id = -1;
	}

	
	if (id > 0) {
	    pstate = pstateInit();
	    if (!pstate) {
		mxDestroyArray(idMatrix);
	        reportError("State structure initialization is failed");
	        return;
	    }
	} else {
	    pstate = NULL;
	}


	idPtr = (int32_t*)mxGetData(idMatrix);
	idPtr[0] = id;
	
	plhs[0] = idMatrix;
	
	mexAtExit(selfClean);

	return;
    } else {
/*
	idMatrix = (mxArray*)prhs[0];
	if ((mxGetClassID(idMatrix) != mxINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {
	    reportError("Invalid parameter is supplied in place of process identificator");
	    return;
	}

	idPtr = (int32_t*)mxGetData(idMatrix);
	if (!idPtr) {
	    reportError("Mex is not able to obtain process identificator");
	    return;
	}
	
	id = *idPtr;
	if (id != 1) {
	    reportError("Invalid process identificator is supplied");
	    return;
	}

        if (!pstate) {
	    reportError("The interface is not initialized");
	    return;
	}
*/
    }

	// Clean request
    if (nrhs == 1) {
	reportMessage("Cleaning normxcorr_hw instance");

	pstateFree(pstate);
	pstate = NULL;

	return;
    }


    ps = pstate;

	// There is no state before the initialization call and if no suitable
	// CUDA device is found, all actions below would dereference NULL
    if (!ps) {
	reportError("The interface is not initialized");
	return;
    }
    
    action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));

//    reportMessage("Executing normxcorr_hw action: %u", action);

    switch (action) {
#ifdef VALIDATE_PEAK
     case ACTION_COMPUTE_FRAGMENT:
	if (nrhs != 4) {
	    reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);
	    return;
	}

	if (!ps->fft_initialized) {
	    reportError("The engine is not configured, SETUP action should be executed first");
	    return;
	}

	icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
	if (icp >= ps->ncp) {
	    reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
	    return;
	}

	plhs[0] = fftCompute(ps, icp, prhs[3]);
     break;
#endif /* VALIDATE_PEAK */
     case ACTION_COMPUTE:
	if (nrhs != 3) {
	    reportError("This action expects 1 argument, but %i is passed", nrhs - 2);
	    return;
        }

	    // The buffers exist only after a successful SETUP
	if (!ps->fft_initialized) {
	    reportError("The engine is not configured, SETUP action should be executed first");
	    return;
	}

	input = prhs[2];
	
	if (mxGetClassID(input) != mxUINT8_CLASS) {
	    reportError("Invalid type of image data, should be 8bit integers");
	    return;
	}
	
	    // The control points are processed in blocks of CP_BLOCK points
	for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
	    err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);
	    if (err) {
		reportError("Computation is failed for control points starting from %u, error code: %i", icp, err);
		return;
	    }
	}
	
     break;
     case ACTION_COMPUTE_BASE:
	if ((nrhs != 4)
#ifdef VALIDATE_LSUM
	    &&(nrhs != 7)
#endif /* VALIDATE_LSUM */
	) {
	    reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);
	    return;
        }

	    // Matlab indexes are one-based; 0 wraps around and is rejected below
	icp = (unsigned int)mxGetScalar(prhs[2]) - 1;
	if (icp >= ps->ncp) {
	    reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);
	    return;
	}

	base = prhs[3];
    
	if (mxGetNumberOfDimensions(base) != 2) {
	    reportError("Invalid dimensionality of base matrix, 2D matrix is expected");
	    return;
	}

	if (mxGetClassID(base) != mxUINT8_CLASS) {
	    reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));
	    return;
	}

	iprop = ps->fft_size;

#ifdef VALIDATE_LSUM
	if (nrhs == 7) {
	    lsum = prhs[4];
	    denom = prhs[5];
	    nonzero = prhs[6];
	    if (
		(mxGetNumberOfDimensions(lsum) != 2)||
		(mxGetNumberOfDimensions(denom) != 2)||
		(mxGetClassID(lsum) != mxSINGLE_CLASS)||
		(mxGetClassID(denom) != mxSINGLE_CLASS)||
		(mxGetClassID(nonzero) != mxUINT16_CLASS)||
		(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||
		(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)
	    
	    ) {
		reportError("Invalid properties for base initialization are specified");
		return;
	    }
	} else {
	    lsum = NULL;
	    denom = NULL;
	    nonzero = NULL;
	}
#endif /* VALIDATE_LSUM */
	
	    // Reports the problems itself, the returned pointer is not needed
	fftUploadBaseData(ps, icp, base);

#ifdef VALIDATE_LSUM
	local_sum_validate(ps, icp, lsum, denom);
#endif /* VALIDATE_LSUM */

     break;
     case ACTION_SET_POINTS:
	if (nrhs != 4) {
	    reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");
	    return;
	}

	    // data_x / data_y are allocated by SETUP
	if (!ps->fft_initialized) {
	    reportError("The engine is not configured, SETUP action should be executed first");
	    return;
	}

        x = prhs[2];
	y = prhs[3];
	
	if (	(mxGetClassID(x) != mxSINGLE_CLASS)||
		(mxGetClassID(y) != mxSINGLE_CLASS)||
		(mxGetN(x)*mxGetM(x) != ps->ncp)||
		(mxGetN(y)*mxGetM(y) != ps->ncp)
	) {
	    reportError("Invalid control points are specified");
	    return;
	}
	
	memcpy(ps->data_x, mxGetData(x), ps->ncp * sizeof(float));
	memcpy(ps->data_y, mxGetData(y), ps->ncp * sizeof(float));
     break;
     case ACTION_GET_POINTS:
        if (nrhs != 2) {
	    reportError("GetPoints action do not expect any arguments");
	    return;
	}
        if (nlhs != 1) {
	    reportError("GetPoints action returns a single matrix");
	    return;
	}

	    // cuda_cp is allocated by SETUP
	if (!ps->fft_initialized) {
	    reportError("The engine is not configured, SETUP action should be executed first");
	    return;
	}
	
	points = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);
	fftGetPoints(ps, points);
	plhs[0] = points;
     break;     
     case ACTION_SETUP:
	if (nrhs != 4) {
	    reportError("SETUP action expects 'ncp' and 'corrsize' parameters");
	    return;
	}

	ps->ncp = (int)mxGetScalar(prhs[2]);

	    // All sizes are derived from the correlation size
	iprop = (int)mxGetScalar(prhs[3]);
	ps->corr_size = iprop;
	ps->fft_size = 6 * iprop + 1;
	ps->subimage_size = ps->corr_size * 4 + 1;

	    // Allocation sizes are rounded up to the block sizes of the kernels
	ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
	ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
	ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

	ps->lsum_size = ps->corr_size * 2 + 1;
	ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
    
	ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
	ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
	ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);
	
	err = fftInit(ps);

	    // The error code is handed over only if the caller asks for it
	if (nlhs == 1) {
	    idMatrix = mxCreateNumericMatrix(1, 1, mxINT64_CLASS, mxREAL);
	    if (idMatrix) {
		errPtr = (int64_t*)mxGetData(idMatrix);
		errPtr[0] = err;
		plhs[0] = idMatrix;
	    } else {
		reportError("Initialization of result matrix is failed");
	        return;
	    }
	}
     break;
     case ACTION_PREPARE:
        fftPrepare(ps);
     break;
     default:
        reportError("Unknown request %i", action);
    }
    
}