/*
 * normxcorr/trunk: contents of cuda/normxcorr_hw (revision 13)
 *
 * To get this branch, use:
 *     bzr branch http://suren.me/webbzr/normxcorr/trunk
 */

/*
 * In-place element-wise complex product: a[k] = a[k] * b[k].
 *
 * One thread handles one element. The row ("point") is taken from the y
 * launch dimension and the column from the x launch dimension; consecutive
 * rows are `pitch` elements apart in both arrays.
 *
 * `size` is accepted but not used, and no bounds check is performed here.
 */
static __global__ void vecMul(cuComplex *a, cuComplex *b, int pitch, int size) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int k = threadIdx.x + blockIdx.x * blockDim.x + row * pitch;

    // Fetch all four components before anything is stored back into a[k].
    const float ar = a[k].x;
    const float ai = a[k].y;
    const float br = b[k].x;
    const float bi = b[k].y;

    a[k].y = ar * bi + ai * br;
    a[k].x = ar * br - ai * bi;
}

/*
 * Copies a size x size window of 8-bit samples from `b` into the real
 * array `a`, rotating the window by 180 degrees on the way.
 *
 * Launch layout as used by the index math below:
 *   - the y launch dimension selects the "point" (one window per point);
 *     windows are `bpitch` elements apart in `b` and `apitch` apart in `a`;
 *   - blockIdx.x is a linear index over (row, column-block) pairs, with
 *     `blocks_size` column-blocks per row.
 *
 * b, bpitch, bsize : source samples, per-point stride, row stride
 * a, apitch, asize : destination, per-point stride, row stride
 * size             : side of the window that is actually copied
 * blocks_size      : number of x-blocks that make up one row
 */
static __global__ void vecPack(uint8_t *b, int bpitch, int bsize, cufftReal *a, int apitch, int asize, int size, int blocks_size) {
    int point = blockIdx.y * blockDim.y + threadIdx.y;

    // Split the linear block index with exact integer division. The previous
    // __float2int_rz(__fdividef(...)) form is only approximate: for an exact
    // multiple of blocks_size the quotient may come out just below the
    // integer, truncate to y-1 and push x past `size`, so the whole block
    // would silently skip its pixels.
    int block = blockIdx.x;
    int y = block / blocks_size;
    int x = (block - y * blocks_size) * blockDim.x + threadIdx.x;

    if ((x < size)&&(y < size)) {
        // Mirror both coordinates on the source side: 180 degree rotation.
        int i = (size - y - 1)*bsize + size - x - 1;
        a[point * apitch + y * asize + x] = b[point * bpitch + i];
    }
}

/*
 * Shared-memory variant of the 180-degree rotating pack, for the case where
 * the number of x-blocks per row is a power of two (1 << blocks_shift).
 *
 * Every thread first loads one 8-bit sample of source row `row` into a
 * shared tile (converted to float). After the barrier, the tile is written
 * out in reversed x order into the mirrored destination row
 * (size - row - 1), which gives the 180-degree rotation.
 *
 * The load is not bounds-checked; only the store is guarded.
 * The tile is sized by CP_BLOCK_SIZE x (SIDE_BLOCK_SIZE + 1); the extra
 * column is presumably padding against shared-memory bank conflicts.
 */
static __global__ void vecPackFast(uint8_t *b, int bpitch, int bsize, cufftReal *a, int apitch, int asize, int size, int blocks_shift) {
    __shared__ float data[CP_BLOCK_SIZE][SIDE_BLOCK_SIZE + 1];

    const int point = blockIdx.y * blockDim.y + threadIdx.y;

    // blockIdx.x encodes (row, x-block): the high bits are the row.
    const int row = blockIdx.x >> blocks_shift;
    const int tile_x = (blockIdx.x - (row << blocks_shift)) * blockDim.x;
    const int col = tile_x + threadIdx.x;

    // Stage the source sample; threadIdx.x varies along x only.
    data[threadIdx.y][threadIdx.x] = b[point * bpitch + row * bsize + col];

    // Every thread reaches this barrier before any tile element is read.
    __syncthreads();

    // Destination column of this thread once the tile has been mirrored.
    const int dst_col = size - tile_x - blockDim.x + threadIdx.x;

    if ((dst_col >= 0) && (row < size)) {
        const int mirrored = blockDim.x - threadIdx.x - 1;
        a[point * apitch + (size - row - 1) * asize + dst_col] = data[threadIdx.y][mirrored];
    }
}


/*
 * Converts a size x size window of 8-bit samples from `b` to float and
 * stores it three ways: the value itself into `a` and `c`, and its square
 * into `c2`.
 *
 * blockIdx.x is a linear index over (row, column-block) pairs with
 * `blocks_size` column-blocks per row.
 *
 * b, bsize      : source samples and their row stride
 * a, asize      : first destination and its row stride
 * c, c2, csize  : value / squared-value destinations, shared row stride
 * size          : side of the window that is processed
 * blocks_size   : number of x-blocks that make up one row
 */
static __global__ void vecBasePack(
    uint8_t *b, int bsize,
    cufftReal *a, int asize, 
    float *c, float *c2, int csize,
    int size, int blocks_size) {

    // Exact integer split of the linear block index. The former
    // __float2int_rz(__fdividef(...)) is approximate and can truncate an
    // exact multiple of blocks_size down to y-1, which pushes x out of range
    // and leaves that block's pixels unwritten.
    int block = blockIdx.x;
    int y = block / blocks_size;
    int x = (block - y * blocks_size) * blockDim.x + threadIdx.x;

    if ((x<size)&&(y<size)) {
        float v = b[x + y*bsize];

        a[x + y*asize] = v;

        int i = x + y*csize;
        c[i] = v;
        c2[i] = v * v;
    }
}

/*
 * Power-of-two variant of the base pack: the (row, x-block) pair is decoded
 * from blockIdx.x with a shift instead of a division, i.e. there are
 * (1 << blocks_shift) x-blocks per row.
 *
 * For every in-range pixel the 8-bit sample from `b` is converted to float
 * and written to `a` and `c`; its square is written to `c2`.
 */
static __global__ void vecBasePackFast(
    uint8_t *b, int bsize,
    cufftReal *a, int asize, 
    float *c, float *c2, int csize,
    int size, int blocks_shift) {

    const int row = blockIdx.x >> blocks_shift;
    const int col = (blockIdx.x - (row << blocks_shift)) * blockDim.x + threadIdx.x;

    // Threads that fall outside the size x size window have nothing to do.
    if ((col >= size) || (row >= size)) return;

    const float sample = b[col + row * bsize];
    const int out = col + row * csize;

    a[col + row * asize] = sample;
    c[out] = sample;
    c2[out] = sample * sample;
}


/*
 * First reduction stage for the image statistics.
 *
 * Each thread walks `size` samples of one image, starting at offset
 * `side_idx` and stepping by `row_pitch`, and accumulates the sum and the
 * sum of squares of the 8-bit samples. The two partial results are stored
 * at [side_idx * CP_BLOCK + img_idx] in buf1 (sum) and buf2 (sum of squares).
 *
 * side_idx comes from the x launch dimension, img_idx from the y launch
 * dimension; images are `image_pitch` bytes apart. No bounds check is done.
 */
static __global__ void stat1(int32_t *buf1, int32_t *buf2, uint8_t *img, int image_pitch, int row_pitch, int size) {
    const int side_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int img_idx = blockIdx.y * blockDim.y + threadIdx.y;

    const uint8_t *column = img + img_idx * image_pitch + side_idx;
    const int stop = size * row_pitch;

    int32_t total = 0;
    int32_t total_sq = 0;

    int offset = 0;
    while (offset < stop) {
        const int32_t sample = column[offset];
        total += sample;
        total_sq += sample * sample;
        offset += row_pitch;
    }

    const int slot = side_idx * CP_BLOCK + img_idx;
    buf1[slot] = total;
    buf2[slot] = total_sq;
}

/*
 * Second reduction stage for the image statistics.
 *
 * One thread per image (img_idx from the x launch dimension). It adds up
 * `size` partial sums from buf1 and `size` partial sums of squares from
 * buf2, both read at stride CP_BLOCK starting at img_idx, and writes:
 *   res1[img_idx] = total sum of samples
 *   res2[img_idx] = sqrt(max(E[v^2] - E[v]^2, 0)), with both expectations
 *                   taken over size*size samples
 */
static __global__ void stat2(float *res1, float *res2, int32_t *buf1, int32_t *buf2, int size) {
    int i;
    int end = size * CP_BLOCK;
    int img_idx =  blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    // The sum of squares is accumulated in 64 bits: with 8-bit samples a
    // 32-bit accumulator already overflows for size >= 182
    // (182*182*255*255 > 2^31 - 1), which would corrupt the deviation.
    int64_t sum2 = 0;

    int32_t *vec1 = buf1 + img_idx;
    int32_t *vec2 = buf2 + img_idx;

    for (i = 0; i < end; i+=CP_BLOCK) {
        sum += vec1[i];
        sum2 += vec2[i];
    }

    res1[img_idx] = sum;

    float cnt = size * size;
    float mean = ((float)sum) / cnt;

    // Clamp at zero so rounding can never feed a negative value to sqrtf.
    res2[img_idx] = sqrtf(fmaxf(((float)sum2) / cnt - mean*mean,0));
}

/*
 * Per-element combination of the correlation, local-sum and denominator
 * arrays:
 *
 *   res = (corr * corr_scale - lsum * lsum_scale) / (denom * denom_scale)
 *
 * where lsum_scale = lsum_scale_ptr[point] * lsum_mult and
 * denom_scale = denom_scale_ptr[point] are per-point factors.
 *
 * The element is left untouched when either denom[pos] or denom_scale is
 * zero. `size` is accepted but not used; no bounds check is performed.
 */
static __global__ void vecCompute(
    float *res,
    cufftReal *corr, float corr_scale, 
    float *lsum, float *lsum_scale_ptr, float lsum_mult,
    float *denom, float *denom_scale_ptr,
    int pitch, int size
) {
    const int point = blockIdx.y * blockDim.y + threadIdx.y;
    const int pos = threadIdx.x + blockIdx.x * blockDim.x + point * pitch;

    const float lsum_scale = lsum_scale_ptr[point] * lsum_mult;
    const float denom_scale = denom_scale_ptr[point];

    // A zero divisor means there is nothing to compute for this element.
    const float d = denom[pos];
    if ((d == 0.0f) || (denom_scale == 0.0f)) return;

    res[pos] = (corr[pos] * corr_scale - lsum[pos] * lsum_scale) / (d * denom_scale);
}

/*
 * First stage of the peak search.
 *
 * Each thread scans `size` values of one correlation image, starting at
 * offset `side_idx` and stepping by `row_pitch`, and keeps the largest value
 * strictly above 0.5 together with the step index where it was found.
 * If nothing exceeds 0.5 the stored value stays 0.5 and the index is 0.
 *
 * Results go to [side_idx * CP_BLOCK + img_idx] in buf1 (value) and
 * buf2 (step index); only threads with side_idx < size store anything.
 */
static __global__ void find_max1(float *buf1, int32_t *buf2, float *corr, int image_pitch, int row_pitch, int size) {
    const int side_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int img_idx = blockIdx.y * blockDim.y + threadIdx.y;

    const float *column = corr + img_idx * image_pitch + side_idx;
    const int stop = size * row_pitch;

    float best = 0.5f;          // acceptance limit used by cpcorr
    int32_t best_offset = 0;

    int offset = 0;
    while (offset < stop) {
        const float candidate = column[offset];
        if (candidate > best) {
            best = candidate;
            best_offset = offset;
        }
        offset += row_pitch;
    }

    // Original note: aligning the buffers would allow dropping this check.
    if (side_idx >= size) return;

    const int slot = side_idx * CP_BLOCK + img_idx;
    buf1[slot] = best;
    buf2[slot] = best_offset / row_pitch;
}

/*
 * Second stage of the peak search, with sub-pixel refinement.
 *
 * One thread per image (img_idx from the x launch dimension):
 *   1. Reduce the `size` first-stage candidates (buf1 values / buf2 indices,
 *      read at stride CP_BLOCK starting at img_idx) to the single largest
 *      value above 0.5.
 *      `ypos` is the index of the winning first-stage entry; `xpos` is the
 *      index stored in buf2 for it, which is the one multiplied by
 *      row_pitch when addressing `corr`.
 *   2. If a peak was found and both coordinates are closer than `limit` to
 *      `center`, fit a quadratic surface to the 3x3 neighbourhood and add
 *      the fitted offset, rounded to one decimal, to the integer position.
 *   3. Write (position + offset + 1 - center) to res1 / res2, or 0 / 0 when
 *      the peak is rejected.
 *
 * A fit whose offset is outside [-1, 1] or is not a number (degenerate
 * neighbourhood, 0/0) falls back to a zero offset.
 */
static __global__ void find_max2(
    float *res1, float *res2, float *buf1, int32_t *buf2, 
    float *corr, int image_pitch, int row_pitch,
    int size, float center, float limit
) {
    int i, j;
    int end = size * CP_BLOCK;
    int img_idx =  blockIdx.x * blockDim.x + threadIdx.x;

    float max = 0.5;	// This is limit for acceptance in cpcorr
    int32_t xpos = 0;
    int32_t ypos = 0;

    float *maxes = buf1 + img_idx;
    int32_t *poses = buf2 + img_idx;

    /* 
	These magic numbers are used to reimplement the position fitting
	over 9 neighbouring points (see findpeak.m). The array is the
	Moore-Penrose pseudoinverse (pinv) of matrix "X":
	    x = [-1 -1 -1  0  0  0  1  1  1]';
	    y = [-1  0  1 -1  0  1 -1  0  1]';
	    X = [ones(9,1),  x,  y,  x.*y,  x.^2,  y.^2];
	This matrix is, then, used to compute
	    A = X\u
	by formula
	    A = pinv(X)*u
	The table is read-only, hence const.
    */

    const float magic[54] = {
	-1.111111111111111e-01,     2.222222222222223e-01,    -1.111111111111110e-01,     2.222222222222222e-01,  5.555555555555555e-01,     2.222222222222222e-01,    -1.111111111111112e-01,     2.222222222222222e-01, -1.111111111111111e-01,
	-1.666666666666667e-01,    -1.666666666666667e-01,    -1.666666666666669e-01,     4.625993595943901e-17,  5.251763038925035e-17,    -7.864015431086855e-17,     1.666666666666668e-01,     1.666666666666668e-01,  1.666666666666667e-01,
	-1.666666666666667e-01,     2.989343524323885e-17,     1.666666666666668e-01,    -1.666666666666668e-01, -1.466129894633581e-18,     1.666666666666668e-01,    -1.666666666666668e-01,    -2.561771598799484e-17,  1.666666666666667e-01,
	 2.499999999999986e-01,    -3.087055673664417e-16,    -2.499999999999991e-01,    -1.189598686604080e-15,  1.394017483632152e-17,     1.214208755585016e-15,    -2.500000000000011e-01,     3.049745671257349e-16,  2.500000000000016e-01,
	 1.666666666666664e-01,     1.666666666666666e-01,     1.666666666666667e-01,    -3.333333333333334e-01, -3.333333333333330e-01,    -3.333333333333331e-01,     1.666666666666665e-01,     1.666666666666666e-01,  1.666666666666667e-01,
	 1.666666666666667e-01,    -3.333333333333335e-01,     1.666666666666664e-01,     1.666666666666669e-01, -3.333333333333332e-01,     1.666666666666667e-01,     1.666666666666668e-01,    -3.333333333333333e-01,  1.666666666666665e-01
     };

    // Reduce the first-stage candidates to one global maximum.
    for (i = 0; i < end; i+=CP_BLOCK) {
	float val = maxes[i];
	if (val > max) {
	    max = val;
	    ypos = i;
	    xpos = poses[i];
	}
    }

    // Turn the strided buffer offset back into a candidate index.
    ypos /= CP_BLOCK;

    if ((max > 0.5f)&&((fabsf(xpos - center) < limit)&&(fabsf(ypos - center) < limit))) {

	    // Limit warranties we are not at the edge
	float x_offset;
	float y_offset;

	    // Three consecutive lines of the 3x3 neighbourhood, each starting
	    // one element before the peak.
	float *vec0 = corr + img_idx * image_pitch + (xpos - 1)*row_pitch + (ypos - 1);
	float *vec1 = corr + img_idx * image_pitch + (xpos    )*row_pitch + (ypos - 1);
	float *vec2 = corr + img_idx * image_pitch + (xpos + 1)*row_pitch + (ypos - 1);

	float neighbors[9] = {
	    vec0[0], vec0[1], vec0[2],	
	    vec1[0], max    , vec1[2],
	    vec2[0], vec2[1], vec2[2]
	};

	    // A = pinv(X) * u : coefficients of the fitted quadratic surface.
	float A[6];
	for (i=0; i<6; i++) {
	    A[i] = 0;
	    for (j=0; j<9; j++) {
		A[i] += magic[i*9 + j] * neighbors[j];
	    }
	}

	    // Stationary point of the fitted surface.
	x_offset = (-A[2]*A[3]+2*A[5]*A[1]) / (A[3]*A[3]-4*A[4]*A[5]);
	y_offset = -1.f / ( A[3]*A[3]-4*A[4]*A[5])*(A[3]*A[1]-2*A[4]*A[2]);

	    // Written as !(... <= 1) rather than (... > 1) so that a NaN offset
	    // (0/0 on a degenerate neighbourhood) is rejected as well instead of
	    // slipping through the comparison and being written to the results.
	if (!(fabsf(x_offset)<=1.f)||!(fabsf(y_offset)<=1.f)) {
	    x_offset = 0;
	    y_offset = 0;
	} else {
	    x_offset = roundf(10*x_offset)/10;
	    y_offset = roundf(10*y_offset)/10;
	}
	res1[img_idx] = ((float)xpos) + x_offset + 1 - center;
	res2[img_idx] = ((float)ypos) + y_offset + 1 - center;
    } else {
	    // DS: Still fractional offsets are computed in this case, shall we ignore that?
	res1[img_idx] = 0;
	res2[img_idx] = 0;
    }
}