/*
 * /normxcorr/trunk : contents of cuda/normxcorr_hw (revision 8)
 *
 * To get this branch, use:
 *
 *     bzr branch http://suren.me/webbzr/normxcorr/trunk
 */

/*
 * In-place element-wise complex product: a[k] = a[k] * b[k], where the
 * .x / .y members are combined as the real / imaginary parts.
 *
 * Each thread processes the single element at
 *     threadIdx.x + blockIdx.x * size
 * so `size` is the distance between the first elements of consecutive
 * blocks. No bounds check is done; the launch configuration must match
 * the buffer extents.
 */
static __global__ void vecMul(cuComplex *a, cuComplex *b, int size) {
    const int idx = blockIdx.x*size + threadIdx.x;

    // Pull all four components into registers before anything is written,
    // so the update of a[idx] cannot disturb its own inputs.
    const float a_re = a[idx].x;
    const float a_im = a[idx].y;
    const float b_re = b[idx].x;
    const float b_im = b[idx].y;

    a[idx].x = a_re * b_re - a_im * b_im;
    a[idx].y = a_re * b_im + a_im * b_re;
}


/*
 * Copies 8-bit samples from `b` into the real-valued buffer `a`, reversing
 * the order along both thread and block axes.
 *
 *   destination: a[threadIdx.x + asize * blockIdx.x]
 *   source:      b[(size - 1 - threadIdx.x) + (size - 1 - blockIdx.x) * bsize]
 *
 * `asize` and `bsize` are the per-block strides of `a` and `b`; `size` is the
 * extent that is mirrored. No bounds check is performed.
 *
 * NOTE(review): the original author left the remark "Invalid memory access
 * (possibly)" next to earlier indexing variants based on bsize instead of
 * size; it is unclear from this file whether the concern also applies to the
 * current formula -- confirm against the launch configuration.
 */
static __global__ void vecPack(uint8_t *b, int bsize, cufftReal *a, int asize, int size) {
    // Mirrored source position: last block/thread reads the first sample.
    const int src = (size - blockIdx.x - 1)*bsize + (size - threadIdx.x - 1);

    // Destination keeps the natural (non-mirrored) order.
    const unsigned int dst = asize*blockIdx.x + threadIdx.x;

    a[dst] = b[src];
}

/*
 * Converts one 8-bit sample per thread to float and stores it in three places:
 *
 *   a [threadIdx.x + asize * blockIdx.x]                        <- value
 *   c [threadIdx.x + (blockIdx.x + coffset) * csize + coffset]  <- value
 *   c2[same index as c]                                         <- value * value
 *
 * The sample is read from b[threadIdx.x + blockIdx.x * bsize]. `asize`,
 * `bsize` and `csize` are the per-block strides of the respective buffers;
 * `coffset` shifts the write position in c/c2 by the same amount along both
 * the block and the thread axis. No bounds check is performed.
 */
static __global__ void vecPackBase(
    uint8_t *b, int bsize,
    cufftReal *a, int asize, 
    float *c, float *c2, int csize, int coffset) {

    // Position inside the offset (padded) c / c2 buffers.
    const int shifted = threadIdx.x + (blockIdx.x+coffset)*csize + coffset;

    const float sample = b[threadIdx.x + blockIdx.x*bsize];

    a[threadIdx.x + asize*blockIdx.x] = sample;
    c[shifted] = sample;
    c2[shifted] = sample * sample;
}


/*
 * First reduction pass: each thread walks one strided line of an 8-bit image
 * and accumulates the sum of the samples and the sum of their squares.
 *
 * Thread mapping (2D launch):
 *   x axis -> position inside a row (added directly to the image base)
 *   y axis -> image index (multiplied by image_pitch)
 *
 * The walk starts at img + image * image_pitch + column and advances by
 * row_pitch until size * row_pitch is reached. Results go to
 * buf1 / buf2 at [column * CP_BLOCK + image]. Neither reads nor writes are
 * bounds-checked.
 */
static __global__ void stat1(int32_t *buf1, int32_t *buf2, uint8_t *img, int image_pitch, int row_pitch, int size) {
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int image = blockIdx.y * blockDim.y + threadIdx.y;

    const uint8_t *line = img + image * image_pitch + column;
    const int span = size * row_pitch;

    int32_t total = 0;
    int32_t total_sq = 0;

    for (int offset = 0; offset < span; offset += row_pitch) {
        const int32_t sample = line[offset];
        total += sample;
        total_sq += sample*sample;
    }

    const int slot = column * CP_BLOCK + image;
    buf1[slot] = total;
    buf2[slot] = total_sq;
}

/*
 * Second reduction pass: one thread per image folds `size` partial results
 * (spaced CP_BLOCK apart, starting at buf1/buf2 + img_idx) into
 *
 *   res1[img_idx] = total sum, stored as float
 *   res2[img_idx] = sqrt(max(sum2 / cnt - mean^2, 0)),  cnt = size * size
 *
 * i.e. the square root of the (clamped) population variance when buf1 holds
 * partial sums and buf2 partial sums of squares.
 * Presumably these are the per-column values written by stat1 (the layout
 * column * CP_BLOCK + image matches) -- confirm against the caller.
 *
 * The sum of squares is accumulated in 64 bits: with 8-bit samples the total
 * can reach 65025 * size * size, which no longer fits a 32-bit int once
 * size >= 182. For inputs that did not overflow the result is unchanged.
 *
 * No bounds check is performed on img_idx.
 */
static __global__ void stat2(float *res1, float *res2, int32_t *buf1, int32_t *buf2, int size) {
    int i;
    int end = size * CP_BLOCK;
    int img_idx =  blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    long long sum2 = 0;		// 64-bit: see overflow note above

    int32_t *vec1 = buf1 + img_idx;
    int32_t *vec2 = buf2 + img_idx;

    for (i = 0; i < end; i+=CP_BLOCK) {
	sum += vec1[i];
	sum2 += vec2[i];
    }

    res1[img_idx] = sum;

    float cnt = size * size;
    float mean = ((float)sum) / cnt;

	// Clamp at zero: rounding can push the difference slightly negative
    res2[img_idx] = sqrtf(fmaxf(((float)sum2) / cnt - mean*mean, 0.0f));
}

/*
 * Per-element normalization:
 *
 *   res[k] = (corr[k] * corr_scale - lsum[k] * lsum_scale)
 *            / (denom[k] * denom_scale)
 *
 * with k = threadIdx.x + blockIdx.x * size,
 *      lsum_scale  = *lsum_scale_ptr * lsum_mult,
 *      denom_scale = *denom_scale_ptr.
 *
 * The two scale factors are read through pointers inside the kernel.
 * When either denom[k] or denom_scale is zero, res[k] is left untouched
 * (the kernel does not write a default value). No bounds check is performed.
 */
static __global__ void vecCompute(
    float *res,
    cufftReal *corr, float corr_scale, 
    float *lsum, float *lsum_scale_ptr, float lsum_mult,
    float *denom, float *denom_scale_ptr,
    int size
) {
    const int idx = blockIdx.x*size + threadIdx.x;

    const float denom_scale = *denom_scale_ptr;
    const float lsum_scale = (*lsum_scale_ptr) * lsum_mult;
    const float divisor = denom[idx];

    // Skip the element instead of dividing by zero.
    if (divisor == 0.0f || denom_scale == 0.0f) {
        return;
    }

    const float numerator = corr[idx] * corr_scale - lsum[idx]*lsum_scale;
    res[idx] = numerator / (divisor * denom_scale);
}

/*
 * First pass of the peak search: each thread scans one strided line of a
 * correlation map and records the largest value above 0.5 together with the
 * step index at which it occurred.
 *
 * Thread mapping (2D launch):
 *   x axis -> position inside a row (added directly to the map base)
 *   y axis -> image index (multiplied by image_pitch)
 *
 * Output, at [column * CP_BLOCK + image]:
 *   buf1 <- maximum found (stays 0.5 if nothing exceeded it)
 *   buf2 <- offset of the maximum divided by row_pitch (0 if none)
 *
 * Only the stores are guarded by column < size; the scan itself is executed
 * by every thread, so the map must be readable for all launched columns.
 * The original author's remark "align to remove if" suggests the guard could
 * be dropped with suitably padded buffers -- not verifiable from this file.
 */
static __global__ void find_max1(float *buf1, int32_t *buf2, float *corr, int image_pitch, int row_pitch, int size) {
    const int column = blockIdx.x * blockDim.x + threadIdx.x;
    const int image = blockIdx.y * blockDim.y + threadIdx.y;

    const float *line = corr + image * image_pitch + column;
    const int span = size * row_pitch;

    float best = 0.5f;		// This is limit for acceptance in cpcorr
    int32_t best_offset = 0;

    for (int offset = 0; offset < span; offset += row_pitch) {
        const float candidate = line[offset];
        if (candidate > best) {
            best = candidate;
            best_offset = offset;
        }
    }

    if (column >= size) {
        return;
    }

    const int slot = column * CP_BLOCK + image;
    buf1[slot] = best;
    buf2[slot] = best_offset / row_pitch;
}

/*
 * Second pass of the peak search, one thread per image.
 *
 * 1. Scans `size` candidates (spaced CP_BLOCK apart, starting at
 *    buf1/buf2 + image) for the largest value above 0.5. The candidate's
 *    slot number becomes ypos and the stored buf2 entry becomes xpos.
 * 2. If a peak was found and both |xpos - center| and |ypos - center| are
 *    below `limit`, refines the position with a quadratic surface fitted to
 *    the 3x3 neighbourhood in `corr` and writes
 *        res1 = xpos + x_offset + 1 - center
 *        res2 = ypos + y_offset + 1 - center
 *    Otherwise both results are set to 0.
 *
 * Neighbourhood addressing: xpos is multiplied by row_pitch, ypos is added
 * directly, image index is multiplied by image_pitch.
 *
 * No bounds check is performed on the image index.
 */
static __global__ void find_max2(
    float *res1, float *res2, float *buf1, int32_t *buf2, 
    float *corr, int image_pitch, int row_pitch,
    int size, float center, float limit
) {
    const int image = blockIdx.x * blockDim.x + threadIdx.x;

    const float *candidates = buf1 + image;
    const int32_t *positions = buf2 + image;
    const int span = size * CP_BLOCK;

    float peak = 0.5f;		// This is limit for acceptance in cpcorr
    int32_t xpos = 0;
    int32_t ypos = 0;

    for (int k = 0; k < span; k += CP_BLOCK) {
        const float candidate = candidates[k];
        if (candidate > peak) {
            peak = candidate;
            ypos = k;
            xpos = positions[k];
        }
    }

    // Convert the strided offset back into a slot number.
    ypos /= CP_BLOCK;

    const bool accepted =
        (peak > 0.5f) &&
        (fabsf(xpos - center) < limit) &&
        (fabsf(ypos - center) < limit);

    if (!accepted) {
        // DS: Still fractional offsets are computed in this case, shall we ignore that?
        res1[image] = 0;
        res2[image] = 0;
        return;
    }

    /*
     * Coefficient table for the 9-point position fit (see findpeak.m).
     * It is the Moore-Penrose pseudoinverse (pinv) of the matrix "X":
     *     x = [-1 -1 -1  0  0  0  1  1  1]';
     *     y = [-1  0  1 -1  0  1 -1  0  1]';
     *     X = [ones(9,1),  x,  y,  x.*y,  x.^2,  y.^2];
     * and is used to evaluate
     *     A = X\u
     * through
     *     A = pinv(X)*u
     * Stored row-major: 6 rows (one per fitted coefficient) of 9 weights.
     */
    float pinvX[54] = {
        -1.111111111111111e-01,  2.222222222222223e-01, -1.111111111111110e-01,
         2.222222222222222e-01,  5.555555555555555e-01,  2.222222222222222e-01,
        -1.111111111111112e-01,  2.222222222222222e-01, -1.111111111111111e-01,

        -1.666666666666667e-01, -1.666666666666667e-01, -1.666666666666669e-01,
         4.625993595943901e-17,  5.251763038925035e-17, -7.864015431086855e-17,
         1.666666666666668e-01,  1.666666666666668e-01,  1.666666666666667e-01,

        -1.666666666666667e-01,  2.989343524323885e-17,  1.666666666666668e-01,
        -1.666666666666668e-01, -1.466129894633581e-18,  1.666666666666668e-01,
        -1.666666666666668e-01, -2.561771598799484e-17,  1.666666666666667e-01,

         2.499999999999986e-01, -3.087055673664417e-16, -2.499999999999991e-01,
        -1.189598686604080e-15,  1.394017483632152e-17,  1.214208755585016e-15,
        -2.500000000000011e-01,  3.049745671257349e-16,  2.500000000000016e-01,

         1.666666666666664e-01,  1.666666666666666e-01,  1.666666666666667e-01,
        -3.333333333333334e-01, -3.333333333333330e-01, -3.333333333333331e-01,
         1.666666666666665e-01,  1.666666666666666e-01,  1.666666666666667e-01,

         1.666666666666667e-01, -3.333333333333335e-01,  1.666666666666664e-01,
         1.666666666666669e-01, -3.333333333333332e-01,  1.666666666666667e-01,
         1.666666666666668e-01, -3.333333333333333e-01,  1.666666666666665e-01
    };

    // Three consecutive lines around the peak, each starting one element
    // before it. Per the original author's note, the limit test above is
    // what keeps this window away from the edge of the map.
    const float *origin = corr + image * image_pitch;
    const float *prev = origin + (xpos - 1)*row_pitch + (ypos - 1);
    const float *curr = origin + (xpos    )*row_pitch + (ypos - 1);
    const float *next = origin + (xpos + 1)*row_pitch + (ypos - 1);

    // The centre sample is taken from the reduction result, not re-read.
    const float window[9] = {
        prev[0], prev[1], prev[2],
        curr[0], peak   , curr[2],
        next[0], next[1], next[2]
    };

    // A = pinv(X) * window, accumulated left to right within each row.
    float A[6];
    for (int row = 0; row < 6; ++row) {
        float acc = 0;
        for (int col = 0; col < 9; ++col) {
            acc += pinvX[row*9 + col] * window[col];
        }
        A[row] = acc;
    }

    // Stationary point of the fitted quadratic surface.
    const float det = A[3]*A[3]-4*A[4]*A[5];
    float x_offset = (-A[2]*A[3]+2*A[5]*A[1]) / det;
    float y_offset = -1.f / det*(A[3]*A[1]-2*A[4]*A[2]);

    // NOTE(review): a NaN offset (e.g. 0/0 on a flat neighbourhood) fails
    // both comparisons and therefore takes the rounding branch, ending up
    // in the results -- confirm this matches the reference behaviour.
    if ((fabsf(x_offset)>1.f)||(fabsf(y_offset)>1.f)) {
        // Fit landed outside the 3x3 window: keep the integer position.
        x_offset = 0;
        y_offset = 0;
    } else {
        // Keep one decimal digit.
        x_offset = roundf(10*x_offset)/10;
        y_offset = roundf(10*y_offset)/10;
    }

    res1[image] = ((float)xpos) + x_offset + 1 - center;
    res2[image] = ((float)ypos) + y_offset + 1 - center;
}