/normxcorr/trunk : revision 7

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw_kernel.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-06 12:27:00 UTC
Revision ID: csa@dside.dyndns.org-20091206122700-88vy44g7b0bn1fi3

FindPeak optimization

files added:
dic_cpcorr3.m

files modified:
cuda/INFO

cuda/normxcorr_hw.cu

cuda/normxcorr_hw.h

cuda/normxcorr_hw_kernel.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw_kernel.cu

#define DOUBLE(x) ((double)(x))

static __global__ void vecMul(cuComplex *a, cuComplex *b, int size) {

float tmp;

}

static __global__ void stat1(int *buf1, int *buf2, uint8_t *img, int image_pitch, int row_pitch, int size) {

static __global__ void stat1(int32_t *buf1, int32_t *buf2, uint8_t *img, int image_pitch, int row_pitch, int size) {

int i;

int end = size * row_pitch;

int side_idx = blockIdx.x * blockDim.x + threadIdx.x;

int img_idx = blockIdx.y * blockDim.y + threadIdx.y;

int sum = 0;

int sum2 = 0;

int32_t sum = 0;

int32_t sum2 = 0;

uint8_t *vec = img + img_idx * image_pitch + side_idx;

for (i = 0; i < end; i+=row_pitch) {

int val = vec[i];

int32_t val = vec[i];

sum += val;

sum2 += val*val;

}

buf2[side_idx * CP_BLOCK + img_idx] = sum2;

}

static __global__ void stat2(float *res1, float *res2, int *buf1, int *buf2, int size) {

static __global__ void stat2(float *res1, float *res2, int32_t *buf1, int32_t *buf2, int size) {

int i;

int end = size * CP_BLOCK;

int img_idx = blockIdx.x * blockDim.x + threadIdx.x;

int sum = 0;

int sum2 = 0;

int *vec1 = buf1 + img_idx;

int *vec2 = buf2 + img_idx;

int32_t *vec1 = buf1 + img_idx;

int32_t *vec2 = buf2 + img_idx;

for (i = 0; i < end; i+=CP_BLOCK) {

sum += vec1[i];

res2[img_idx] = sqrtf(fmaxf(((float)sum2) / cnt - mean*mean,0));

}

static __global__ void vecCompute(

uint16_t *items, float *res,

cufftReal *corr, float corr_scale,

float *lsum, float lsum_scale,

float *denom, float denom_scale

) {

int pos = items[threadIdx.x + blockIdx.x*BLOCK_SIZE_1D];//correct (row/column)?

res[pos] = (corr[pos] * corr_scale - lsum[pos]*lsum_scale) / (denom[pos] * denom_scale);

100

}

101

102

103

104

static __global__ void vecCompute(

105

float *res,

106

cufftReal *corr, float corr_scale,

107

float *lsum, float lsum_scale,

108

float *denom, float denom_scale,

109

int size

110

) {

111

int pos = threadIdx.x + blockIdx.x*size;

112

113

if (denom[pos]) {

114

res[pos] = (corr[pos] * corr_scale - lsum[pos]*lsum_scale) / (denom[pos] * denom_scale);

115

}

116

}

117

118

119

static __global__ void vecCompute(

120

float *res,

121

cufftReal *corr, float corr_scale,

132

res[pos] = (corr[pos] * corr_scale - lsum[pos]*lsum_scale) / (denom[pos] * denom_scale);

133

}

134

100

}

101

102

static __global__ void find_max1(float *buf1, int32_t *buf2, float *corr, int image_pitch, int row_pitch, int size) {

103

int i;

104

int end = size * row_pitch;

105

106

int side_idx = blockIdx.x * blockDim.x + threadIdx.x;

107

int img_idx = blockIdx.y * blockDim.y + threadIdx.y;

108

109

float max = 0.5; // This is limit for acceptance in cpcorr

110

int32_t pos = 0;

111

112

float *vec = corr + img_idx * image_pitch + side_idx;

113

114

for (i = 0; i < end; i+=row_pitch) {

115

float val = vec[i];

116

if (val > max) {

117

max = val;

118

pos = i;

119

}

120

}

121

122

// align to remove if

123

if (side_idx < size) {

124

buf1[side_idx * CP_BLOCK + img_idx] = max;

125

buf2[side_idx * CP_BLOCK + img_idx] = pos / row_pitch;

126

}

127

}

128

129

static __global__ void find_max2(

130

float *res1, float *res2, float *buf1, int32_t *buf2,

131

float *corr, int image_pitch, int row_pitch,

132

int size, float center, float limit

133

) {

134

int i, j;

135

int end = size * CP_BLOCK;

136

int img_idx = blockIdx.x * blockDim.x + threadIdx.x;

137

138

float max = 0.5; // This is limit for acceptance in cpcorr

139

int32_t xpos = 0;

140

int32_t ypos = 0;

141

142

float *maxes = buf1 + img_idx;

143

int32_t *poses = buf2 + img_idx;

144

145

146

This is a magic number which are used to reimplement position fitting

147

using 9 neighbouring points (see findpeak.m). Thats array is a

148

Moore-Penrose pseudoinverse (pinv) of matrix "X":

149

x = [-1 -1 -1 0 0 0 1 1 1]';

150

y = [-1 0 1 -1 0 1 -1 0 1]';

151

X = [ones(9,1), x, y, x.*y, x.^2, y.^2];

152

This matrix is, then, used to compute

153

A = X\u

154

by formula

155

A = pinv(X)*u

156

157

158

float magic[54] = {

159

-1.111111111111111e-01, 2.222222222222223e-01, -1.111111111111110e-01, 2.222222222222222e-01, 5.555555555555555e-01, 2.222222222222222e-01, -1.111111111111112e-01, 2.222222222222222e-01, -1.111111111111111e-01,

160

-1.666666666666667e-01, -1.666666666666667e-01, -1.666666666666669e-01, 4.625993595943901e-17, 5.251763038925035e-17, -7.864015431086855e-17, 1.666666666666668e-01, 1.666666666666668e-01, 1.666666666666667e-01,

161

-1.666666666666667e-01, 2.989343524323885e-17, 1.666666666666668e-01, -1.666666666666668e-01, -1.466129894633581e-18, 1.666666666666668e-01, -1.666666666666668e-01, -2.561771598799484e-17, 1.666666666666667e-01,

162

2.499999999999986e-01, -3.087055673664417e-16, -2.499999999999991e-01, -1.189598686604080e-15, 1.394017483632152e-17, 1.214208755585016e-15, -2.500000000000011e-01, 3.049745671257349e-16, 2.500000000000016e-01,

163

1.666666666666664e-01, 1.666666666666666e-01, 1.666666666666667e-01, -3.333333333333334e-01, -3.333333333333330e-01, -3.333333333333331e-01, 1.666666666666665e-01, 1.666666666666666e-01, 1.666666666666667e-01,

164

1.666666666666667e-01, -3.333333333333335e-01, 1.666666666666664e-01, 1.666666666666669e-01, -3.333333333333332e-01, 1.666666666666667e-01, 1.666666666666668e-01, -3.333333333333333e-01, 1.666666666666665e-01

165

};

166

167

168

for (i = 0; i < end; i+=CP_BLOCK) {

169

float val = maxes[i];

170

if (val > max) {

171

max = val;

172

ypos = i;

173

xpos = poses[i];

174

}

175

}

176

177

ypos /= CP_BLOCK;

178

179

if ((max > 0.5f)&&((fabsf(xpos - center) < limit)&&(fabsf(ypos - center) < limit))) {

180

181

// Limit warranties we are not at the edge

182

float x_offset;

183

float y_offset;

184

185

float *vec0 = corr + img_idx * image_pitch + (xpos - 1)*row_pitch + (ypos - 1);

186

float *vec1 = corr + img_idx * image_pitch + (xpos )*row_pitch + (ypos - 1);

187

float *vec2 = corr + img_idx * image_pitch + (xpos + 1)*row_pitch + (ypos - 1);

188

189

float neighbors[9] = {

190

vec0[0], vec0[1], vec0[2],

191

vec1[0], max , vec1[2],

192

vec2[0], vec2[1], vec2[2]

193

};

194

195

float A[6];

196

for (i=0; i<6; i++) {

197

A[i] = 0;

198

for (j=0; j<9; j++) {

199

A[i] += magic[i*9 + j] * neighbors[j];

200

}

201

}

202

203

x_offset = (-A[2]*A[3]+2*A[5]*A[1]) / (A[3]*A[3]-4*A[4]*A[5]);

204

y_offset = -1.f / ( A[3]*A[3]-4*A[4]*A[5])*(A[3]*A[1]-2*A[4]*A[2]);

205

206

if ((fabsf(x_offset)>1.f)||(fabsf(y_offset)>1.f)) {

207

x_offset = 0;

208

y_offset = 0;

209

} else {

210

x_offset = roundf(10*x_offset)/10;

211

y_offset = roundf(10*y_offset)/10;

212

}

213

res1[img_idx] = ((float)xpos) + x_offset + 1 - center;

214

res2[img_idx] = ((float)ypos) + y_offset + 1 - center;

215

} else {

216

// DS: Still fractional offsets are computed in this case, shall we ignore that?

217

res1[img_idx] = 0;

218

res2[img_idx] = 0;

219

}

220

}

Older »