/normxcorr/trunk : revision 22

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/dict_image.cpp

Committer: Suren A. Chilingaryan
Date: 2009-12-15 19:23:39 UTC
Revision ID: csa@dside.dyndns.org-20091215192339-lzuciep3c2u99uc0

Optimize image reduction

files modified:
dict_hw/src/dict_image.cpp

dict_hw/src/normxcorr_hw.h

Show diffs side-by-side

added added

removed removed

dict_hw/src/dict_image.cpp

if (ps->image) free(ps->image);

}

#ifdef USE_SSE

// needs an extra 32 bytes at the end of the array, this is do20

/*
 * Reduce 4-byte pixels in `src` to one byte per pixel in `dst` using SSSE3.
 *
 * For every pixel the first three bytes are summed and the sum is multiplied
 * by 0x2AAA with pmulhrsw (a rounded Q15 multiply, i.e. roughly a divide by 3);
 * the fourth byte of each pixel is ignored.
 *
 * dst  - output buffer, one byte per pixel; written with movntps, which
 *        requires 16-byte aligned addresses.
 * src  - input buffer, four bytes per pixel; read with movdqa, which
 *        requires 16-byte aligned addresses.
 * size - number of pixels; the loop actually processes
 *        calc_alloc(size, 32) pixels (presumably `size` rounded up to a
 *        multiple of 32 - TODO confirm against calc_alloc).
 *
 * Each loop iteration consumes 128 source bytes (32 pixels) and produces 32
 * output bytes. The loop also preloads the first 32 bytes of the *next*
 * iteration, so the final iteration reads 32 bytes past
 * src + 4 * aligned_size; the source buffer must be padded accordingly.
 *
 * Uses xmm8-xmm15, so the code is x86-64 only.
 *
 * NOTE(review): as shown here the operand list follows a single ':' and no
 * clobber list (xmm registers, memory, flags) is visible - confirm against
 * the real file.
 */
static inline void reduct_ssse3(unsigned char *dst, unsigned char *src, int size) {

int i;

// NOTE(review): j is not referenced anywhere in the code shown.
volatile int j; // to prevent reduction of the loop

// NOTE(review): end is not referenced anywhere in the code shown.
unsigned char *end;

// 64-byte constant table: three 16-byte pshufb masks followed by eight
// 16-bit multipliers. ALGNW/ALGNL are presumably alignment attributes
// (the table is loaded with movdqa) - TODO confirm.
ALGNW char shuf[64] ALGNL;

char *shuf1 = shuf;

char *shuf2 = shuf+16;

char *shuf3 = shuf+32;

uint16_t *mul = (uint16_t*)(shuf+48);

// A mask byte with the high bit set makes pshufb write zero, so every
// lane not assigned below is cleared.
memset(shuf, 0xFF, 48);

// shuf1: for each of the 4 pixels in a register, place byte 0 and byte 1
// into the low bytes of two consecutive 16-bit words (zero-extended).
for (i=0;i<16;i+=4) {

shuf1[i] = i;

shuf1[i+2] = i + 1;

}

// shuf2: byte 2 of each of the 4 pixels goes into words 0..3.
// shuf3: byte 2 of each of the 4 pixels goes into words 4..7.
for (i=0;i<8;i+=2) {

shuf2[i] = 2 * i + 2;

shuf3[i + 8] = 2 * i + 2;

}

// pmulhrsw computes (a * b + 0x4000) >> 15; with b = 0x2AAA this is
// approximately a / 3.
for (i=0;i<8;i++) {

mul[i] = 0x2AAA;

}

long aligned_size = calc_alloc(size, 32);

// The loop index runs from -aligned_size up to 0 so that a single add
// both advances the index and sets the flags for the loop branch.
long val = -aligned_size;

// End pointers: all memory operands are addressed as end + negative index.
unsigned char *src2 = src + 4 * aligned_size;

unsigned char *dst2 = dst + aligned_size;

// Operands: %0 = dst2, %1 = src2, %2 = val (index), %3 = &shuf.
__asm__ __volatile__ (

"prefetchnta (%1) \n\t"

// %2 is modified by the loop; it is saved here and restored by the pop
// at the end.
// NOTE(review): push/pop moves the stack pointer without the compiler's
// knowledge - confirm this is safe with the ABI red zone.
"push %2 \n\t"

// xmm15 = shuf1, xmm14 = shuf2, xmm13 = shuf3, xmm12 = multipliers.
"movdqa (%3), %%xmm15 \n\t"

"movdqa 16(%3), %%xmm14 \n\t"

"movdqa 32(%3), %%xmm13 \n\t"

"movdqa 48(%3), %%xmm12 \n\t"

// Preload the first 8 pixels; later iterations get them from the
// preload at the bottom of the loop.
"movdqa (%1,%2,4), %%xmm0 \n\t"

"movdqa 16(%1,%2,4), %%xmm1 \n\t"

"1: \n\t"

// NOTE(review): %1 is the fixed end pointer src2, so this prefetch
// address does not advance with the loop - confirm intent.
"prefetchnta 64(%1) \n\t"

// Pixels 0..7 (xmm0, xmm1): keep copies in xmm5/xmm6 for byte 2,
// spread byte 0 / byte 1 into words in xmm0/xmm1.
"movdqa %%xmm0, %%xmm5 \n\t"

"pshufb %%xmm15, %%xmm0 \n\t"

"movdqa %%xmm1, %%xmm6 \n\t"

"pshufb %%xmm15, %%xmm1 \n\t"

// Loads for pixels 8..15 are issued early, interleaved with the
// arithmetic on pixels 0..7.
"movdqa 32(%1,%2,4), %%xmm2 \n\t"

"movdqa 48(%1,%2,4), %%xmm3 \n\t"

// xmm5 = byte 2 of pixels 0..7 as eight words.
"pshufb %%xmm14, %%xmm5 \n\t"

"pshufb %%xmm13, %%xmm6 \n\t"

"por %%xmm6, %%xmm5 \n\t"

// phaddw adds adjacent words: xmm0 = byte0 + byte1 for pixels 0..7.
"phaddw %%xmm1, %%xmm0 \n\t"

// xmm0 = byte0 + byte1 + byte2, then scaled by ~1/3.
"paddw %%xmm5, %%xmm0 \n\t"

"pmulhrsw %%xmm12, %%xmm0 \n\t"

//------
// Pixels 8..15 (xmm2, xmm3): same sequence, result in xmm2.

"movdqa %%xmm2, %%xmm5 \n\t"

"pshufb %%xmm15, %%xmm2 \n\t"

"movdqa %%xmm3, %%xmm6 \n\t"

"pshufb %%xmm15, %%xmm3 \n\t"

"movdqa 64(%1,%2,4), %%xmm7 \n\t"

100

"movdqa 80(%1,%2,4), %%xmm8 \n\t"

101

102

"pshufb %%xmm14, %%xmm5 \n\t"

103

"pshufb %%xmm13, %%xmm6 \n\t"

104

"por %%xmm6, %%xmm5 \n\t"

105

106

"phaddw %%xmm3, %%xmm2 \n\t"

107

"paddw %%xmm5, %%xmm2 \n\t"

108

"pmulhrsw %%xmm12, %%xmm2 \n\t"

109

110

//-----
// Pack the 16 word results to unsigned-saturated bytes and store them
// with a non-temporal store at dst2 + index.

111

"packuswb %%xmm2, %%xmm0 \n\t"

112

"movntps %%xmm0, (%0,%2) \n\t"

113

114

// NOTE(review): same fixed-address prefetch as above (relative to src2).
"prefetchnta 128(%1) \n\t"

115

116

// Pixels 16..23 (xmm7, xmm8): same sequence, result in xmm7.
"movdqa %%xmm7, %%xmm5 \n\t"

117

"pshufb %%xmm15, %%xmm7 \n\t"

118

"movdqa %%xmm8, %%xmm6 \n\t"

119

"pshufb %%xmm15, %%xmm8 \n\t"

120

121

"movdqa 96(%1,%2,4), %%xmm9 \n\t"

122

"movdqa 112(%1,%2,4), %%xmm10 \n\t"

123

124

"pshufb %%xmm14, %%xmm5 \n\t"

125

"pshufb %%xmm13, %%xmm6 \n\t"

126

"por %%xmm6, %%xmm5 \n\t"

127

128

"phaddw %%xmm8, %%xmm7 \n\t"

129

"paddw %%xmm5, %%xmm7 \n\t"

130

"pmulhrsw %%xmm12, %%xmm7 \n\t"

131

//------
// Pixels 24..31 (xmm9, xmm10): same sequence, result in xmm9.

132

133

"movdqa %%xmm9, %%xmm5 \n\t"

134

"pshufb %%xmm15, %%xmm9 \n\t"

135

"movdqa %%xmm10, %%xmm6 \n\t"

136

"pshufb %%xmm15, %%xmm10 \n\t"

137

138

"pshufb %%xmm14, %%xmm5 \n\t"

139

"pshufb %%xmm13, %%xmm6 \n\t"

140

"por %%xmm6, %%xmm5 \n\t"

141

142

// Preload pixels 0..7 of the next iteration. On the last iteration
// this reads the 32 bytes starting at src2, i.e. just past the
// processed data - hence the padding requirement.
"movdqa 128(%1,%2,4), %%xmm0 \n\t"

143

"movdqa 144(%1,%2,4), %%xmm1 \n\t"

144

145

"phaddw %%xmm10, %%xmm9 \n\t"

146

"paddw %%xmm5, %%xmm9 \n\t"

147

"pmulhrsw %%xmm12, %%xmm9 \n\t"

148

//-----
// Pack and store output bytes 16..31 of this iteration.

149

150

"packuswb %%xmm9, %%xmm7 \n\t"

151

"movntps %%xmm7, 16(%0,%2) \n\t"

152

153

// Advance by 32 output pixels; the loop ends when the index reaches 0.
"add $32, %2 \n\t"

154

"jnz 1b \n\t"

155

"pop %2 \n\t"

156

157

158

: "r" (dst2), "r" (src2), "r" (val), "r" (&shuf)

159

);

160

}

161

#endif /*USE_SSE */

162

163

164

int dictReduceImage(DICTContext ps, void *img, int width, int height) {

165

int size = width * height;

166

unsigned char *res = ps->image;

173

#ifdef DICT_TIFF_SUPPORT

174

case DICT_IMAGE_TIFF:

175

if (ps->matlab_mode) {

176

177

for (int i = 0; i < size; ++i) {

178

int col = i % height;

179

int lin = i / height;

int pos = 4*(col * width + lin);

res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);

180

res[i] = rintf(((float)(image[pos] + image[pos+1] + image[pos+2]))/3);

181

}

182

183

184

for (int col = 0; col < height; col++) {

185

for (int lin = 0; lin < width; lin++) {

186

int i = lin*height + col;

187

int pos = 4*(col * width + lin);

188

unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);

189

res[i] = (90 + sum + (sum>>8)) >> 8;

190

}

191

}

192

193

194

// We do this to benefit from CPU caches

195

int aligned_width = (width / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;

196

int aligned_height = (height / TRANSPOSE_SIZE) * TRANSPOSE_SIZE;

197

198

for (int col = 0; col < aligned_height; col+=TRANSPOSE_SIZE) {

199

for (int lin = 0; lin < aligned_width; lin+=TRANSPOSE_SIZE) {

200

201

for (int col2 = 0; col2 < TRANSPOSE_SIZE; col2++) {

202

for (int lin2 = 0; lin2 < TRANSPOSE_SIZE; lin2++) {

203

int i = (lin+lin2)*height + (col+col2);

204

int pos = 4*((col+col2) * width + (lin+lin2));

205

206

unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);

207

res[i] = (90 + sum + (sum>>8)) >> 8;

208

}

209

}

210

}

211

}

212

213

for (int col = aligned_height; col < height; col++) {

214

for (int lin = aligned_width; lin < width; lin++) {

215

int i = lin*height + col;

216

int pos = 4*(col * width + lin);

217

unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);

218

res[i] = (90 + sum + (sum>>8)) >> 8;

219

}

220

}

221

} else {

222

#ifdef USE_SSE

223

reduct_ssse3(res, image, size);

224

#else /* USE_SSE */

225

for (int i = 0; i < size; ++i) {

226

int pos = 4 * i;

res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);

227

unsigned short sum = 85 * (image[pos] + image[pos+1] + image[pos+2]);

228

res[i] = (90 + sum + (sum>>8)) >> 8;

229

// res[i] = rintf(((double)(image[pos] + image[pos+1] + image[pos+2]))/3);

230

}

231

#endif /* USE_SSE */

232

}

233

break;

234

#endif /* DICT_TIFF_SUPPORT */

260

}

261

262

ps->image_type = DICT_IMAGE_TIFF;

ps->image_buf = _TIFFmalloc(w * h * sizeof (uint32_t));

ps->image = (unsigned char*)malloc(w * h * sizeof(unsigned char));

263

// we need an extra 32 bytes at the end for do20 sse code

264

int alloc_size = 32 + calc_alloc(w * h, 32);

265

ps->image_buf = _TIFFmalloc(alloc_size * sizeof (uint32_t));

266

ps->image = (unsigned char*)malloc(alloc_size * sizeof(unsigned char));

267

268

ps->width = w;

269

ps->height = h;

Older »