/normxcorr/trunk : revision 19

To get this branch, use:

bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/dict_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-12 01:38:41 UTC
Revision ID: csa@dside.dyndns.org-20091212013841-feih3qa4i28x75j4

Provide stand-alone library

files added:
dict_hw/CMakeLists.txt

dict_hw/cmake

dict_hw/cmake/CheckPythonModule.cmake

dict_hw/cmake/FindCUDA.cmake

dict_hw/cmake/FindCUDPP.cmake

dict_hw/cmake/FindFFTW3.cmake

dict_hw/cmake/FindGLIB2.cmake

dict_hw/cmake/FindMKL.cmake

dict_hw/cmake/MakePythonArray.cmake

dict_hw/cmake/ParseArguments.cmake

dict_hw/cmake/make2cmake.cmake

dict_hw/cmake/parse_cubin.cmake

dict_hw/cmake/run_nvcc.cmake

dict_hw/matlab

dict_hw/matlab/normxcorr_hw.cu

dict_hw/matlab/normxcorr_hw_msg.h

dict_hw/src

dict_hw/src/CMakeLists.txt

dict_hw/src/dict_hw.cu

dict_hw/src/dict_hw.h

dict_hw/src/helpers.h

dict_hw/src/validate.cpp

files removed:
cuda/local_sum.h

files renamed:
cuda/ => dict_hw/

cuda/INFO => dict_hw/README

cuda/Makefile => dict_hw/matlab/Makefile

cuda/nvmex => dict_hw/matlab/nvmex

cuda/nvopts.sh => dict_hw/matlab/nvopts.sh

cuda/local_sum.cu => dict_hw/src/local_sum.cu.h

cuda/local_sum_kernel.cu => dict_hw/src/local_sum_kernel.cu.h

cuda/normxcorr_hw.cu => dict_hw/src/normxcorr_hw.cu.h

cuda/normxcorr_hw.h => dict_hw/src/normxcorr_hw.h

cuda/normxcorr_hw_kernel.cu => dict_hw/src/normxcorr_hw_kernel.cu.h

cuda/normxcorr_hw_msg.h => dict_hw/src/normxcorr_hw_msg.h

files modified:
.bzrignore

automate_image.m

Show diffs side-by-side

added added

removed removed

dict_hw/src/dict_hw.cu

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

#include "dict_hw.h"

#include "helpers.h"

#include "local_sum.cu.h"

#include "normxcorr_hw.h"

#include "normxcorr_hw.cu.h"

// Capacity of the `devices` table below.
#define MAX_DEVICES 16

// Number of valid entries in `devices`; 0 until dictDetectHardware() has
// recorded at least one device.
static int device_number = 0;

// CUDA device indices accepted by dictDetectHardware().
static int devices[MAX_DEVICES];

int dictDetectHardware() {

int deviceCount;

cudaDeviceProp deviceProp;

cudaGetDeviceCount(&deviceCount);

if (!deviceCount) return -1;

for (int i = 0; i < deviceCount; i++) {

cudaGetDeviceProperties(&deviceProp, i);

if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {

devices[device_number++] = i;

}

return device_number;

}

// Creates a processing context.
//
// If no device has been recorded yet, the hardware is probed first; NULL is
// returned when that probe reports an error or finds no usable device.
// Otherwise the result of pstateInit() is handed back to the caller.
DICTContext dictCreateContext() {
    const bool need_probe = (device_number == 0);

    if (need_probe && (dictDetectHardware() <= 0)) {
        return NULL;
    }

    return pstateInit();
}

// Releases a context obtained from dictCreateContext().
//
// dictCreateContext() may return NULL, so a NULL context is accepted here and
// treated as a no-op instead of being forwarded to pstateFree().
void dictDestroyContext(DICTContext ctx) {
    if (!ctx) return;

    pstateFree(ctx);
}

// Installs the logging callbacks.
//
// error_reporter is stored in `reportError` and message_writer in
// `reportMessage`. Always returns 0.
int dictSetLogger(DICTLogger error_reporter, DICTLogger message_writer) {
    reportError = error_reporter;
    reportMessage = message_writer;

    // The function is declared to return int; flowing off the end without a
    // return statement is undefined behaviour.
    return 0;
}

// Configures the context for `ncp` control points and a correlation size of
// `corr_size`, derives every dependent buffer size, and re-initialises the
// FFT state.
//
// fftFree() is called before any field changes and fftInit() after all of
// them are set; the value returned is whatever fftInit() returns.
int dictSetup(DICTContext ps, int ncp, int corr_size, int precision, DICTFlags flags) {
    fftFree(ps);

    // Caller-supplied parameters.
    ps->ncp = ncp;
    ps->corr_size = corr_size;
    ps->precision = precision;

    // Window sizes, all of the form k * corr_size + 1.
    ps->lsum_size = 2 * ps->corr_size + 1;
    ps->subimage_size = 4 * ps->corr_size + 1;
    ps->fft_size = 6 * corr_size + 1;

    // Unless a fixed FFT size is requested, use next_power(fft_size).
    ps->fft_real_size = (flags&DICT_FLAGS_FIXED_FFT_SIZE)
        ? ps->fft_size
        : next_power(ps->fft_size);

    // Allocation sizes derived via calc_alloc() with the matching block size.
    ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);
    ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);
    ps->fft_alloc_size = calc_alloc(ps->fft_real_size * ps->fft_real_size, BLOCK_SIZE_1D);

    // Local-sum buffers.
    ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;
    ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);
    ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);
    ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);

    // Block counts, stored after conversion with get_power().
    const int n_side_blocks = calc_blocks(2 * ps->corr_size + 1, SIDE_BLOCK_SIZE);
    ps->side_blocks_power = get_power(n_side_blocks);

    const int n_base_blocks = calc_blocks(4 * ps->corr_size + 1, BLOCK_SIZE_1D);
    ps->base_blocks_power = get_power(n_base_blocks);

    return fftInit(ps);
}

// Copies the template control-point coordinates into the context.
//
// ps->ncp floats are read from each input array. The x values go to the start
// of ps->points and the y values to offset ps->ncp_alloc_size.
// Always returns 0.
int dictSetTemplatePoints(DICTContext ps, const float *points_x, const float *points_y) {
    const size_t coord_bytes = ps->ncp * sizeof(float);

    memcpy(ps->points, points_x, coord_bytes);
    memcpy(ps->points + ps->ncp_alloc_size, points_y, coord_bytes);

    return 0;
}

100

101

// Stores the image dimensions in the context. Always returns 0.
int dictSetDimensions(DICTContext ps, int width, int height) {
    ps->width = width;
    ps->height = height;

    return 0;
}

107

108

// Registers caller-owned result buffers in the context.
//
// Only the pointers are stored (in ps->res_x / ps->res_y); nothing is copied.
// Always returns 0.
int dictSetPointsBuffer(DICTContext ps, float *point_x, float *point_y) {
    ps->res_x = point_x;
    ps->res_y = point_y;

    return 0;
}

114

115

// Copies the current control-point coordinates into the context.
//
// ps->ncp floats are read from each input array. The x values go to offset
// 2 * ps->ncp_alloc_size of ps->points and the y values to offset
// 3 * ps->ncp_alloc_size. ps->stored is cleared afterwards.
// Always returns 0.
int dictSetCurrentPoints(DICTContext ps, const float *points_x, const float *points_y) {
    memcpy(ps->points + 2 * ps->ncp_alloc_size, points_x, ps->ncp * sizeof(float));
    memcpy(ps->points + 3 * ps->ncp_alloc_size, points_y, ps->ncp * sizeof(float));

    // dictGetCurrentPoints() reads from ps->points (not ps->res_x / ps->res_y)
    // while this flag is zero.
    ps->stored = 0;

    return 0;
}

123

124

// Thin wrapper: returns the result of fftGetCurrentPoints() for this context.
int dictCompute(DICTContext ps) {
    return fftGetCurrentPoints(ps);
}

127

128

// Runs fftGetCurrentPoints() and copies the current point coordinates out.
//
// res_x / res_y may each be NULL, or may be the very buffers registered with
// dictSetPointsBuffer(); in both cases no copy is made for that axis.
// Otherwise ps->ncp floats are copied per axis. The source is ps->res_x /
// ps->res_y when ps->stored is set, and the slots at offsets
// 2 * ncp_alloc_size / 3 * ncp_alloc_size of ps->points when it is not.
//
// Returns the error from fftGetCurrentPoints() if it is non-zero, else 0.
int dictGetCurrentPoints(DICTContext ps, float *res_x, float *res_y) {
    int err;

    err = fftGetCurrentPoints(ps);
    if (err) return err;

    if ((res_x)&&(res_x != ps->res_x)) {
        const float *data_x = ps->stored ? ps->res_x
                                         : ps->points + 2 * ps->ncp_alloc_size;

        memcpy(res_x, data_x, ps->ncp * sizeof(float));
    }

    if ((res_y)&&(res_y != ps->res_y)) {
        const float *data_y = ps->stored ? ps->res_y
                                         : ps->points + 3 * ps->ncp_alloc_size;

        memcpy(res_y, data_y, ps->ncp * sizeof(float));
    }

    return 0;
}

152

153

154

// Loads one block of the template image, starting at control point `icp`.
//
// Returns the result of fftLoadBaseFragment().
//
// NOTE(review): the `ncp` argument is not used; the block length is always
// min2(CP_BLOCK, ps->ncp - icp), as in dictLoadTemplateImage(). Confirm this
// is intended before relying on `ncp`.
int dictLoadTemplateFragment(DICTContext ps, int icp, int ncp, const unsigned char *img) {
    (void)ncp;

    return fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
}

157

158

// Loads the whole template image and selects the processing modes.
//
// Stores width/height in the context, chooses ps->mode and ps->base_mode by
// comparing the image area with the total area of the per-point windows, and
// then feeds the image to fftLoadBaseFragment() in blocks of CP_BLOCK control
// points.
//
// Returns 0 on success, or the first non-zero error reported by
// fftLoadBaseFragment().
int dictLoadTemplateImage(DICTContext ps, const unsigned char *img, int width, int height) {
    int err = 0;

    ps->width = width;
    ps->height = height;

    int size = 2 * ps->corr_size + 1;
    int size2 = size * size;

    int base_size = 4 * ps->corr_size + 1;
    int base_size2 = base_size * base_size;

    // mode == 1 is reported below as "image mode", mode == 0 as "fragment mode".
    if (width * height > ps->ncp * size2) {
        ps->mode = 0;
    } else {
        ps->mode = 1;
    }

    // if not enough space for caching enable anyway ?
    if (width * height > ps->ncp * base_size2) {
        ps->base_mode = 0;
    } else {
        ps->base_mode = 1;
        if (!ps->mode) {
            // Start from the full image extent; the bounds are corrected
            // after the fragments have been loaded.
            ps->minx = 0;
            ps->maxx = width - 1;
            ps->miny = 0;
            ps->maxy = height - 1;
        }
    }

    for (int icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {
        err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
        // Propagate the failure; previously the loop was left with `break`
        // and the function still returned 0.
        if (err) return err;
    }

    if ((ps->base_mode)&&(!ps->mode)) {
        // printf("%ux%u\n", width, height);

        // Correcting difference of area size between base and data images
        ps->minx += ps->corr_size;
        ps->miny += ps->corr_size;
        ps->maxx -= ps->corr_size;
        ps->maxy -= ps->corr_size;

        width = ceil(ps->maxx) - floor(ps->minx);
        height = ceil(ps->maxy) - floor(ps->miny);

        // printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);
        if (width * height < ps->ncp * size2) {
            ps->mode = 1;
        }
    }

    if (ps->mode) {
        reportMessage("Running in the image mode");
    } else {
        reportMessage("Running in the fragment mode");
    }

    return 0;
}

220

221

222

// Pushes one block of control points through the complete per-fragment
// sequence: copy, load, preprocess, process, postprocess.
//
// The copy and load steps are given the caller's `ncp`; the remaining steps
// use min2(CP_BLOCK, ps->ncp - icp). Every step that takes a stream receives
// the NULL stream.
//
// Returns 0 on success, or the first non-zero error from any step.
int dictLoadFragment(DICTContext ps, int icp, int ncp, const unsigned char *input) {
    int err;
    cudaStream_t stream = NULL;

    // Block length used by the three processing steps.
    const int block_ncp = min2(CP_BLOCK, ps->ncp - icp);

    err = fftCopyFragment(ps, icp, ncp, input);
    if (err) return err;

    err = fftLoadFragment(ps, icp, ncp, input, stream);
    if (err) return err;

    err = fftPreprocessFragment(ps, icp, block_ncp, stream);
    if (err) return err;

    err = fftProcessFragment(ps, icp, block_ncp, stream);
    if (err) return err;

    err = fftPostprocessFragment(ps, icp, block_ncp, stream);
    if (err) return err;

    return 0;
}

244

245

// Loads a complete image by passing it to dictLoadFragment() one block of
// CP_BLOCK control points at a time.
//
// Returns 0 on success, or the first non-zero error from dictLoadFragment().
//
// When DICT_HW_MEASURE_TIMINGS is defined, the wall-clock time of the whole
// loop is printed in microseconds.
// NOTE(review): the original instrumentation points could not be recovered;
// only slot 0 is measured here and the other slots print as 0 - confirm.
int dictLoadImage(DICTContext ps, unsigned char *img) {
    int err;
    int ncp = ps->ncp;

#ifdef DICT_HW_MEASURE_TIMINGS
    // `long` matches the %li conversions below; zero-filled so that slots
    // which are never measured do not print indeterminate values.
    long time[16] = { 0 };
    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
#endif

    for (int icp = 0; icp < ncp; icp+=CP_BLOCK) {
        err = dictLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), img);
        if (err) return err;
    }

#ifdef DICT_HW_MEASURE_TIMINGS
    gettimeofday(&tv2, NULL);
    time[0] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec);
    printf("Pre: %li, Nope: %li, Comp: %li, Post: %li, Copy/Load: %li, -: %li\n", time[0], time[1], time[2], time[3], time[4], time[5]);
#endif

#if 0
    // NOTE(review): disabled two-stream variant kept for reference. It does
    // not compile as written: `input`, `icp` (outside a loop) and `next_icp`
    // are not declared in this function, and the calls use names/argument
    // lists (fftComputeFragment, trailing NULL) that differ from the ones
    // used by dictLoadFragment().
    cudaStream_t stream[2];
    for (int i = 0; i < 2; ++i) {
        cudaStreamCreate(&stream[i]);
    }

    err = fftCopyFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
    err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input, stream[0], NULL);
    for (int i = 0; icp < ps->ncp; icp+=CP_BLOCK,i++) {
        err = fftPreprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
        err = fftComputeFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
        err = fftPostprocessFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), stream[i%2], NULL);
        if (next_icp < ps->ncp) {
            err = fftCopyFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
            err = fftLoadFragment(ps, next_icp, min2(CP_BLOCK, ps->ncp - next_icp), input, stream[(i+1)%2], NULL);
        }
    }

    for (int i = 0; i < 2; ++i) {
        cudaStreamDestroy(stream[i]);
    }
#endif

    return 0;
}

308

309

310

// Convenience entry point: loads the image with dictLoadImage() and, if that
// succeeds, returns the result of dictCompute().
//
// Returns the first non-zero error from either step, else 0.
int dictProcessImage(DICTContext ps, unsigned char *img) {
    int err = dictLoadImage(ps, img);
    if (err) return err;

    return dictCompute(ps);
}

Older »