/normxcorr/trunk : revision 15

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-10 13:09:44 UTC
Revision ID: csa@dside.dyndns.org-20091210130944-cppzkxvshdvg03ig

First attempt with CUDA streams

files modified:
cuda/local_sum.cu

cuda/local_sum.h

cuda/normxcorr_hw.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

155

return ERROR_CUDA_MALLOC;

156

}

157

158

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->fft_alloc_size * sizeof(cufftReal));

158

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

159

if (cuda_err) {

160

reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);

161

fftFree(ps);

162

return ERROR_CUDA_MALLOC;

163

}

164

cudaMemset((void*)ps->cuda_base_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

164

cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

165

166

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

167

if (cuda_err) {

271

}

272

273

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

274

275

for (int i = 0;i < ncp;i++) {

276

float x = data_x[i] - 1;

277

float y = data_y[i] - 1;

278

279

frac_x[i] = x - round(x * precision) / precision;

280

frac_y[i] = y - round(y * precision) / precision;

281

282

int xstart = roundf(x) - half_size;

283

int ystart = roundf(y) - half_size;

284

285

int xend = xstart + size;

286

int yend = xstart + size;

287

288

if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

289

continue;

290

}

291

292

if (check_mode) {

293

if (xstart < minx) minx = xstart;

294

if (ystart < miny) miny = ystart;

295

if (xend > maxx) maxx = xend;

296

if (yend > maxy) maxy = yend;

297

}

298

299

cudaMemcpy2D(

300

img + i * alloc_size,

301

size * sizeof(uint8_t),

302

fullimg + (xstart * height + ystart),

303

height * sizeof(uint8_t),

304

size * sizeof(uint8_t),

305

size,

306

cudaMemcpyHostToHost

307

);

308

309

cudaMemcpy2D(

310

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

311

img + i * alloc_size, size * sizeof(uint8_t),

312

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

313

);

314

315

banlist[i] = 0;

316

}

317

318

319

274

cufftReal *cuda_base_buffer = ps->cuda_base_buffer;

320

275

cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;

321

276

float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;

334

289

int lsum_size = ps->lsum_size;

335

290

int lsum_alloc = ps->lsum_alloc_size;

336

291

337

for (int i = 0;i < ncp;i++) {

338

if (banlist[i]) continue;

292

cudaStream_t stream[2];

293

for (int i = 0; i < 2; ++i) {

294

cudaStreamCreate(&stream[i]);

295

}

296

297

for (int i = 0;i <= ncp;i++) {

298

if (i < ncp) {

299

float x = data_x[i] - 1;

300

float y = data_y[i] - 1;

301

302

frac_x[i] = x - round(x * precision) / precision;

303

frac_y[i] = y - round(y * precision) / precision;

304

305

int xstart = roundf(x) - half_size;

306

int ystart = roundf(y) - half_size;

307

308

int xend = xstart + size;

309

int yend = xstart + size;

310

311

if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

312

continue;

313

}

314

315

if (check_mode) {

316

if (xstart < minx) minx = xstart;

317

if (ystart < miny) miny = ystart;

318

if (xend > maxx) maxx = xend;

319

if (yend > maxy) maxy = yend;

320

}

321

322

cudaMemcpy2D(

323

img + i * alloc_size,

324

size * sizeof(uint8_t),

325

fullimg + (xstart * height + ystart),

326

height * sizeof(uint8_t),

327

size * sizeof(uint8_t),

328

size,

329

cudaMemcpyHostToHost

330

);

331

332

cudaMemcpy2DAsync(

333

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

334

img + i * alloc_size, size * sizeof(uint8_t),

335

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,

336

stream[i%2]

337

);

338

339

banlist[i] = 0;

340

}

341

if (i > 0) {

342

int j = i - 1;

339

343

340

344

if (blocks_power < 0) {

341

vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(

342

cuda_input_buffer + i * side_alloc2, side_alloc,

343

cuda_base_buffer, fft_size,

345

vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(

346

cuda_input_buffer + j * side_alloc2, side_alloc,

347

cuda_base_buffer + j*alloc_size, fft_size,

344

348

lsum_temp + lsum_size * (lsum_alloc + 1),

345

349

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

346

350

lsum_alloc,

347

351

size, blocks

348

352

);

349

353

} else {

350

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D>>>(

351

cuda_input_buffer + i * side_alloc2, side_alloc,

352

cuda_base_buffer, fft_size,

354

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(

355

cuda_input_buffer + j * side_alloc2, side_alloc,

356

cuda_base_buffer + j*alloc_size, fft_size,

353

357

lsum_temp + lsum_size * (lsum_alloc + 1),

354

358

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

355

359

lsum_alloc,

359

363

360

364

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

361

365

local_sum(ps,

362

lsum_cache + i * alloc_size, denom_cache + i * alloc_size,

366

lsum_cache + j * alloc_size, denom_cache + j * alloc_size,

363

367

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

364

lsum_temp, lsum_temp + lsum_step);

365

366

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + i * alloc_size);

367

}

368

lsum_temp, lsum_temp + lsum_step,

369

stream[j%2]);

370

371

// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);

372

}

373

}

374

375

for (int j = 0;j < ncp;j++) {

376

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);

377

}

378

379

for (int i = 0; i < 2; ++i) {

380

cudaStreamDestroy(stream[i]);

381

}

382

383

368

384

369

385

if (check_mode) {

370

386

ps->minx = minx;

378

394

}

379

395

380

396

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

397

// return 0;

398

381

399

int width = mxGetN(image);

382

400

int height = mxGetM(image);

383

401

Older »