/normxcorr/trunk : revision 27

To get this branch, use:

bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/normxcorr_hw.cu.h

Committer: Suren A. Chilingaryan
Date: 2010-04-17 07:43:38 UTC
Revision ID: csa@dside.dyndns.org-20100417074338-l45lxndn5tv72bpx

Patches for Windows port

files added:
dict_hw/matlab/compile64.bat

dict_hw/src/win

dict_hw/src/win/getopt.c

dict_hw/src/win/getopt.h

dict_hw/src/win/getopt_long.c

files renamed:
dict_hw/matlab/normxcorr_hw.cu => dict_hw/matlab/normxcorr_hw.c

files modified:
dict_hw/CMakeLists.txt

dict_hw/cmake/FindCUDA.cmake

dict_hw/cmake/FindCUDPP.cmake

dict_hw/src/CMakeLists.txt

dict_hw/src/dict_hw.cu *

dict_hw/src/dict_hw_test.cpp

dict_hw/src/dict_image.cpp

dict_hw/src/normxcorr_hw.cu.h

dict_hw/src/normxcorr_hw_msg.cpp

dict_hw/src/validate.cpp

Show diffs side-by-side

added

removed

dict_hw/src/normxcorr_hw.cu.h

252

int lsum_size = ps->lsum_size;

253

int lsum_alloc = ps->lsum_alloc_size;

254

255

cudaStream_t stream[2];

256

for (int i = 0; i < 2; ++i) {

257

cudaStreamCreate(&stream[i]);

258

}

259

255

260

for (int i = 0;i <= ncp;i++) {

261

if (i < ncp) {

256

for (int i = 0;i < ncp;i++) {

262

257

float x = data_x[i] - 1;

263

258

float y = data_y[i] - 1;

264

259

304

299

);

305

300

}

306

301

307

cudaMemcpy2DAsync(

302

cudaMemcpy2D(

308

303

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

309

304

img + i * alloc_size, size * sizeof(uint8_t),

310

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,

311

stream[i%2]

305

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

312

306

);

313

307

314

308

banlist[i] = 0;

315

}

316

if (i > 0) {

317

int j = i - 1;

309

{

310

int j = i ;

318

311

319

312

if (ps->base_blocks_power < 0) {

320

vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(

313

vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(

321

314

cuda_input_buffer + j * side_alloc2, side_alloc,

322

315

cuda_base_buffer + j*alloc_size, fft_real_size,

323

316

lsum_temp + lsum_size * (lsum_alloc + 1),

326

319

size, blocks

327

320

);

328

321

} else {

329

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(

322

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D>>>(

330

323

cuda_input_buffer + j * side_alloc2, side_alloc,

331

324

cuda_base_buffer + j*alloc_size, fft_real_size,

332

325

lsum_temp + lsum_size * (lsum_alloc + 1),

340

333

local_sum(ps,

341

334

lsum_cache + j * alloc_size, denom_cache + j * alloc_size,

342

335

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

343

lsum_temp, lsum_temp + lsum_step,

344

stream[j%2]);

336

lsum_temp, lsum_temp + lsum_step, NULL);

345

337

346

338

// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);

347

339

}

351

343

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);

352

344

}

353

345

354

for (int i = 0; i < 2; ++i) {

355

cudaStreamDestroy(stream[i]);

356

}

357

358

346

if (check_mode) {

359

347

ps->minx = minx;

360

348

ps->maxx = maxx;

429

417

}

430

418

431

419

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {

420

int i;

432

421

int half_size = ps->corr_size;

433

422

int size = 2 * half_size + 1;

434

423

437

426

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

438

427

uint8_t *img = ps->input_buffer;

439

428

429

430

for (i = 0;i < ncp;i++) {

431

cudaMemcpy2D(

432

cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),

433

img + i * size * size, size * sizeof(uint8_t),

434

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

435

);

436

}

437

440

438

441

for (int i = 0;i < ncp;i++) {

442

cudaMemcpy2D(

443

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

444

img + i * size2, size * sizeof(uint8_t),

445

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

446

);

447

}

448

449

450

439

cudaMemcpy3DParms copy_params = { 0 };

451

440

452

441

copy_params.dstPtr = make_cudaPitchedPtr(

459

448

copy_params.kind = cudaMemcpyHostToDevice;

460

449

461

450

cudaMemcpy3DAsync(&copy_params, stream0);

451

462

452

463

453

return 0;

464

454

}

Older »