/normxcorr/trunk : revision 30

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/normxcorr_hw.cu.h

Committer: Suren A. Chilingaryan
Date: 2010-04-22 19:11:35 UTC
Revision ID: csa@dside.dyndns.org-20100422191135-y9o6i4cdnevm1y4j

Remove all remnants of streams, and make 3D copy in LoadBaseImage

files modified:
dict_hw/README

dict_hw/src/dict_hw.cu

dict_hw/src/normxcorr_hw.cu.h

dict_hw/src/normxcorr_hw.h

Show diffs side-by-side

added added

removed removed

dict_hw/src/normxcorr_hw.cu.h

static void fftFree(TProcessingState *ps) {

if (ps->base_image) free(ps->base_image);

if (ps->cuda_image) cudaFree(ps->cuda_image);

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

233

234

cudaError cuda_err;

234

235

int image_size = ps->width * ps->height;

235

236

237

if (ps->base_image) free(ps->base_image);

236

238

if (ps->cuda_image) cudaFree(ps->cuda_image);

237

239

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

238

240

250

252

return DICT_ERROR_CUDA_MALLOC;

251

253

}

252

254

255

ps->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));

256

if (!ps->base_image) {

257

reportError("Memory allocation of %u*%u*uint8_t bytes for base_image is failed", ps->width, ps->height);

258

fftFree(ps);

259

return DICT_ERROR_MALLOC;

260

}

261

253

262

return 0;

254

263

}

255

264

355

364

matlab_width = width;

356

365

}

357

366

358

if ((ps->use_cache)||(real_width * real_height < ps->ncp * size * size)) {

367

if (real_width * real_height < ps->ncp * size * size * MODE_COEFFICIENT) {

359

368

ps->base_mode = 1;

360

369

} else {

361

370

ps->base_mode = 0;

369

378

cudaMemcpyHostToDevice

370

379

);

371

380

// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);

372

}

381

} else if (!ps->use_cache) {

382

memcpy(ps->base_image, fullimg, width * height * sizeof(uint8_t));

383

}

373

384

374

385

return 0;

375

386

}

386

397

387

398

int half_size = 2 * ps->corr_size;

388

399

int size = 2 * half_size + 1;

400

int size2 = size * size;

389

401

390

402

int fft_real_size = ps->fft_real_size;

391

403

404

416

float *lsum_temp = (float*)ps->cuda_lsum_temp;

405

417

int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

406

418

419

uint8_t *img = ps->input_buffer;

407

420

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

408

421

cufftReal *cuda_base_buffer = ps->cuda_base_buffer;

409

422

441

454

size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice

442

455

);

443

456

} else {

444

cudaMemcpy2D(

457

/* cudaMemcpy2D(

445

458

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

446

459

fullimg + offset, matlab_width * sizeof(uint8_t),

447

460

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

448

);

449

}

450

451

452

uint8_t *img = ps->input_buffer;

453

454

if (ps->matlab_mode) {

461

);*/

462

455

463

cudaMemcpy2D(

456

img + i * alloc_size,

457

size * sizeof(uint8_t),

458

fullimg + (xstart * height + ystart),

459

height * sizeof(uint8_t),

460

size * sizeof(uint8_t),

461

size,

464

img + i * size2, size * sizeof(uint8_t),

465

fullimg + offset, matlab_width * sizeof(uint8_t),

466

size * sizeof(uint8_t), size,

462

467

cudaMemcpyHostToHost

463

468

);

464

} else {

465

cudaMemcpy2D(

466

img + i * alloc_size,

467

size * sizeof(uint8_t),

468

fullimg + (ystart * width + xstart),

469

width * sizeof(uint8_t),

470

size * sizeof(uint8_t),

471

size,

472

cudaMemcpyHostToHost

473

);

469

474

470

}

475

476

cudaMemcpy2D(

477

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

478

img + i * alloc_size, size * sizeof(uint8_t),

479

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

480

);

481

482

483

471

}

472

473

474

if (!image_mode) {

475

cudaMemcpy3DParms copy_params = { 0 };

476

477

copy_params.dstPtr = make_cudaPitchedPtr(

478

cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc

479

);

480

copy_params.srcPtr = make_cudaPitchedPtr(

481

img, size * sizeof(uint8_t), size, size

482

);

483

copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);

484

copy_params.kind = cudaMemcpyHostToDevice;

485

486

cudaMemcpy3D(&copy_params);

487

}

488

489

490

for (int i = 0;i < ncp;i++) {

491

if (banlist[i]) continue;

492

484

493

if (ps->base_blocks_power < 0) {

485

494

vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(

486

495

cuda_input_buffer + i * side_alloc2, side_alloc,

585

594

matlab_width = width;

586

595

}

587

596

588

if (real_width * real_height < ps->ncp * size * size) {

597

if (real_width * real_height < ps->ncp * size * size * MODE_COEFFICIENT) {

589

598

ps->mode = 1;

590

599

} else {

591

600

ps->mode = 0;

592

601

}

593

//ps->mode = 0;

594

602

595

603

if (ps->mode) {

596

604

cudaMemcpy2D(

664

672

size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice

665

673

);

666

674

} else {

667

#ifdef WIN32

668

cudaMemcpy2D(

675

/*cudaMemcpy2D(

669

676

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

670

677

fullimg + offset, matlab_width * sizeof(uint8_t),

671

678

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

672

);

673

#else /* WIN32 */

679

);*/

680

674

681

cudaMemcpy2D(

675

682

img + i * size2, size * sizeof(uint8_t),

676

683

fullimg + offset, matlab_width * sizeof(uint8_t),

677

684

size * sizeof(uint8_t), size,

678

685

cudaMemcpyHostToHost

679

686

);

680

681

cudaMemcpy2D(

682

cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),

683

img + i * size2, size * sizeof(uint8_t),

684

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

685

);

686

687

#endif /* WIN32 */

688

687

}

689

688

690

689

}

691

690

692

#ifndef WIN32

693

691

if (!image_mode) {

694

692

cudaMemcpy3DParms copy_params = { 0 };

695

693

704

702

705

703

cudaMemcpy3D(&copy_params);

706

704

}

707

#endif /* !WIN32 */

708

705

709

706

return 0;

710

707

}

713

710

static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

714

711

static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

715

712

716

static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

713

static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp) {

717

714

int half_size = ps->corr_size;

718

715

int size = 2 * half_size + 1;

719

716

738

735

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

739

736

740

737

dim3 stat_grid_dim(side_blocks, cp_blocks, 1);

741

stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);

742

stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

738

stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);

739

stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

743

740

744

741

// Packing input data for FFT

745

742

dim3 input_grid_dim(input_blocks, cp_blocks, 1);

746

743

747

744

if (ps->side_blocks_power < 0) {

748

vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(

745

vecPack<<<input_grid_dim, block_side_cp>>>(

749

746

cuda_input_buffer, side_alloc2, side_alloc,

750

747

cuda_data_buffer, alloc_size, fft_real_size,

751

748

size, side_blocks

752

749

);

753

750

} else {

754

vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(

751

vecPackFast<<<input_grid_dim, block_side_cp>>>(

755

752

cuda_input_buffer, side_alloc2, side_alloc,

756

753

cuda_data_buffer, alloc_size, fft_real_size,

757

754

size, ps->side_blocks_power

761

758

return 0;

762

759

}

763

760

764

static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

761

static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp) {

765

762

int half_size = ps->corr_size;

766

763

int size = 2 * half_size + 1;

767

764

int size2 = size * size;

787

784

788

785

// Use real size everthere

789

786

// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);

790

// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(

787

// vecCompute<<<compute_grid_dim, block_side_cp>>>(

791

788

// cuda_final_buffer,

792

789

// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),

793

790

// ps->cuda_lsum_cache + cache_icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

799

796

int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;

800

797

dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);

801

798

802

vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(

799

vecCompute<<<compute_grid_dim, block_side_cp>>>(

803

800

cuda_final_buffer, fft_size,

804

801

cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),

805

802

ps->cuda_lsum_cache + cache_icp * alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

821

818

// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);

822

819

// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

823

820

824

find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

825

find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

821

find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

822

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

826

823

827

824

return 0;

828

825

}

829

826

830

static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

827

static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp) {

831

828

int fft_real_size = ps->fft_real_size;

832

829

833

830

int alloc_size = ps->fft_alloc_size;

841

838

// Performing FFT's

842

839

cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

843

840

844

cufftSetStream(ps->cufft_r2c_plan, stream0);

845

cufftSetStream(ps->cufft_c2r_plan, stream0);

846

847

841

for (int i = 0;i < ncp;i++) {

848

842

if (banlist[i]) continue;

849

843

cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);

851

845

852

846

int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);

853

847

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

854

vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);

848

vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);

855

849

856

850

// First in-place transform for some reason is failing, therefore we

857

851

// have one alloc_size spacing between starts (see cuda_fft_buffer set above)

Older »