/normxcorr/trunk : revision 10

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-10 03:16:21 UTC
Revision ID: csa@dside.dyndns.org-20091210031621-2a15m2tdumdz3s39

Block computational kernels

files modified:
cuda/Makefile

cuda/normxcorr_hw.cu

cuda/normxcorr_hw.h

cuda/normxcorr_hw_kernel.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

#define min2(a,b) (((a)<(b))?(a):(b))

#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))

#define calc_blocks(size,rounding) (((size)/(rounding)) + (((size)%(rounding))?1:0))

static const char debruijn[32] = {

0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,

31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9

};

static TProcessingState *pstate = NULL;

static void fftFree(TProcessingState *ps) {

if (ps->cuda_fft_buffer) {

if (ps->coords) mxDestroyArray(ps->coords);

if (ps->banlist) free(ps->banlist);

cudaFree(ps->cuda_lsum_temp);

cudaFree(ps->cuda_lsum_buffer);

cudaFree(ps->cuda_denom_buffer);

cudaFree(ps->cuda_fft_buffer);

if (ps->coords) mxDestroyArray(ps->coords);

if (ps->banlist) free(ps->banlist);

if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);

if (ps->cuda_lsum_buffer) cudaFree(ps->cuda_lsum_buffer);

if (ps->cuda_denom_buffer) cudaFree(ps->cuda_denom_buffer);

if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);

cudaFree(ps->cuda_data_buffer);

cudaFree(ps->cuda_base_buffer);

cudaFree(ps->cuda_final_buffer);

cudaFree(ps->cuda_result_buffer);

cudaFree(ps->cuda_temp_buffer);

cudaFree(ps->cuda_input_buffer);

cudaFreeHost(ps->input_buffer);

cudaFree(ps->cuda_cp);

cudaFreeHost(ps->points);

}

if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);

if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);

if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);

if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);

if (ps->input_buffer) cudaFreeHost(ps->input_buffer);

if (ps->cuda_cp) cudaFree(ps->cuda_cp);

if (ps->points) cudaFreeHost(ps->points);

// DS: Source of bug, that occasionaly can corrupt something ...

if (ps->cudpp_initialized) {

ps->cudpp_initialized = true;

100

101

cuda_err = cudaMalloc((void**)&ps->cuda_fft_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

100

cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

102

101

if (cuda_err) {

103

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_buffer is failed", ps->ncp, ps->fft_alloc_size);

102

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);

104

103

fftFree(ps);

105

104

return ERROR_CUDA_MALLOC;

106

105

}

107

106

108

107

109

108

size = max3(

110

ps->fft_alloc_size * sizeof(cufftComplex), /* FFT multiplication */

109

(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */

111

110

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */

112

111

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */

113

112

);

134

133

return ERROR_CUDA_MALLOC;

135

134

}

136

135

137

cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * calc_alloc(ps->ncp, CP_BLOCK) * sizeof(float));

136

cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * ps->ncp_alloc_size * sizeof(float));

138

137

if (cuda_err) {

139

138

reportError("Device memory allocation of %u*%u*float bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

140

139

fftFree(ps);

155

154

return ERROR_CUDA_MALLOC;

156

155

}

157

156

158

cuda_err = cudaMalloc((void**)&ps->cuda_final_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

159

if (cuda_err) {

160

reportError("Device memory allocation of %u*%u*float bytes for cuda_final_buffer is failed", ps->ncp, ps->fft_alloc_size);

161

fftFree(ps);

162

return ERROR_CUDA_MALLOC;

163

}

164

cudaMemset((void*)ps->cuda_final_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(float));

165

166

cuda_err = cudaMalloc((void**)&ps->cuda_result_buffer, ps->ncp*ps->fft_alloc_size * sizeof(cufftReal));

167

if (cuda_err) {

168

reportError("Device memory allocation of %u*cufftReal bytes for cuda_result_buffer is failed", ps->fft_alloc_size);

169

fftFree(ps);

170

return ERROR_CUDA_MALLOC;

171

}

172

173

157

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->fft_alloc_size * sizeof(cufftReal));

174

158

if (cuda_err) {

175

159

reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);

178

162

}

179

163

cudaMemset((void*)ps->cuda_base_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

180

164

181

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, ps->fft_alloc_size * sizeof(cufftReal));

165

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

182

166

if (cuda_err) {

183

reportError("Device memory allocation of %u*cufftReal bytes for cuda_data_buffer is failed", ps->fft_alloc_size);

167

reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);

184

168

fftFree(ps);

185

169

return ERROR_CUDA_MALLOC;

186

170

}

187

cudaMemset((void*)ps->cuda_data_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

171

cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

188

172

189

173

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

190

174

if (cuda_err) {

370

354

// The values of TEMPLATE cannot all be the same

371

355

372

356

uint8_t *cudaInputPtr = ps->cuda_input_buffer + (icp + i) * side_alloc2;

373

cufftComplex *cudaPtr = ps->cuda_fft_buffer + (icp + i) * alloc_size;

357

cufftComplex *cudaPtr = ps->cuda_fft_cache + (icp + i) * alloc_size;

374

358

375

359

cudaMemcpy2D(

376

360

cudaInputPtr, side_alloc * sizeof(uint8_t),

435

419

}

436

420

437

421

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

422

438

423

uint8_t *img = ps->input_buffer;

424

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

425

float *cuda_data_buffer = ps->cuda_data_buffer;

439

426

440

427

uint8_t *banlist = ps->banlist + icp;

441

428

442

cufftComplex *cudaDataPtr = (cufftComplex*)ps->cuda_temp_buffer;

443

cufftReal *cudaRealPtr = ps->cuda_data_buffer;

444

445

dim3 input_block_dim(size, 1, 1);

446

dim3 input_grid_dim(size, 1, 1);

447

dim3 block_dim(fft_size / 2 + 1, 1, 1);

448

dim3 grid_dim(fft_size, 1, 1);

449

450

429

for (int i = 0;i < ncp;i++) {

451

430

float x = data_x[i] - 1;

452

431

float y = data_y[i] - 1;

472

451

cudaMemcpyHostToHost

473

452

);

474

453

475

476

cufftComplex *cudaBasePtr = ps->cuda_fft_buffer + (i+icp) * alloc_size;

477

cufftReal *cudaResultPtr = ps->cuda_result_buffer + (i+icp) * alloc_size;

478

479

uint8_t *cudaInputPtr = ps->cuda_input_buffer + i*side_alloc2;

480

481

454

cudaMemcpy2D(

482

cudaInputPtr, side_alloc * sizeof(uint8_t),

455

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

483

456

img, size * sizeof(uint8_t),

484

457

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

485

458

);

486

487

vecPack<<<input_grid_dim, input_block_dim>>>(cudaInputPtr, side_alloc, cudaRealPtr, fft_size, size);

488

489

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);

490

491

vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaBasePtr, fft_size/2+1);

492

493

cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaResultPtr);

494

459

}

495

460

496

497

int cp_blocks1, cp_blocks, side_blocks;

498

if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;

499

else cp_blocks = ncp / CP_BLOCK_SIZE;

500

501

if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;

502

else side_blocks = size / SIDE_BLOCK_SIZE;

503

461

dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

462

dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

463

464

//int input_blocks = calc_blocks(size2, BLOCK_SIZE_2D);

465

466

467

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

468

int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);

469

int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);

470

int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);

471

int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;

472

473

474

// Computing sum and std

504

475

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

505

476

506

477

float *sumbuf = ps->cuda_cp + icp;

507

478

float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;

508

479

509

dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

510

dim3 joint_grid_dim(side_blocks, cp_blocks, 1);

511

512

stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

513

514

if (ncp%BLOCK_SIZE_1D) cp_blocks1 = (ncp / BLOCK_SIZE_1D) + 1;

515

else cp_blocks1 = ncp / BLOCK_SIZE_1D;

516

480

dim3 stat_grid_dim(side_blocks, cp_blocks, 1);

481

stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

517

482

stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

518

483

519

520

dim3 output_block_dim(fft_size, 1, 1);

521

dim3 output_grid_dim(fft_size, 1, 1);

522

523

for (int i = 0;i < ncp;i++) {

524

float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;

525

float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;

526

cufftReal *cudaRealPtr = ps->cuda_result_buffer + (i+icp)*alloc_size;

527

float *cudaResultPtr = ps->cuda_final_buffer + (i+icp)*alloc_size;

528

529

vecCompute<<<output_grid_dim, output_block_dim>>>(

530

cudaResultPtr,

531

cudaRealPtr, 1./(fft_size2 * (size2 - 1)),

532

cudaLSum, sumbuf+i, 1. / (size2 * (size2 - 1)),

533

cudaDenom, stdbuf+i,

534

fft_size

535

);

536

}

537

538

484

// Packing input data for FFT

485

dim3 input_grid_dim(input_blocks, cp_blocks, 1);

486

487

char side_blocks_power;

488

if ((side_blocks&(side_blocks-1))) {

489

side_blocks_power = -1;

490

} else {

491

side_blocks_power = debruijn[((uint32_t)side_blocks * 0x077CB531) >> 27];

492

}

493

// printf("power %i\n", side_blocks_power);

494

495

// cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

496

if (side_blocks_power < 0) {

497

vecPack<<<input_grid_dim, block_side_cp>>>(

498

cuda_input_buffer, side_alloc2, side_alloc,

499

cuda_data_buffer, alloc_size, fft_size,

500

size, side_blocks

501

);

502

} else {

503

vecPackFast<<<input_grid_dim, block_side_cp>>>(

504

cuda_input_buffer, side_alloc2, side_alloc,

505

cuda_data_buffer, alloc_size, fft_size,

506

size, side_blocks_power

507

);

508

}

509

510

// Performing FFT's

511

cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

512

513

for (int i = 0;i < ncp;i++) {

514

if (banlist[i]) continue;

515

cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);

516

}

517

518

int complex_blocks = calc_blocks(fft_size * (fft_size / 2 + 1), SIDE_BLOCK_SIZE);

519

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

520

vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_size/2+1);

521

522

523

// First in-place transform for some reason is failing, therefore we

524

// have one alloc_size spacing between starts

525

cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;

526

for (int i = 0;i < ncp;i++) {

527

if (banlist[i]) continue;

528

cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);

529

}

530

531

float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

532

533

int fft2_blocks = calc_blocks(fft_size*fft_size, SIDE_BLOCK_SIZE);

534

dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);

535

vecCompute<<<compute_grid_dim, block_side_cp>>>(

536

cuda_final_buffer,

537

cuda_result_buffer, 1./(fft_size2 * (size2 - 1)),

538

ps->cuda_lsum_buffer + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

539

ps->cuda_denom_buffer + icp*alloc_size, stdbuf,

540

alloc_size, fft_size

541

);

542

543

// Looking for maximum

539

544

float *xbuf = sumbuf;

540

545

float *ybuf = stdbuf;

541

546

542

547

int32_t *posbuf = (int*)ps->cuda_temp_buffer;

543

548

float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

544

549

545

int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);

546

547

dim3 result_block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

548

550

dim3 result_grid_dim(fft_blocks, cp_blocks, 1);

549

find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);

550

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

551

find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

552

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

551

553

552

554

return 0;

553

555

}

661

663

662

664

663

665

We don't really want to compute here

664

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_buffer + icp * alloc_size);

666

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_cache + icp * alloc_size);

665

667

666

668

667

669

return 0;

680

682

681

683

float *ar = (float*)mxGetPr(res);

682

684

683

cudaMemcpy(ar, ps->cuda_final_buffer + icp * alloc_size, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

685

cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;

686

float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

687

cudaMemcpy(ar, cuda_final_buffer, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

684

688

685

689

return res;

686

690

}

Older »