/normxcorr/trunk : revision 16

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-10 15:47:44 UTC
Revision ID: csa@dside.dyndns.org-20091210154744-min3x71y3tgrkvpu

Optimize FFT size

files modified:
automate_image.m

cuda/local_sum.cu

cuda/normxcorr_hw.cu

cuda/normxcorr_hw.h

cuda/normxcorr_hw_kernel.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9

};

static inline int next_power(int n) {

n--;

n |= n >> 1; // Divide by 2^k for consecutive doublings of k up to 32,

n |= n >> 2; // and then or the results.

n |= n >> 4;

n |= n >> 8;

n |= n >> 16;

n++; // The result is a number of 1 bits equal to the number

// of bits in the original number, plus 1. That's the

// next highest power of 2.

return n;

}

static TProcessingState *pstate = NULL;

static void fftFree(TProcessingState *ps) {

int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;

cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);

cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);

if (cufft_err) {

reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);

return ERROR_CUFFT;

}

cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_size, ps->fft_size, CUFFT_C2R);

cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_C2R);

if (cufft_err) {

reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);

cufftDestroy(ps->cufft_r2c_plan);

242

255

int half_size = 2 * ps->corr_size;

243

256

int size = 2 * half_size + 1;

244

257

245

int fft_size = ps->fft_size;

258

int fft_real_size = ps->fft_real_size;

246

259

247

260

int ncp_alloc = ps->ncp_alloc_size;

248

261

int alloc_size = ps->fft_alloc_size;

344

357

if (blocks_power < 0) {

345

358

vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(

346

359

cuda_input_buffer + j * side_alloc2, side_alloc,

347

cuda_base_buffer + j*alloc_size, fft_size,

360

cuda_base_buffer + j*alloc_size, fft_real_size,

348

361

lsum_temp + lsum_size * (lsum_alloc + 1),

349

362

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

350

363

lsum_alloc,

353

366

} else {

354

367

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(

355

368

cuda_input_buffer + j * side_alloc2, side_alloc,

356

cuda_base_buffer + j*alloc_size, fft_size,

369

cuda_base_buffer + j*alloc_size, fft_real_size,

357

370

lsum_temp + lsum_size * (lsum_alloc + 1),

358

371

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

359

372

lsum_alloc,

404

417

int size2 = size * size;

405

418

406

419

int fft_size = ps->fft_size;

407

int fft_size2 = fft_size * fft_size;

420

int fft_real_size = ps->fft_real_size;

408

421

409

422

int ncp_alloc = ps->ncp_alloc_size;

410

423

int alloc_size = ps->fft_alloc_size;

479

492

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

480

493

int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);

481

494

int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);

495

int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;

482

496

int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);

483

int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;

484

497

485

498

// Computing sum and std

486

499

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

507

520

if (side_blocks_power < 0) {

508

521

vecPack<<<input_grid_dim, block_side_cp>>>(

509

522

cuda_input_buffer, side_alloc2, side_alloc,

510

cuda_data_buffer, alloc_size, fft_size,

523

cuda_data_buffer, alloc_size, fft_real_size,

511

524

size, side_blocks

512

525

);

513

526

} else {

514

527

vecPackFast<<<input_grid_dim, block_side_cp>>>(

515

528

cuda_input_buffer, side_alloc2, side_alloc,

516

cuda_data_buffer, alloc_size, fft_size,

529

cuda_data_buffer, alloc_size, fft_real_size,

517

530

size, side_blocks_power

518

531

);

519

532

}

526

539

cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);

527

540

}

528

541

529

int complex_blocks = calc_blocks(fft_size * (fft_size / 2 + 1), SIDE_BLOCK_SIZE);

542

int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);

530

543

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

531

vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_size/2+1);

532

544

vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);

533

545

534

546

// First in-place transform for some reason is failing, therefore we

535

547

// have one alloc_size spacing between starts

541

553

542

554

float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

543

555

544

int fft2_blocks = calc_blocks(fft_size*fft_size, SIDE_BLOCK_SIZE);

556

// Use real size everthere

557

// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);

558

// vecCompute<<<compute_grid_dim, block_side_cp>>>(

559

// cuda_final_buffer,

560

// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),

561

// ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

562

// ps->cuda_denom_cache + icp*alloc_size, stdbuf,

563

// alloc_size

564

// );

565

566

int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;

545

567

dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);

546

568

vecCompute<<<compute_grid_dim, block_side_cp>>>(

547

cuda_final_buffer,

548

cuda_result_buffer, 1./(fft_size2 * (size2 - 1)),

569

cuda_final_buffer, fft_size,

570

cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),

549

571

ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

550

572

ps->cuda_denom_cache + icp*alloc_size, stdbuf,

551

alloc_size, fft_size

573

alloc_size, fft_blocks

552

574

);

553

575

554

576

// Looking for maximum

559

581

float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

560

582

561

583

dim3 result_grid_dim(fft_blocks, cp_blocks, 1);

584

585

// Use real size everthere

586

// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);

587

// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

562

588

find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

563

589

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

564

590

765

791

766

792

unsigned int icp;

767

793

794

int optimize;

768

795

int width, height;

769

796

int size, size2;

770

797

int base_size, base_size2;

1098

1125

plhs[0] = fftGetPoints(ps);

1099

1126

break;

1100

1127

case ACTION_SETUP:

1101

if (nrhs != 5) {

1102

reportError("SETUP action expects 'ncp', 'corrsize', and 'precision' parameters");

1128

if (nrhs != 6) {

1129

reportError("SETUP action expects 'ncp', 'corrsize', 'precision', and 'optimization level' parameters");

1103

1130

return;

1104

1131

}

1105

1132

1108

1135

ps->ncp = (int)mxGetScalar(prhs[2]);

1109

1136

1110

1137

ps->precision = (int)mxGetScalar(prhs[4]);

1138

1139

optimize = (int)mxGetScalar(prhs[5]);

1111

1140

1112

1141

iprop = (int)mxGetScalar(prhs[3]);

1113

1142

ps->corr_size = iprop;

1143

ps->subimage_size = ps->corr_size * 4 + 1;

1114

1144

ps->fft_size = 6 * iprop + 1;

1115

ps->subimage_size = ps->corr_size * 4 + 1;

1145

1146

if (optimize > 3) {

1147

ps->fft_real_size = next_power(ps->fft_size);

1148

} else {

1149

ps->fft_real_size = ps->fft_size;

1150

}

1116

1151

1117

1152

ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);

1118

1153

ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);

1119

ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

1154

1155

// ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

1156

ps->fft_alloc_size = calc_alloc(ps->fft_real_size * ps->fft_real_size, BLOCK_SIZE_1D);

1120

1157

1121

1158

ps->lsum_size = ps->corr_size * 2 + 1;

1122

1159

ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;

Older »