/normxcorr/trunk : revision 7

10

#include "normxcorr_hw_msg.h"

11

#include "normxcorr_hw_kernel.cu"

12

13

#define max2(a,b) ((a>b)?a:b)

14

#define min2(a,b) ((a<b)?a:b)

13

#define max3(a,b,c) max2(max2(a,b),c)

14

#define max2(a,b) (((a)>(b))?(a):(b))

15

#define min2(a,b) (((a)<(b))?(a):(b))

15

16

17

#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))

16

18

17

19

static TProcessingState *pstate = NULL;

18

20

109

111

}

110

112

111

113

112

size = max2(

113

ps->fft_alloc_size * sizeof(cufftComplex), /* FFT multiplication */

114

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int) /* Sum, Std computations */

114

size = max3(

115

ps->fft_alloc_size * sizeof(cufftComplex), /* FFT multiplication */

116

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */

117

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */

115

118

);

116

119

117

120

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, size);

129

132

return ERROR_CUDA_MALLOC;

130

133

}

131

134

132

cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * CP_BLOCK * sizeof(float));

135

cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * calc_alloc(ps->ncp, CP_BLOCK) * sizeof(float));

133

136

if (cuda_err) {

134

reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

137

reportError("Device memory allocation of %u*%u*float bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

135

138

fftFree(ps);

136

139

return ERROR_CUDA_MALLOC;

137

140

}

283

286

}

284

287

285

288

286

/*

287

static int fftComputeFragment(TProcessingState *ps, int icp, const mxArray *data, float sum, float denom) {

288

// uint8_t *dataPtr;

289

// double *ar;

290

// mxArray *res;

291

292

int size = ps->fft_size;

293

int size2 = size * size;

294

int alloc_size = ps->fft_alloc_size;

295

296

int half_size = ps->corr_size;

297

int N = 2 * half_size + 1;

298

299

// int N = mxGetM(data);

300

int N2 = N * N;

301

302

303

dim3 input_block_dim(N, 1, 1);

304

dim3 input_grid_dim(N, 1, 1);

305

306

dim3 block_dim(size / 2 + 1, 1, 1);

307

dim3 grid_dim(size, 1, 1);

308

309

dim3 output_block_dim(size, 1, 1);

310

dim3 output_grid_dim(size, 1, 1);

311

312

// uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

313

// cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

314

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

315

// cufftComplex *cudaDataPtr = ps->cuda_data_buffer;

316

float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

317

318

319

dataPtr = (uint8_t*)mxGetData(data);

320

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

321

322

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);

323

324

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);

325

326

vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaPtr, size/2+1);

327

328

cudaRealPtr = ps->cuda_result_buffer;

329

cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);

330

331

332

cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;

333

334

float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;

335

float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;

336

337

vecCompute<<<output_grid_dim, output_block_dim>>>(

338

cudaResultPtr,

339

cudaRealPtr, 1./(size2 * (N2 - 1)),

340

cudaLSum, sum / (N2 * (N2 - 1)),

341

cudaDenom, denom,

342

size

343

);

344

*/

345

/*

346

int size = ps->fft_size;

347

int size2 = size * size;

348

int alloc_size = ps->fft_alloc_size;

349

350

int half_size = ps->corr_size;

351

int N = 2 * half_size + 1;

352

int N2 = N * N;

353

354

dim3 output_block_dim(size, 1, 1);

355

dim3 output_grid_dim(size, 1, 1);

356

357

cufftReal *cudaRealPtr = ps->cuda_result_buffer + icp*alloc_size;

358

float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

359

float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;

360

float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;

361

362

vecCompute<<<output_grid_dim, output_block_dim>>>(

363

cudaResultPtr,

364

cudaRealPtr, 1./(size2 * (N2 - 1)),

365

cudaLSum, sum / (N2 * (N2 - 1)),

366

cudaDenom, denom,

367

size

368

);

369

return 0;

370

}

371

*/

372

373

static inline mxArray *fftCompute(TProcessingState *ps, int icp) {

374

int size = ps->fft_size;

375

int size2 = size * size;

376

int alloc_size = ps->fft_alloc_size;

377

float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

378

379

// fftComputeFragment(ps, icp, data, sum, denom);

380

381

mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

382

float *ar = (float*)mxGetPr(res);

383

384

cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

385

386

return res;

387

}

388

389

int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

289

static inline void fftGetPoints(TProcessingState *ps, mxArray *result) {

290

cudaMemcpy2D(

291

mxGetData(result), ps->ncp * sizeof(float),

292

ps->cuda_cp, ps->ncp_alloc_size * sizeof(float),

293

ps->ncp * sizeof(float), 2,

294

cudaMemcpyDeviceToHost

295

);

296

}

297

298

/*

299

static inline mxArray *fftSetPoints(TProcessingState *ps, mxArray *result) {

300

}

301

*/

302

303

static int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

390

304

int width = mxGetN(image);

391

305

int height = mxGetM(image);

392

306

458

372

}

459

373

460

374

461

int cp_blocks, side_blocks;

375

int cp_blocks1, cp_blocks, side_blocks;

462

376

if (ncp%CP_BLOCK_SIZE) cp_blocks = (ncp / CP_BLOCK_SIZE) + 1;

463

377

else cp_blocks = ncp / CP_BLOCK_SIZE;

464

378

465

379

if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;

466

380

else side_blocks = size / SIDE_BLOCK_SIZE;

467

381

468

469

int *stat_buf = (int*)ps->cuda_data_buffer;

470

471

float *sumbuf = ps->cuda_cp;// + 2*ps->ncp + icp;

472

float *stdbuf = ps->cuda_cp + CP_BLOCK;

382

int32_t *stat_buf = (int*)ps->cuda_data_buffer;

383

384

float *sumbuf = ps->cuda_cp + icp;

385

float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;

473

386

474

387

dim3 joint_block_dim(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

475

388

dim3 joint_grid_dim(side_blocks, cp_blocks, 1);

476

389

477

390

stat1<<<joint_grid_dim, joint_block_dim>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

478

391

479

if (ncp%BLOCK_SIZE_1D) cp_blocks = (ncp / BLOCK_SIZE_1D) + 1;

480

else cp_blocks = ncp / BLOCK_SIZE_1D;

481

482

stat2<<<cp_blocks, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

483

484

/*

485

float *cp = (float*)malloc(2*CP_BLOCK*sizeof(float));

486

cudaMemcpy(cp, ps->cuda_cp, 2*CP_BLOCK*sizeof(float), cudaMemcpyDeviceToHost);

487

488

float *sumbuf1 = cp;

489

float *stdbuf1 = cp + CP_BLOCK;

490

free(cp);

491

*/

392

if (ncp%BLOCK_SIZE_1D) cp_blocks1 = (ncp / BLOCK_SIZE_1D) + 1;

393

else cp_blocks1 = ncp / BLOCK_SIZE_1D;

394

395

stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

396

492

397

493

398

dim3 output_block_dim(fft_size, 1, 1);

494

399

dim3 output_grid_dim(fft_size, 1, 1);

495

400

496

497

401

for (int i = 0;i < ncp;i++) {

498

402

float *cudaDenom = ps->cuda_denom_buffer + (i+icp)*alloc_size;

499

403

float *cudaLSum = ps->cuda_lsum_buffer + (i+icp)*alloc_size;

510

414

}

511

415

512

416

417

float *xbuf = sumbuf;

418

float *ybuf = stdbuf;

419

420

int32_t *posbuf = (int*)ps->cuda_data_buffer;

421

float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

422

423

int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);

424

425

dim3 result_block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

426

dim3 result_grid_dim(fft_blocks, cp_blocks, 1);

427

find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);

428

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

429

430

513

431

return 0;

514

432

}

515

433

434

#ifdef VALIDATE_PEAK

435

static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {

436

int size = ps->fft_size;

437

int size2 = size * size;

438

int alloc_size = ps->fft_alloc_size;

439

float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

440

441

fftLoadFragment(ps, icp, 1, image);

442

443

mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

444

float *ar = (float*)mxGetPr(res);

445

446

cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

447

448

return res;

449

}

450

#endif /* VALIDATE_PEAK */

516

451

517

452

/*

518

453

static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {

557

492

#endif /* VALIDATE_LSUM */

558

493

559

494

const mxArray *x, *y;

560

561

double input_sum;

562

double input_denom;

495

mxArray *points;

563

496

564

497

unsigned int icp;

565

498

665

598

// reportMessage("Executing normxcorr_hw action: %u", action);

666

599

667

600

switch (action) {

601

#ifdef VALIDATE_PEAK

668

602

case ACTION_COMPUTE_FRAGMENT:

669

/*

670

if (nrhs != 6) {

671

reportError("This action expects 4 arguments, but %i is passed", nrhs - 2);

672

return;

673

}

674

675

if (nlhs != 1) {

676

reportError("This action expects single ouput, but %i is passed", nlhs);

677

}

678

*/

679

680

603

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

681

/* if (icp >= ps->ncp) {

682

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

683

return;

684

}

685

686

input = prhs[3];

687

688

if (mxGetNumberOfDimensions(input) != 2) {

689

reportError("Invalid dimensionality of input matrix, 2D matrix is expected");

690

return;

691

}

692

693

if (mxGetClassID(input) != mxUINT8_CLASS) {

694

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(input));

695

return;

696

}

697

698

input_sum = mxGetScalar(prhs[4]);

699

input_denom = mxGetScalar(prhs[5]);

700

*/

701

702

plhs[0] = fftCompute(ps, icp);

604

plhs[0] = fftCompute(ps, icp, prhs[3]);

703

605

break;

606

#endif /* VALIDATE_PEAK */

704

607

case ACTION_COMPUTE:

705

608

if (nrhs != 3) {

706

609

reportError("This action expects 1 argument, but %i is passed", nrhs - 2);

802

705

memcpy(ps->data_x, mxGetData(x), ps->ncp * sizeof(float));

803

706

memcpy(ps->data_y, mxGetData(y), ps->ncp * sizeof(float));

804

707

break;

708

case ACTION_GET_POINTS:

709

if (nrhs != 2) {

710

reportError("GetPoints action do not expect any arguments");

711

return;

712

}

713

if (nlhs != 1) {

714

reportError("GetPoints action returns a single matrix");

715

return;

716

}

717

718

points = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);

719

fftGetPoints(ps, points);

720

plhs[0] = points;

721

break;

805

722

case ACTION_SETUP:

806

723

if (nrhs != 4) {

807

724

reportError("SETUP action expects 'ncp' and 'corrsize' parameters");

813

730

iprop = (int)mxGetScalar(prhs[3]);

814

731

ps->corr_size = iprop;

815

732

ps->fft_size = 6 * iprop + 1;

816

ps->fft_size2 = ps->fft_size * ps->fft_size;

817

ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);

818

819

if (ps->fft_size % SIDE_BLOCK_SIZE) ps->side_alloc_size = (ps->fft_size / SIDE_BLOCK_SIZE + 1) * SIDE_BLOCK_SIZE;

820

else ps->side_alloc_size = ps->fft_size;

821

822

823

if (ps->fft_size2 % BLOCK_SIZE_1D) {

824

ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE_1D) + 1) * BLOCK_SIZE_1D;

825

} else {

826

ps->fft_alloc_size = ps->fft_size2;

827

}

828

829

733

ps->subimage_size = ps->corr_size * 4 + 1;

734

735

ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);

736

ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);

737

ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

738

830

739

ps->lsum_size = ps->corr_size * 2 + 1;

831

832

740

ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;

833

834

if (ps->fft_size % BLOCK_SIZE_2D) {

835

ps->lsum_short_aligned_size = ((ps->fft_size / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

836

} else {

837

ps->lsum_short_aligned_size = ps->fft_size;

838

}

839

840

if ((ps->lsum_temp_size) % BLOCK_SIZE_2D) {

841

ps->lsum_aligned_size = (((ps->lsum_temp_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

842

} else {

843

ps->lsum_aligned_size = ps->lsum_temp_size;

844

}

845

846

if ((ps->lsum_temp_size + ps->lsum_size) % BLOCK_SIZE_2D) {

847

ps->lsum_alloc_size = (((ps->lsum_temp_size + ps->lsum_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

848

} else {

849

ps->lsum_alloc_size = ps->lsum_temp_size + ps->lsum_size;

850

}

851

741

742

ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);

743

ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);

744

ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);

745

852

746

err = fftInit(ps);

853

747

854

748

if (nlhs == 1) {