/normxcorr/trunk : revision 19

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/normxcorr_hw.cu.h

Committer: Suren A. Chilingaryan
Date: 2009-12-12 01:38:41 UTC
Revision ID: csa@dside.dyndns.org-20091212013841-feih3qa4i28x75j4

Provide stand-alone library

files added:
dict_hw/CMakeLists.txt

dict_hw/cmake

dict_hw/cmake/CheckPythonModule.cmake

dict_hw/cmake/FindCUDA.cmake

dict_hw/cmake/FindCUDPP.cmake

dict_hw/cmake/FindFFTW3.cmake

dict_hw/cmake/FindGLIB2.cmake

dict_hw/cmake/FindMKL.cmake

dict_hw/cmake/MakePythonArray.cmake

dict_hw/cmake/ParseArguments.cmake

dict_hw/cmake/make2cmake.cmake

dict_hw/cmake/parse_cubin.cmake

dict_hw/cmake/run_nvcc.cmake

dict_hw/matlab

dict_hw/matlab/normxcorr_hw.cu

dict_hw/matlab/normxcorr_hw_msg.h

dict_hw/src

dict_hw/src/CMakeLists.txt

dict_hw/src/dict_hw.cu

dict_hw/src/dict_hw.h

dict_hw/src/helpers.h

dict_hw/src/validate.cpp

files removed:
cuda/local_sum.h

files renamed:
cuda/ => dict_hw/

cuda/INFO => dict_hw/README

cuda/Makefile => dict_hw/matlab/Makefile

cuda/nvmex => dict_hw/matlab/nvmex

cuda/nvopts.sh => dict_hw/matlab/nvopts.sh

cuda/local_sum.cu => dict_hw/src/local_sum.cu.h

cuda/local_sum_kernel.cu => dict_hw/src/local_sum_kernel.cu.h

cuda/normxcorr_hw.cu => dict_hw/src/normxcorr_hw.cu.h

cuda/normxcorr_hw.h => dict_hw/src/normxcorr_hw.h

cuda/normxcorr_hw_kernel.cu => dict_hw/src/normxcorr_hw_kernel.cu.h

cuda/normxcorr_hw_msg.h => dict_hw/src/normxcorr_hw_msg.h

files modified:
.bzrignore

automate_image.m

Show diffs side-by-side

added added

removed removed

dict_hw/src/normxcorr_hw.cu.h

#include <stdio.h>

#include <stdlib.h>

#include "normxcorr_hw.h"

#include "local_sum.h"

#include "normxcorr_hw_msg.h"

#include "normxcorr_hw_kernel.cu"

#define max4(a,b,c,d) max2(max2(a,b),max2(c,d))

#define max3(a,b,c) max2(max2(a,b),c)

#define max2(a,b) (((a)>(b))?(a):(b))

#define min2(a,b) (((a)<(b))?(a):(b))

#define calc_alloc(size,rounding) ((((size)/(rounding)) + (((size)%(rounding))?1:0))*(rounding))

#define calc_blocks(size,rounding) (((size)/(rounding)) + (((size)%(rounding))?1:0))

static const char debruijn[32] = {

0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,

31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9

};

static inline int next_power(int n) {

n--;

n |= n >> 1; // Divide by 2^k for consecutive doublings of k up to 32,

n |= n >> 2; // and then or the results.

n |= n >> 4;

n |= n >> 8;

n |= n >> 16;

n++; // The result is a number of 1 bits equal to the number

// of bits in the original number, plus 1. That's the

// next highest power of 2.

return n;

}

static TProcessingState *pstate = NULL;

#include "normxcorr_hw_kernel.cu.h"

static void fftFree(TProcessingState *ps) {

if (ps->coords) mxDestroyArray(ps->coords);

if (ps->banlist) free(ps->banlist);

if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);

if (ps->cuda_points) cudaFree(ps->cuda_points);

if (ps->points) cudaFreeHost(ps->points);

// DS: Source of bug, that occasionaly can corrupt something ...

if (ps->cudpp_initialized) {

cudppDestroyPlan(ps->cudpp_plan);

}

#include <unistd.h>

static int fftInit(TProcessingState *ps) {

CUDPPConfiguration cudpp_config;

cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);

if (cufft_err) {

reportError("Problem initializing c2r plan, cufft code: %i", cufft_err);

return ERROR_CUFFT;

return DICT_ERROR_CUFFT;

}

cufft_err = cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_C2R);

if (cufft_err) {

reportError("Problem initializing r2c plan, cufft code: %i", cufft_err);

cufftDestroy(ps->cufft_r2c_plan);

return ERROR_CUFFT;

return DICT_ERROR_CUFFT;

}

ps->fft_initialized = true;

106

if (cudpp_err != CUDPP_SUCCESS) {

107

reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);

108

fftFree(ps);

109

return ERROR_CUDPP;

return DICT_ERROR_CUDPP;

110

}

111

112

ps->cudpp_initialized = true;

115

if (cuda_err) {

116

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);

117

fftFree(ps);

118

return ERROR_CUDA_MALLOC;

return DICT_ERROR_CUDA_MALLOC;

119

}

120

121

129

if (cuda_err) {

130

reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);

131

fftFree(ps);

132

return ERROR_CUDA_MALLOC;

return DICT_ERROR_CUDA_MALLOC;

133

}

134

135

ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));

136

100

if (!ps->banlist) {

137

101

reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);

138

102

fftFree(ps);

139

return ERROR_MALLOC;

103

return DICT_ERROR_MALLOC;

140

104

}

141

105

memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));

142

106

144

108

if (cuda_err) {

145

109

reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);

146

110

fftFree(ps);

147

return ERROR_CUDA_MALLOC;

111

return DICT_ERROR_CUDA_MALLOC;

148

112

}

149

113

150

114

cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));

151

115

if (cuda_err) {

152

116

reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);

153

117

fftFree(ps);

154

return ERROR_CUDA_MALLOC;

118

return DICT_ERROR_CUDA_MALLOC;

155

119

}

156

120

157

121

cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));

158

122

if (cuda_err) {

159

123

reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);

160

124

fftFree(ps);

161

return ERROR_CUDA_MALLOC;

125

return DICT_ERROR_CUDA_MALLOC;

162

126

}

163

127

164

128

cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);

165

129

if (cuda_err) {

166

130

reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);

167

131

fftFree(ps);

168

return ERROR_CUDA_MALLOC;

132

return DICT_ERROR_CUDA_MALLOC;

169

133

}

170

134

171

135

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

172

136

if (cuda_err) {

173

137

reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);

174

138

fftFree(ps);

175

return ERROR_CUDA_MALLOC;

139

return DICT_ERROR_CUDA_MALLOC;

176

140

}

177

141

cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

178

142

180

144

if (cuda_err) {

181

145

reportError("Device memory allocation of %u*%u*cufftReal bytes for cuda_data_buffer is failed", CP_BLOCK, ps->fft_alloc_size);

182

146

fftFree(ps);

183

return ERROR_CUDA_MALLOC;

147

return DICT_ERROR_CUDA_MALLOC;

184

148

}

185

149

cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

186

150

188

152

if (cuda_err) {

189

153

reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);

190

154

fftFree(ps);

191

return ERROR_CUDA_MALLOC;

155

return DICT_ERROR_CUDA_MALLOC;

192

156

}

193

157

194

158

cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

195

159

if (cuda_err) {

196

160

reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);

197

161

fftFree(ps);

198

return ERROR_CUDA_MALLOC;

162

return DICT_ERROR_CUDA_MALLOC;

199

163

}

200

164

201

165

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2 * sizeof(float));

202

166

if (cuda_err) {

203

167

reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);

204

168

fftFree(ps);

205

return ERROR_MALLOC;

169

return DICT_ERROR_MALLOC;

206

170

}

207

171

// We need to zero temporary buffers as well, since we are not computing

208

172

// cumsum of complete matrix, but non-zero part of it

209

173

cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));

210

211

ps->coords = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);

212

if (ps->coords) {

213

mexMakeArrayPersistent(ps->coords);

214

} else {

215

reportError("Allocation of Matlab matrix of size %u*float bytes is failed", ps->ncp);

216

fftFree(ps);

217

return ERROR_MALLOC;

218

}

219

220

//mexMakeMemoryPersistent(ps->coords);

221

//mexLock() mexUnlock()

222

174

223

175

return 0;

224

176

}

225

177

226

static void fftPrepare(TProcessingState *ps) {

227

}

228

229

230

178

231

179

void pstateFree(TProcessingState *ps) {

232

180

if (ps) {

240

188

241

189

ps = (TProcessingState*)malloc(sizeof(TProcessingState));

242

190

if (ps) memset(ps, 0, sizeof(TProcessingState));

191

243

192

return ps;

244

193

}

245

194

246

static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

247

int width = mxGetN(image);

248

int height = mxGetM(image);

195

static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {

196

int width = ps->width;

197

int height = ps->height;

249

198

250

199

int check_mode = ((ps->base_mode)&&(!ps->mode));

251

200

float minx, miny, maxx, maxy;

270

219

float *frac_x = ps->points + 4 * ncp_alloc + icp;

271

220

float *frac_y = frac_x + ncp_alloc;

272

221

273

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

274

222

uint8_t *img = ps->input_buffer;

275

223

276

224

float *lsum_temp = (float*)ps->cuda_lsum_temp;

386

334

cudaStreamDestroy(stream[i]);

387

335

}

388

336

389

390

391

337

if (check_mode) {

392

338

ps->minx = minx;

393

339

ps->maxx = maxx;

399

345

return 0;

400

346

}

401

347

402

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

403

// return 0;

404

405

int width = mxGetN(image);

406

int height = mxGetM(image);

348

349

static inline int fftCopyFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {

350

int width = ps->width;

351

int height = ps->height;

407

352

408

353

int half_size = ps->corr_size;

409

354

int size = 2 * half_size + 1;

410

355

int size2 = size * size;

411

412

int fft_size = ps->fft_size;

413

int fft_real_size = ps->fft_real_size;

414

415

356

int ncp_alloc = ps->ncp_alloc_size;

416

int alloc_size = ps->fft_alloc_size;

417

int side_alloc = ps->side_alloc_size;

418

int side_alloc2 = side_alloc * side_alloc;

419

357

420

358

float *data_x, *data_y;

421

359

if (ps->stored) {

422

data_x = ((float*)mxGetData(ps->coords)) + icp;

423

data_y = data_x + ps->ncp;

360

data_x = ps->res_x + icp;

361

data_y = ps->res_y + icp;

424

362

} else {

425

363

data_x = ps->points + 2 * ncp_alloc + icp;

426

364

data_y = data_x + ncp_alloc;

427

365

}

428

366

429

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

430

431

367

uint8_t *img = ps->input_buffer;

432

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

433

float *cuda_data_buffer = ps->cuda_data_buffer;

434

435

368

uint8_t *banlist = ps->banlist + icp;

436

369

437

370

for (int i = 0;i < ncp;i++) {

458

391

size,

459

392

cudaMemcpyHostToHost

460

393

);

394

}

395

return 0;

396

}

397

398

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {

399

int half_size = ps->corr_size;

400

int size = 2 * half_size + 1;

401

402

int side_alloc = ps->side_alloc_size;

403

404

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

405

uint8_t *img = ps->input_buffer;

406

461

407

408

for (int i = 0;i < ncp;i++) {

462

409

cudaMemcpy2D(

463

410

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

464

411

img + i * size2, size * sizeof(uint8_t),

465

412

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

466

413

);

414

}

467

415

468

}

469

416

470

417

cudaMemcpy3DParms copy_params = { 0 };

418

471

419

copy_params.dstPtr = make_cudaPitchedPtr(

472

420

cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc

473

421

);

476

424

);

477

425

copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);

478

426

copy_params.kind = cudaMemcpyHostToDevice;

479

cudaMemcpy3D(&copy_params);

480

481

482

dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

483

dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

427

428

cudaMemcpy3DAsync(&copy_params, stream0);

429

430

return 0;

431

}

432

433

static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

434

static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

435

436

static inline int fftPreprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

437

int half_size = ps->corr_size;

438

int size = 2 * half_size + 1;

439

440

int fft_real_size = ps->fft_real_size;

441

442

int ncp_alloc = ps->ncp_alloc_size;

443

int alloc_size = ps->fft_alloc_size;

444

int side_alloc = ps->side_alloc_size;

445

int side_alloc2 = side_alloc * side_alloc;

446

447

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

448

float *cuda_data_buffer = ps->cuda_data_buffer;

484

449

485

450

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

486

451

int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);

487

452

int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);

488

453

int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;

489

int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);

490

491

// Computing sum and std

454

455

float *sumbuf = ps->cuda_points + icp;

456

float *stdbuf = ps->cuda_points + ncp_alloc + icp;

457

492

458

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

493

459

494

float *sumbuf = ps->cuda_points + icp;

495

float *stdbuf = ps->cuda_points + ps->ncp_alloc_size + icp;

496

497

460

dim3 stat_grid_dim(side_blocks, cp_blocks, 1);

498

stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

499

stat2<<<cp_blocks1, BLOCK_SIZE_1D>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

461

stat1<<<stat_grid_dim, block_side_cp, 0, stream0>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, cuda_input_buffer, side_alloc2, side_alloc, size);

462

stat2<<<cp_blocks1, BLOCK_SIZE_1D, 0, stream0>>>(sumbuf, stdbuf, stat_buf, stat_buf + side_alloc * CP_BLOCK, size);

500

463

501

464

// Packing input data for FFT

502

465

dim3 input_grid_dim(input_blocks, cp_blocks, 1);

503

466

504

467

if (ps->side_blocks_power < 0) {

505

vecPack<<<input_grid_dim, block_side_cp>>>(

468

vecPack<<<input_grid_dim, block_side_cp, 0, stream0>>>(

506

469

cuda_input_buffer, side_alloc2, side_alloc,

507

470

cuda_data_buffer, alloc_size, fft_real_size,

508

471

size, side_blocks

509

472

);

510

473

} else {

511

vecPackFast<<<input_grid_dim, block_side_cp>>>(

474

vecPackFast<<<input_grid_dim, block_side_cp, 0, stream0>>>(

512

475

cuda_input_buffer, side_alloc2, side_alloc,

513

476

cuda_data_buffer, alloc_size, fft_real_size,

514

477

size, ps->side_blocks_power

515

478

);

516

479

}

517

518

// Performing FFT's

519

cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

520

521

for (int i = 0;i < ncp;i++) {

522

if (banlist[i]) continue;

523

cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);

524

}

525

526

int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);

527

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

528

vecMul<<<complex_grid_dim,block_side_cp>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);

529

530

// First in-place transform for some reason is failing, therefore we

531

// have one alloc_size spacing between starts

480

481

return 0;

482

}

483

484

static inline int fftPostprocessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

485

int half_size = ps->corr_size;

486

int size = 2 * half_size + 1;

487

int size2 = size * size;

488

489

int fft_size = ps->fft_size;

490

int fft_real_size = ps->fft_real_size;

491

492

int ncp_alloc = ps->ncp_alloc_size;

493

int alloc_size = ps->fft_alloc_size;

494

int side_alloc = ps->side_alloc_size;

495

496

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

497

int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);

498

int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);

499

532

500

cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;

533

for (int i = 0;i < ncp;i++) {

534

if (banlist[i]) continue;

535

cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);

536

}

537

538

501

float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

502

503

float *sumbuf = ps->cuda_points + icp;

504

float *stdbuf = ps->cuda_points + ncp_alloc + icp;

539

505

540

506

// Use real size everthere

541

507

// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);

542

// vecCompute<<<compute_grid_dim, block_side_cp>>>(

508

// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(

543

509

// cuda_final_buffer,

544

510

// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),

545

511

// ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

547

513

// alloc_size

548

514

// );

549

515

516

550

517

int fft2_blocks = fft_blocks * fft_blocks * SIDE_BLOCK_SIZE;

551

518

dim3 compute_grid_dim(fft2_blocks, cp_blocks, 1);

552

vecCompute<<<compute_grid_dim, block_side_cp>>>(

519

520

vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(

553

521

cuda_final_buffer, fft_size,

554

522

cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),

555

523

ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

556

524

ps->cuda_denom_cache + icp*alloc_size, stdbuf,

557

525

alloc_size, fft_blocks

558

526

);

527

559

528

560

529

// Looking for maximum

561

530

float *xbuf = sumbuf;

569

538

// Use real size everthere

570

539

// find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size);

571

540

// find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_real_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

572

find_max1<<<result_grid_dim, block_side_cp>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

573

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

574

575

return 0;

576

}

577

578

static inline mxArray *fftGetPoints(TProcessingState *ps) {

541

542

find_max1<<<result_grid_dim, block_side_cp,0,stream0>>>(maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size);

543

find_max2<<<cp_blocks1, BLOCK_SIZE_1D,0,stream0>>>(xbuf, ybuf, maxbuf, posbuf, cuda_final_buffer, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

544

545

return 0;

546

}

547

548

static inline int fftProcessFragment(TProcessingState *ps, int icp, int ncp, cudaStream_t stream0) {

549

int fft_real_size = ps->fft_real_size;

550

551

int alloc_size = ps->fft_alloc_size;

552

553

uint8_t *banlist = ps->banlist + icp;

554

float *cuda_data_buffer = ps->cuda_data_buffer;

555

556

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

557

558

// Performing FFT's

559

cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

560

561

cufftSetStream(ps->cufft_r2c_plan, stream0);

562

cufftSetStream(ps->cufft_c2r_plan, stream0);

563

564

for (int i = 0;i < ncp;i++) {

565

if (banlist[i]) continue;

566

cufftExecR2C(ps->cufft_r2c_plan, cuda_data_buffer + i * alloc_size, cuda_fft_buffer + i * alloc_size);

567

}

568

569

int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);

570

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

571

vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);

572

573

// First in-place transform for some reason is failing, therefore we

574

// have one alloc_size spacing between starts (see cuda_fft_buffer set above)

575

cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;

576

for (int i = 0;i < ncp;i++) {

577

if (banlist[i]) continue;

578

cufftExecC2R(ps->cufft_c2r_plan, cuda_fft_buffer + i * alloc_size, cuda_result_buffer + i * alloc_size);

579

}

580

581

return 0;

582

}

583

584

585

static inline int fftGetCurrentPoints(DICTContext ps) {

586

int ncp = ps->ncp;

587

int ncp_alloc = ps->ncp_alloc_size;

588

int precision = ps->precision;

589

590

float *move_x = ps->points + 6 * ncp_alloc;

591

float *move_y = move_x + ncp_alloc;

592

593

cudaMemcpy2D(

594

move_x, ncp_alloc * sizeof(float),

595

ps->cuda_points, ncp_alloc * sizeof(float),

596

ps->ncp * sizeof(float), 2,

597

cudaMemcpyDeviceToHost

598

);

599

600

float *data_x, *data_y;

601

if (ps->stored) {

602

data_x = ps->res_x;

603

data_y = ps->res_y;

604

} else {

605

data_x = ps->points + 2 * ncp_alloc;

606

data_y = data_x + ncp_alloc;

607

}

608

609

float *res_x, *res_y;

610

if ((ps->res_x)&&(ps->res_y)) {

611

res_x = ps->res_x;

612

res_y = ps->res_y;

613

614

ps->stored = 1;

615

} else {

616

res_x = data_x;

617

res_y = data_y;

618

}

619

579

620

float frac;

580

581

int ncp = ps->ncp;

582

int ncp_alloc = ps->ncp_alloc_size;

583

int precision = ps->precision;

584

621

float *frac_x = ps->points + 4 * ncp_alloc;

622

float *frac_y = frac_x + ncp_alloc;

585

623

uint8_t *banlist = ps->banlist;

586

624

587

float *res_x = (float*)mxGetData(ps->coords);

588

float *res_y = res_x + ncp;

589

590

float *data_x, *data_y;

591

if (ps->stored) {

592

data_x = res_x;

593

data_y = res_y;

594

} else {

595

data_x = ps->points + 2 * ncp_alloc;

596

data_y = data_x + ncp_alloc;

597

}

598

599

600

float *frac_x = ps->points + 4 * ncp_alloc;

601

float *frac_y = frac_x + ncp_alloc;

602

603

float *move_x = ps->points + 6 * ncp_alloc;

604

float *move_y = move_x + ncp_alloc;

605

606

cudaMemcpy2D(

607

move_x, ncp_alloc * sizeof(float),

608

ps->cuda_points, ncp_alloc * sizeof(float),

609

ps->ncp * sizeof(float), 2,

610

cudaMemcpyDeviceToHost

611

);

612

613

625

for (int i = 0;i < ncp;i++) {

614

if (banlist[i]) {

615

res_x[i] = data_x[i];

616

res_y[i] = data_y[i];

617

continue;

618

}

619

620

frac = data_x[i] - round(data_x[i]*precision)/precision;

621

res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

622

623

frac = data_y[i] - round(data_y[i]*precision)/precision;

624

res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);

625

}

626

627

ps->stored = 1;

628

629

#ifdef USE_UNDOCUMENTED

630

return mxCreateSharedDataCopy(ps->coords);

631

// mxArray *mxCreateSharedDataCopy(const mxArray *pr);

632

// bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy); // true if not successful

633

// mxArray *mxUnreference(const mxArray *pr);

634

#else /* USE_UNDOCUMENTED */

635

return mxDuplicateArray(ps->coords);

636

#endif /* USE_UNDOCUMENTED */

637

}

638

639

640

#ifdef VALIDATE_LSUM

641

static inline int fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {

642

uint8_t *dataPtr;

643

644

if (!ps->fft_initialized) {

645

reportError("cuFFT engine is not initialized yet");

646

return NULL;

647

}

648

649

int size = ps->fft_size;

650

int alloc_size = ps->fft_alloc_size;

651

652

int N = mxGetM(data);

653

int N2 = N * N;

654

655

int side_alloc = ps->side_alloc_size;

656

int side_alloc2 = side_alloc * side_alloc;

657

658

dim3 input_block_dim(N, 1, 1);

659

dim3 input_grid_dim(N, 1, 1);

660

661

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;

662

cufftReal *cudaRealPtr = ps->cuda_base_buffer;

663

664

dataPtr = (uint8_t*)mxGetData(data);

665

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

666

667

float *lsum_temp = ps->cuda_lsum_temp;

668

int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

669

670

cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));

671

cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

672

673

vecPackBase<<<input_grid_dim, input_block_dim>>>(

674

cudaInputPtr, N,

675

cudaRealPtr, size,

676

lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size

677

);

678

679

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

680

local_sum(ps,

681

ps->cuda_lsum_cache + icp * alloc_size, ps->cuda_denom_cache + icp * alloc_size,

682

lsum_temp + (2 * step), lsum_temp + (3 * step),

683

lsum_temp, lsum_temp + step);

684

685

686

We don't really want to compute here

687

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_cache + icp * alloc_size);

688

626

if (banlist[i]) {

627

res_x[i] = data_x[i];

628

res_y[i] = data_y[i];

629

continue;

630

}

631

632

frac = data_x[i] - round(data_x[i]*precision)/precision;

633

res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

634

635

frac = data_y[i] - round(data_y[i]*precision)/precision;

636

res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);

637

}

689

638

690

639

return 0;

691

640

}

692

#endif /* VALIDATE_LSUM */

693

694

#ifdef VALIDATE_PEAK

695

static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {

696

int size = ps->fft_size;

697

int size2 = size * size;

698

int alloc_size = ps->fft_alloc_size;

699

700

fftLoadFragment(ps, icp, 1, image);

701

702

mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

703

704

float *ar = (float*)mxGetPr(res);

705

706

cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;

707

float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * alloc_size;

708

cudaMemcpy(ar, cuda_final_buffer, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

709

710

return res;

711

}

712

713

static inline mxArray *fftGetCorrections(TProcessingState *ps) {

714

int ncp = ps->ncp;

715

int ncp_alloc = ps->ncp_alloc_size;

716

717

float *move_x = ps->points + 6 * ncp_alloc;

718

float *move_y = move_x + ncp_alloc;

719

720

cudaMemcpy2D(

721

move_x, ncp_alloc * sizeof(float),

722

ps->cuda_points, ncp_alloc * sizeof(float),

723

ps->ncp * sizeof(float), 2,

724

cudaMemcpyDeviceToHost

725

);

726

727

mxArray *res = mxCreateNumericMatrix(ncp, 2, mxSINGLE_CLASS, mxREAL);

728

float *res_x = (float*)mxGetData(res);

729

float *res_y = res_x + ncp;

730

731

memcpy(res_x, move_x, ncp * sizeof(float));

732

memcpy(res_y, move_y, ncp * sizeof(float));

733

734

return res;

735

}

736

#endif /* VALIDATE_PEAK */

737

738

739

static void selfClean() {

740

if (pstate) {

741

reportMessage("Self-cleaning normxcorr_hw instance");

742

743

pstateFree(pstate);

744

pstate = NULL;

745

}

746

}

747

748

749

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

750

int err;

751

int64_t *errPtr;

752

753

int deviceCount;

754

cudaDeviceProp deviceProp;

755

756

mxArray *idMatrix;

757

int32_t id, *idPtr;

758

759

TProcessingState *ps;

760

761

TAction action;

762

763

int iprop;

764

765

const mxArray *input;

766

const mxArray *base;

767

768

#ifdef VALIDATE_LSUM

769

const mxArray *lsum;

770

const mxArray *denom;

771

const mxArray *nonzero;

772

#endif /* VALIDATE_LSUM */

773

774

const mxArray *x, *y;

775

776

unsigned int icp;

777

778

int optimize;

779

int width, height;

780

int size, size2;

781

int base_size, base_size2;

782

int base_blocks, side_blocks;

783

784

if (!nrhs) {

785

reportMessage("Initializing normxcorr_hw instance");

786

787

if (nlhs != 1) {

788

reportError("You should accept a single result from initialization call");

789

return;

790

}

791

792

if (pstate) {

793

reportError("Only a single calculation process is supported at the moment");

794

return;

795

}

796

797

// Initialising, for now a single client is supported only

798

799

idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);

800

if (!idMatrix) {

801

reportError("Initialization is failed");

802

return;

803

}

804

805

806

// Detecting cuda devices

807

808

cudaGetDeviceCount(&deviceCount);

809

if (deviceCount) {

810

cudaGetDeviceProperties(&deviceProp, 0);

811

if ((deviceProp.major > 1)||((deviceProp.major == 1)&&(deviceProp.minor > 2))) {

812

id = 1;

813

} else { // Hardware capabilities are bellow 1.3

814

id = 0;

815

}

816

} else { // No cuda device, using software

817

id = -1;

818

}

819

820

821

if (id > 0) {

822

pstate = pstateInit();

823

if (!pstate) {

824

mxDestroyArray(idMatrix);

825

reportError("State structure initialization is failed");

826

return;

827

}

828

} else {

829

pstate = NULL;

830

}

831

832

833

idPtr = (int32_t*)mxGetData(idMatrix);

834

idPtr[0] = id;

835

836

plhs[0] = idMatrix;

837

838

mexAtExit(selfClean);

839

840

return;

841

} else {

842

if (!pstate) {

843

reportError("Normxcorr_hw should be initialized first");

844

return;

845

}

846

847

848

idMatrix = (mxArray*)prhs[0];

849

if ((mxGetClassID(idMatrix) != mxINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {

850

reportError("Invalid parameter is supplied in place of process identificator");

851

return;

852

}

853

854

idPtr = (int32_t*)mxGetData(idMatrix);

855

if (!idPtr) {

856

reportError("Mex is not able to obtain process identificator");

857

return;

858

}

859

860

id = *idPtr;

861

if (id != 1) {

862

reportError("Invalid process identificator is supplied");

863

return;

864

}

865

866

if (!pstate) {

867

reportError("The interface is not initialized");

868

return;

869

}

870

871

}

872

873

874

// Clean request

875

if (nrhs == 1) {

876

reportMessage("Cleaning normxcorr_hw instance");

877

878

pstateFree(pstate);

879

pstate = NULL;

880

881

return;

882

}

883

884

885

ps = pstate;

886

887

action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));

888

889

// reportMessage("Executing normxcorr_hw action: %u", action);

890

891

switch (action) {

892

#ifdef VALIDATE_PEAK

893

case ACTION_COMPUTE_FRAGMENT:

894

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

895

plhs[0] = fftCompute(ps, icp, prhs[3]);

896

//fftGetCorrections(TProcessingState *ps, mxArray *result)

897

break;

898

case ACTION_GET_CORRECTIONS:

899

plhs[0] = fftGetCorrections(ps);

900

break;

901

#endif /* VALIDATE_PEAK */

902

#ifdef VALIDATE_LSUM

903

case ACTION_COMPUTE_BASE_FRAGMENT:

904

if ((nrhs != 4)&&(nrhs != 7)) {

905

reportError("ComputeBaseFragment action expects 2 arguments, but %i is passed", nrhs - 2);

906

return;

907

}

908

909

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

910

if (icp >= ps->ncp) {

911

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

912

return;

913

}

914

915

base = prhs[3];

916

917

if (mxGetNumberOfDimensions(base) != 2) {

918

reportError("Invalid dimensionality of base matrix, 2D matrix is expected");

919

return;

920

}

921

922

if (mxGetClassID(base) != mxUINT8_CLASS) {

923

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));

924

return;

925

}

926

927

if (nrhs == 7) {

928

iprop = ps->fft_size;

929

930

lsum = prhs[4];

931

denom = prhs[5];

932

nonzero = prhs[6];

933

if (

934

(mxGetNumberOfDimensions(lsum) != 2)||

935

(mxGetNumberOfDimensions(denom) != 2)||

936

(mxGetClassID(lsum) != mxSINGLE_CLASS)||

937

(mxGetClassID(denom) != mxSINGLE_CLASS)||

938

(mxGetClassID(nonzero) != mxUINT16_CLASS)||

939

(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||

940

(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)

941

942

) {

943

reportError("Invalid properties for base initialization are specified");

944

return;

945

}

946

} else {

947

lsum = NULL;

948

denom = NULL;

949

nonzero = NULL;

950

}

951

952

fftUploadBaseData(ps, icp, base);

953

local_sum_validate(ps, icp, lsum, denom);

954

break;

955

#endif /* VALIDATE_LSUM */

956

case ACTION_COMPUTE:

957

if (nrhs != 3) {

958

reportError("Compute action expects 1 argument, but %i is passed", nrhs - 2);

959

return;

960

}

961

962

input = prhs[2];

963

964

if (mxGetClassID(input) != mxUINT8_CLASS) {

965

reportError("Invalid type of image data, should be 8bit integers");

966

return;

967

}

968

969

for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {

970

err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);

971

if (err) break;

972

}

973

974

break;

975

case ACTION_COMPUTE_BASE:

976

if (nrhs != 3) {

977

reportError("ComputeBase action expects 1 argument, but %i is passed", nrhs - 2);

978

return;

979

}

980

981

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

982

if (icp >= ps->ncp) {

983

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

984

return;

985

}

986

987

base = prhs[2];

988

989

if (mxGetNumberOfDimensions(base) != 2) {

990

reportError("Invalid dimensionality of base matrix, 2D matrix is expected");

991

return;

992

}

993

994

if (mxGetClassID(base) != mxUINT8_CLASS) {

995

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));

996

return;

997

}

998

999

width = mxGetN(base);

1000

height = mxGetM(base);

1001

1002

size = 2 * ps->corr_size + 1;

1003

size2 = size * size;

1004

1005

base_size = 4 * ps->corr_size + 1;

1006

base_size2 = base_size * base_size;

1007

1008

if (width * height > ps->ncp * size2) {

1009

ps->mode = 0;

1010

} else {

1011

ps->mode = 1;

1012

}

1013

1014

// if not enoguh space for caching enable anyway ?

1015

if (width * height > ps->ncp * base_size2) {

1016

ps->base_mode = 0;

1017

} else {

1018

ps->base_mode = 1;

1019

if (!ps->mode) {

1020

ps->minx = 0;

1021

ps->maxx = width - 1;

1022

ps->miny = 0;

1023

ps->maxy = height - 1;

1024

}

1025

}

1026

1027

for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {

1028

err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), base);

1029

if (err) break;

1030

}

1031

1032

if ((ps->base_mode)&&(!ps->mode)) {

1033

// printf("%ux%u\n", width, height);

1034

1035

// Correcting difference of area size between base and data images

1036

ps->minx += ps->corr_size;

1037

ps->miny += ps->corr_size;

1038

ps->maxx -= ps->corr_size;

1039

ps->maxy -= ps->corr_size;

1040

1041

width = ceil(ps->maxx) - floor(ps->minx);

1042

height = ceil(ps->maxy) - floor(ps->miny);

1043

1044

// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);

1045

if (width * height < ps->ncp * size2) {

1046

ps->mode = 1;

1047

}

1048

}

1049

1050

if (ps->mode) {

1051

reportMessage("Running in the image mode");

1052

} else {

1053

reportMessage("Running in the fragment mode");

1054

}

1055

break;

1056

case ACTION_SET_BASE_POINTS:

1057

if (nrhs != 4) {

1058

reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");

1059

return;

1060

}

1061

1062

x = prhs[2];

1063

y = prhs[3];

1064

1065

if ( (mxGetClassID(x) != mxSINGLE_CLASS)||

1066

(mxGetClassID(y) != mxSINGLE_CLASS)||

1067

(mxGetN(x)*mxGetM(x) != ps->ncp)||

1068

(mxGetN(y)*mxGetM(y) != ps->ncp)

1069

) {

1070

reportError("Invalid control points are specified");

1071

return;

1072

}

1073

1074

memcpy(ps->points, mxGetData(x), ps->ncp * sizeof(float));

1075

memcpy(ps->points + ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));

1076

break;

1077

case ACTION_SET_POINTS:

1078

if (nrhs != 4) {

1079

reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");

1080

return;

1081

}

1082

1083

x = prhs[2];

1084

y = prhs[3];

1085

1086

if ( (mxGetClassID(x) != mxSINGLE_CLASS)||

1087

(mxGetClassID(y) != mxSINGLE_CLASS)||

1088

(mxGetN(x)*mxGetM(x) != ps->ncp)||

1089

(mxGetN(y)*mxGetM(y) != ps->ncp)

1090

) {

1091

reportError("Invalid control points are specified");

1092

return;

1093

}

1094

1095

memcpy(ps->points + 2 * ps->ncp_alloc_size, mxGetData(x), ps->ncp * sizeof(float));

1096

memcpy(ps->points + 3 * ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));

1097

1098

ps->stored = 0;

1099

break;

1100

case ACTION_GET_POINTS:

1101

if (nrhs != 2) {

1102

reportError("GetPoints action do not expect any arguments");

1103

return;

1104

}

1105

if (nlhs != 1) {

1106

reportError("GetPoints action returns a single matrix");

1107

return;

1108

}

1109

1110

plhs[0] = fftGetPoints(ps);

1111

break;

1112

case ACTION_SETUP:

1113

if (nrhs != 6) {

1114

reportError("SETUP action expects 'ncp', 'corrsize', 'precision', and 'optimization level' parameters");

1115

return;

1116

}

1117

1118

fftFree(ps);

1119

1120

ps->ncp = (int)mxGetScalar(prhs[2]);

1121

1122

ps->precision = (int)mxGetScalar(prhs[4]);

1123

1124

optimize = (int)mxGetScalar(prhs[5]);

1125

1126

iprop = (int)mxGetScalar(prhs[3]);

1127

ps->corr_size = iprop;

1128

ps->subimage_size = ps->corr_size * 4 + 1;

1129

ps->fft_size = 6 * iprop + 1;

1130

1131

if (optimize > 3) {

1132

ps->fft_real_size = next_power(ps->fft_size);

1133

} else {

1134

ps->fft_real_size = ps->fft_size;

1135

}

1136

1137

ps->ncp_alloc_size = calc_alloc(ps->ncp, CP_BLOCK);

1138

ps->side_alloc_size = calc_alloc(ps->fft_size, SIDE_BLOCK_SIZE);

1139

1140

// ps->fft_alloc_size = calc_alloc(ps->fft_size * ps->fft_size, BLOCK_SIZE_1D);

1141

ps->fft_alloc_size = calc_alloc(ps->fft_real_size * ps->fft_real_size, BLOCK_SIZE_1D);

1142

1143

ps->lsum_size = ps->corr_size * 2 + 1;

1144

ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;

1145

1146

ps->lsum_short_aligned_size = calc_alloc(ps->fft_size, BLOCK_SIZE_2D);

1147

ps->lsum_aligned_size = calc_alloc(ps->lsum_temp_size, BLOCK_SIZE_2D);

1148

ps->lsum_alloc_size = calc_alloc(ps->lsum_temp_size + ps->lsum_size, BLOCK_SIZE_2D);

1149

1150

err = fftInit(ps);

1151

1152

if (nlhs == 1) {

1153

idMatrix = mxCreateNumericMatrix(1, 1, mxINT64_CLASS, mxREAL);

1154

if (idMatrix) {

1155

errPtr = (int64_t*)mxGetData(idMatrix);

1156

errPtr[0] = err;

1157

plhs[0] = idMatrix;

1158

} else {

1159

reportError("Initialization of result matrix is failed");

1160

return;

1161

}

1162

}

1163

1164

side_blocks = calc_blocks(2 * ps->corr_size + 1, SIDE_BLOCK_SIZE);

1165

if ((side_blocks&(side_blocks-1))) {

1166

ps->side_blocks_power = -1;

1167

} else {

1168

ps->side_blocks_power = debruijn[((uint32_t)side_blocks * 0x077CB531) >> 27];

1169

}

1170

1171

base_blocks = calc_blocks(4 * ps->corr_size + 1, BLOCK_SIZE_1D);

1172

if ((base_blocks&(base_blocks-1))) {

1173

ps->base_blocks_power = -1;

1174

} else {

1175

ps->base_blocks_power = debruijn[((uint32_t)base_blocks * 0x077CB531) >> 27];

1176

}

1177

break;

1178

case ACTION_PREPARE:

1179

fftPrepare(ps);

1180

break;

1181

default:

1182

reportError("Unknown request %i", action);

1183

}

1184

}

Older »