/normxcorr/trunk : revision 8

2

#include <stdio.h>

3

#include <stdlib.h>

4

5

#include <mex.h>

6

7

5

#include "normxcorr_hw.h"

8

6

#include "local_sum.h"

9

7

19

17

static TProcessingState *pstate = NULL;

20

18

21

19

static void fftFree(TProcessingState *ps) {

22

if (ps->cuda_base_buffer) {

20

if (ps->cuda_fft_buffer) {

21

if (ps->coords) mxDestroyArray(ps->coords);

22

23

if (ps->banlist) free(ps->banlist);

24

23

25

cudaFree(ps->cuda_lsum_temp);

24

26

25

27

cudaFree(ps->cuda_lsum_buffer);

26

28

cudaFree(ps->cuda_denom_buffer);

29

cudaFree(ps->cuda_fft_buffer);

27

30

28

cudaFree(ps->cuda_temp_buffer);

31

cudaFree(ps->cuda_data_buffer);

32

cudaFree(ps->cuda_base_buffer);

33

29

34

cudaFree(ps->cuda_final_buffer);

30

35

cudaFree(ps->cuda_result_buffer);

31

cudaFree(ps->cuda_data_buffer);

32

cudaFree(ps->cuda_base_buffer);

36

cudaFree(ps->cuda_temp_buffer);

33

37

cudaFree(ps->cuda_input_buffer);

34

38

cudaFreeHost(ps->input_buffer);

35

39

36

40

cudaFree(ps->cuda_cp);

37

41

38

cudaFreeHost(ps->data_x);

39

cudaFreeHost(ps->data_y);

40

41

ps->cuda_base_buffer = NULL;

42

cudaFreeHost(ps->points);

42

43

}

43

44

45

// DS: Source of bug, that occasionaly can corrupt something ...

45

46

if (ps->cudpp_initialized) {

46

47

cudppDestroyPlan(ps->cudpp_plan);

47

ps->cudpp_initialized = false;

48

}

49

50

if (ps->fft_initialized) {

51

cufftDestroy(ps->cufft_r2c_plan);

52

cufftDestroy(ps->cufft_c2r_plan);

53

// cufftDestroy(ps->cufft_plan);

54

// cublasShutdown();

55

ps->fft_initialized = false;

56

53

}

54

55

memset(ps, 0, sizeof(TProcessingState));

57

56

58

57

}

59

58

69

68

int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;

70

69

int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;

71

70

72

fftFree(ps);

73

74

// cublasInit();

75

// cufftPlan2d(&ps->cufft_plan, ps->fft_size, ps->fft_size, CUFFT_C2C);

76

71

77

72

cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);

78

73

if (cufft_err) {

103

98

104

99

ps->cudpp_initialized = true;

105

100

106

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

101

cuda_err = cudaMalloc((void**)&ps->cuda_fft_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

107

102

if (cuda_err) {

108

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_base_buffer is failed", ps->ncp, ps->fft_alloc_size);

103

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_buffer is failed", ps->ncp, ps->fft_alloc_size);

109

104

fftFree(ps);

110

105

return ERROR_CUDA_MALLOC;

111

106

}

117

112

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */

118

113

);

119

114

120

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, size);

115

cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);

121

116

if (cuda_err) {

122

reportError("Device memory allocation of %u*cufftComplex bytes for cuda_data_buffer is failed", ps->fft_alloc_size);

117

reportError("Device memory allocation of %u*cufftComplex bytes for cuda_temp_buffer is failed", ps->fft_alloc_size);

123

118

fftFree(ps);

124

119

return ERROR_CUDA_MALLOC;

125

120

}

126

127

cuda_err = cudaHostAlloc((void**)&ps->data_x, ps->ncp * sizeof(float), 0);

128

if (!cuda_err) cuda_err = cudaHostAlloc((void**)&ps->data_y, ps->ncp * sizeof(float), 0);

121

122

ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));

123

if (!ps->banlist) {

124

reportError("Host memory allocation of %u*float bytes for banlist of control points is failed", ps->ncp);

125

fftFree(ps);

126

return ERROR_MALLOC;

127

}

128

memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));

129

130

cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);

129

131

if (cuda_err) {

130

reportError("Host memory allocation of 2*%u*float bytes for control points is failed", ps->ncp);

132

reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);

131

133

fftFree(ps);

132

134

return ERROR_CUDA_MALLOC;

133

135

}

168

170

return ERROR_CUDA_MALLOC;

169

171

}

170

172

171

cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, ps->fft_alloc_size * sizeof(cufftReal));

172

if (cuda_err) {

173

reportError("Device memory allocation of %u*cufftReal bytes for cuda_temp_buffer is failed", ps->fft_alloc_size);

174

fftFree(ps);

175

return ERROR_CUDA_MALLOC;

176

}

177

cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

173

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->fft_alloc_size * sizeof(cufftReal));

174

if (cuda_err) {

175

reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);

176

fftFree(ps);

177

return ERROR_CUDA_MALLOC;

178

}

179

cudaMemset((void*)ps->cuda_base_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

180

181

cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, ps->fft_alloc_size * sizeof(cufftReal));

182

if (cuda_err) {

183

reportError("Device memory allocation of %u*cufftReal bytes for cuda_data_buffer is failed", ps->fft_alloc_size);

184

fftFree(ps);

185

return ERROR_CUDA_MALLOC;

186

}

187

cudaMemset((void*)ps->cuda_data_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

178

188

179

189

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

180

190

if (cuda_err) {

200

210

// cumsum of complete matrix, but non-zero part of it

201

211

cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));

202

212

213

ps->coords = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);

214

if (ps->coords) {

215

mexMakeArrayPersistent(ps->coords);

216

} else {

217

reportError("Allocation of Matlab matrix of size %u*float bytes is failed", ps->ncp);

218

fftFree(ps);

219

return ERROR_MALLOC;

220

}

221

222

//mexMakeMemoryPersistent(ps->coords);

223

//mexLock() mexUnlock()

203

224

204

225

return 0;

205

226

}

206

227

207

228

static void fftPrepare(TProcessingState *ps) {

229

/*

208

230

if (ps->fft_initialized) {

209

231

// Since template and current image have different neighbourhoud sizes

210

cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

232

cudaMemset((void*)ps->cuda_data_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

211

233

}

234

*/

212

235

}

213

236

214

237

224

247

TProcessingState *ps;

225

248

226

249

ps = (TProcessingState*)malloc(sizeof(TProcessingState));

227

if (ps) {

228

ps->ncp = 0;

229

ps->cuda_base_buffer = NULL;

230

ps->fft_initialized = false;

231

ps->cudpp_initialized = false;

232

}

233

250

if (ps) memset(ps, 0, sizeof(TProcessingState));

234

251

return ps;

235

252

}

236

237

238

static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {

239

uint8_t *dataPtr;

240

241

if (!ps->fft_initialized) {

242

reportError("cuFFT engine is not initialized yet");

243

return NULL;

244

}

245

246

int size = ps->fft_size;

253

/*

254

static inline int fftCalibrate(TProcessingState *ps, const mxArray *image) {

255

int width = mxGetN(image);

256

int height = mxGetM(image);

257

258

int size = 2 * ps->corr_size + 1;

259

int size2 = size * size;

260

261

int base_size = 4 * ps->corr_size + 1;

262

int base_size2 = base_size * base_size;

263

264

// printf("%u %u %u\n", width*height, ps->ncp*size2, ps->ncp*base_size2);

265

266

if (width * height > ps->ncp * size2) {

267

ps->mode = 0;

268

} else {

269

ps->mode = 1;

270

}

271

272

// if not enoguh space for caching enable anyway ?

273

if (width * height > ps->ncp * base_size2) {

274

ps->base_mode = 0;

275

} else {

276

ps->base_mode = 1;

277

if (!ps->mode) {

278

ps->minx = 0;

279

ps->maxx = width - 1;

280

ps->miny = 0;

281

ps->maxy = height - 1;

282

}

283

}

284

285

return 0;

286

}

287

*/

288

289

static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

290

int width = mxGetN(image);

291

int height = mxGetM(image);

292

293

int check_mode = ((ps->base_mode)&&(!ps->mode));

294

float minx, miny, maxx, maxy;

295

296

int precision = ps->precision;

297

298

int half_size = 2 * ps->corr_size;

299

int size = 2 * half_size + 1;

300

301

int fft_size = ps->fft_size;

302

303

int ncp_alloc = ps->ncp_alloc_size;

247

304

int alloc_size = ps->fft_alloc_size;

248

249

int N = mxGetM(data);

250

int N2 = N * N;

251

252

305

int side_alloc = ps->side_alloc_size;

253

306

int side_alloc2 = side_alloc * side_alloc;

254

307

255

dim3 input_block_dim(N, 1, 1);

256

dim3 input_grid_dim(N, 1, 1);

257

258

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;

259

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

260

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

261

262

dataPtr = (uint8_t*)mxGetData(data);

263

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

308

uint8_t *banlist = ps->banlist + icp;

309

310

float *data_x = ps->points + icp;

311

float *data_y = data_x + ncp_alloc;

312

313

float *frac_x = ps->points + 4 * ncp_alloc + icp;

314

float *frac_y = frac_x + ncp_alloc;

315

316

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

317

uint8_t *img = ps->input_buffer;

264

318

265

319

float *lsum_temp = ps->cuda_lsum_temp;

266

int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

267

268

cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));

269

cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

270

271

vecPackBase<<<input_grid_dim, input_block_dim>>>(

272

cudaInputPtr, N,

273

cudaRealPtr, size,

274

lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size

275

);

320

int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

321

322

dim3 input_block_dim(size, 1, 1);

323

dim3 input_grid_dim(size, 1, 1);

324

325

cufftReal *cudaRealPtr = ps->cuda_base_buffer;

326

327

if (check_mode) {

328

minx = ps->minx;

329

maxx = ps->maxx;

330

miny = ps->miny;

331

maxy = ps->maxy;

332

}

333

334

for (int i = 0;i < ncp;i++) {

335

float x = data_x[i] - 1;

336

float y = data_y[i] - 1;

337

338

frac_x[i] = x - round(x * precision) / precision;

339

frac_y[i] = y - round(y * precision) / precision;

340

341

int xstart = roundf(x) - half_size;

342

int ystart = roundf(y) - half_size;

343

344

int xend = xstart + size;

345

int yend = xstart + size;

346

347

if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

348

continue;

349

}

350

351

if (check_mode) {

352

if (xstart < minx) minx = xstart;

353

if (ystart < miny) miny = ystart;

354

if (xend > maxx) maxx = xend;

355

if (yend > maxy) maxy = yend;

356

}

357

358

cudaMemcpy2D(

359

img,

360

size * sizeof(uint8_t),

361

fullimg + (xstart * height + ystart),

362

height * sizeof(uint8_t),

363

size * sizeof(uint8_t),

364

size,

365

cudaMemcpyHostToHost

366

);

367

368

// Somehow check for constancy

369

// sum(sub_base(:)) == sub_base(1)*numel(sub_base)

370

// The values of TEMPLATE cannot all be the same

371

372

uint8_t *cudaInputPtr = ps->cuda_input_buffer + (icp + i) * side_alloc2;

373

cufftComplex *cudaPtr = ps->cuda_fft_buffer + (icp + i) * alloc_size;

374

375

cudaMemcpy2D(

376

cudaInputPtr, side_alloc * sizeof(uint8_t),

377

img, size * sizeof(uint8_t),

378

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

379

);

380

381

cudaMemset((void*)(ps->cuda_lsum_temp + 2*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));

382

cudaMemset((void*)(ps->cuda_lsum_temp + 3*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));

383

384

vecPackBase<<<input_grid_dim, input_block_dim>>>(

385

cudaInputPtr, side_alloc,

386

cudaRealPtr, fft_size,

387

lsum_temp, lsum_temp + lsum_step, ps->lsum_alloc_size, ps->lsum_size

388

);

276

389

277

390

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

278

local_sum(ps,

279

ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,

280

lsum_temp + (2 * step), lsum_temp + (3 * step),

281

lsum_temp, lsum_temp + step);

282

283

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

284

285

return cudaPtr;

286

}

287

288

289

static inline void fftGetPoints(TProcessingState *ps, mxArray *result) {

290

cudaMemcpy2D(

291

mxGetData(result), ps->ncp * sizeof(float),

292

ps->cuda_cp, ps->ncp_alloc_size * sizeof(float),

293

ps->ncp * sizeof(float), 2,

294

cudaMemcpyDeviceToHost

295

);

296

}

297

298

/*

299

static inline mxArray *fftSetPoints(TProcessingState *ps, mxArray *result) {

300

}

301

*/

302

303

static int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

391

local_sum(ps,

392

ps->cuda_lsum_buffer + (icp + i) * alloc_size, ps->cuda_denom_buffer + (icp + i) * alloc_size,

393

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

394

lsum_temp, lsum_temp + lsum_step);

395

396

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

397

398

banlist[i] = 0;

399

}

400

401

if (check_mode) {

402

ps->minx = minx;

403

ps->maxx = maxx;

404

ps->miny = miny;

405

ps->maxy = maxy;

406

}

407

408

409

return 0;

410

}

411

412

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

304

413

int width = mxGetN(image);

305

414

int height = mxGetM(image);

306

415

310

419

311

420

int fft_size = ps->fft_size;

312

421

int fft_size2 = fft_size * fft_size;

422

423

int ncp_alloc = ps->ncp_alloc_size;

313

424

int alloc_size = ps->fft_alloc_size;

314

425

int side_alloc = ps->side_alloc_size;

315

426

int side_alloc2 = side_alloc * side_alloc;

427

428

float *data_x, *data_y;

429

if (ps->stored) {

430

data_x = ((float*)mxGetData(ps->coords)) + icp;

431

data_y = data_x + ps->ncp;

432

} else {

433

data_x = ps->points + 2 * ncp_alloc + icp;

434

data_y = data_x + ncp_alloc;

435

}

316

436

317

437

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

318

438

uint8_t *img = ps->input_buffer;

439

440

uint8_t *banlist = ps->banlist + icp;

319

441

320

cufftComplex *cudaDataPtr = (cufftComplex*)ps->cuda_data_buffer;

321

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

442

cufftComplex *cudaDataPtr = (cufftComplex*)ps->cuda_temp_buffer;

443

cufftReal *cudaRealPtr = ps->cuda_data_buffer;

322

444

323

445

dim3 input_block_dim(size, 1, 1);

324

446

dim3 input_grid_dim(size, 1, 1);

326

448

dim3 grid_dim(fft_size, 1, 1);

327

449

328

450

for (int i = 0;i < ncp;i++) {

329

float x = ps->data_x[i+icp] - 1;

330

float y = ps->data_y[i+icp] - 1;

451

float x = data_x[i] - 1;

452

float y = data_y[i] - 1;

331

453

332

454

int xstart = roundf(x) - half_size;

333

455

int ystart = roundf(y) - half_size;

335

457

int xend = xstart + size;

336

458

int yend = xstart + size;

337

459

338

if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

339

// Somehow mark we have skipped it

460

if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

461

banlist[i] = 1;

340

462

continue;

341

463

}

342

464

351

473

);

352

474

353

475

354

cufftComplex *cudaBasePtr = ps->cuda_base_buffer + (i+icp) * alloc_size;

476

cufftComplex *cudaBasePtr = ps->cuda_fft_buffer + (i+icp) * alloc_size;

355

477

cufftReal *cudaResultPtr = ps->cuda_result_buffer + (i+icp) * alloc_size;

356

478

357

479

uint8_t *cudaInputPtr = ps->cuda_input_buffer + i*side_alloc2;

362

484

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

363

485

);

364

486

365

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, fft_size, cudaInputPtr, side_alloc, size);

487

vecPack<<<input_grid_dim, input_block_dim>>>(cudaInputPtr, side_alloc, cudaRealPtr, fft_size, size);

366

488

367

489

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);

368

490

379

501

if (size%SIDE_BLOCK_SIZE) side_blocks = (size / SIDE_BLOCK_SIZE) + 1;

380

502

else side_blocks = size / SIDE_BLOCK_SIZE;

381

503

382

int32_t *stat_buf = (int*)ps->cuda_data_buffer;

504

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

383

505

384

506

float *sumbuf = ps->cuda_cp + icp;

385

507

float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;

417

539

float *xbuf = sumbuf;

418

540

float *ybuf = stdbuf;

419

541

420

int32_t *posbuf = (int*)ps->cuda_data_buffer;

542

int32_t *posbuf = (int*)ps->cuda_temp_buffer;

421

543

float *maxbuf = (float*)(posbuf + CP_BLOCK*side_alloc);

422

544

423

545

int fft_blocks = calc_alloc(fft_size, BLOCK_SIZE_2D);

427

549

find_max1<<<result_grid_dim, result_block_dim>>>(maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size);

428

550

find_max2<<<cp_blocks1, BLOCK_SIZE_1D>>>(xbuf, ybuf, maxbuf, posbuf, ps->cuda_final_buffer + icp*alloc_size, alloc_size, fft_size, fft_size, 3 * ps->corr_size + 1, ps->corr_size - 1);

429

551

430

431

return 0;

432

}

552

return 0;

553

}

554

555

static inline mxArray *fftGetPoints(TProcessingState *ps) {

556

float frac;

557

558

int ncp = ps->ncp;

559

int ncp_alloc = ps->ncp_alloc_size;

560

int precision = ps->precision;

561

562

uint8_t *banlist = ps->banlist;

563

564

float *res_x = (float*)mxGetData(ps->coords);

565

float *res_y = res_x + ncp;

566

567

float *data_x, *data_y;

568

if (ps->stored) {

569

data_x = res_x;

570

data_y = res_y;

571

} else {

572

data_x = ps->points + 2 * ncp_alloc;

573

data_y = data_x + ncp_alloc;

574

}

575

576

577

float *frac_x = ps->points + 4 * ncp_alloc;

578

float *frac_y = frac_x + ncp_alloc;

579

580

float *move_x = ps->points + 6 * ncp_alloc;

581

float *move_y = move_x + ncp_alloc;

582

583

cudaMemcpy2D(

584

move_x, ncp_alloc * sizeof(float),

585

ps->cuda_cp, ncp_alloc * sizeof(float),

586

ps->ncp * sizeof(float), 2,

587

cudaMemcpyDeviceToHost

588

);

589

590

for (int i = 0;i < ncp;i++) {

591

if (banlist[i]) {

592

res_x[i] = data_x[i];

593

res_y[i] = data_y[i];

594

continue;

595

}

596

597

frac = data_x[i] - round(data_x[i]*precision)/precision;

598

res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

599

600

frac = data_y[i] - round(data_y[i]*precision)/precision;

601

res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);

602

}

603

604

ps->stored = 1;

605

606

#ifdef USE_UNDOCUMENTED

607

return mxCreateSharedDataCopy(ps->coords);

608

// mxArray *mxCreateSharedDataCopy(const mxArray *pr);

609

// bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy); // true if not successful

610

// mxArray *mxUnreference(const mxArray *pr);

611

#else /* USE_UNDOCUMENTED */

612

return mxDuplicateArray(ps->coords);

613

#endif /* USE_UNDOCUMENTED */

614

}

615

616

617

#ifdef VALIDATE_LSUM

618

static inline int fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {

619

uint8_t *dataPtr;

620

621

if (!ps->fft_initialized) {

622

reportError("cuFFT engine is not initialized yet");

623

return NULL;

624

}

625

626

int size = ps->fft_size;

627

int alloc_size = ps->fft_alloc_size;

628

629

int N = mxGetM(data);

630

int N2 = N * N;

631

632

int side_alloc = ps->side_alloc_size;

633

int side_alloc2 = side_alloc * side_alloc;

634

635

dim3 input_block_dim(N, 1, 1);

636

dim3 input_grid_dim(N, 1, 1);

637

638

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * side_alloc2;

639

cufftReal *cudaRealPtr = ps->cuda_base_buffer;

640

641

dataPtr = (uint8_t*)mxGetData(data);

642

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

643

644

float *lsum_temp = ps->cuda_lsum_temp;

645

int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

646

647

cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));

648

cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

649

650

vecPackBase<<<input_grid_dim, input_block_dim>>>(

651

cudaInputPtr, N,

652

cudaRealPtr, size,

653

lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size

654

);

655

656

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

657

local_sum(ps,

658

ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,

659

lsum_temp + (2 * step), lsum_temp + (3 * step),

660

lsum_temp, lsum_temp + step);

661

662

/*

663

We don't really want to compute here

664

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, ps->cuda_fft_buffer + icp * alloc_size);

665

*/

666

667

return 0;

668

}

669

#endif /* VALIDATE_LSUM */

433

670

434

671

#ifdef VALIDATE_PEAK

435

672

static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *image) {

436

673

int size = ps->fft_size;

437

674

int size2 = size * size;

438

675

int alloc_size = ps->fft_alloc_size;

439

float *cudaResultPtr = ps->cuda_final_buffer + icp * alloc_size;

440

676

441

677

fftLoadFragment(ps, icp, 1, image);

442

678

443

679

mxArray *res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

680

444

681

float *ar = (float*)mxGetPr(res);

445

682

446

cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

683

cudaMemcpy(ar, ps->cuda_final_buffer + icp * alloc_size, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

684

685

return res;

686

}

687

688

static inline mxArray *fftGetCorrections(TProcessingState *ps) {

689

int ncp = ps->ncp;

690

int ncp_alloc = ps->ncp_alloc_size;

691

692

float *move_x = ps->points + 6 * ncp_alloc;

693

float *move_y = move_x + ncp_alloc;

694

695

cudaMemcpy2D(

696

move_x, ncp_alloc * sizeof(float),

697

ps->cuda_cp, ncp_alloc * sizeof(float),

698

ps->ncp * sizeof(float), 2,

699

cudaMemcpyDeviceToHost

700

);

701

702

mxArray *res = mxCreateNumericMatrix(ncp, 2, mxSINGLE_CLASS, mxREAL);

703

float *res_x = (float*)mxGetData(res);

704

float *res_y = res_x + ncp;

705

706

memcpy(res_x, move_x, ncp * sizeof(float));

707

memcpy(res_y, move_y, ncp * sizeof(float));

447

708

448

709

return res;

449

710

}

450

711

#endif /* VALIDATE_PEAK */

451

712

452

/*

453

static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {

454

cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);

455

}

456

*/

457

458

713

459

714

static void selfClean() {

460

715

if (pstate) {

492

747

#endif /* VALIDATE_LSUM */

493

748

494

749

const mxArray *x, *y;

495

mxArray *points;

496

750

497

751

unsigned int icp;

498

752

753

int width, height;

754

int size, size2;

755

int base_size, base_size2;

756

499

757

if (!nrhs) {

500

758

reportMessage("Initializing normxcorr_hw instance");

501

759

554

812

555

813

return;

556

814

} else {

815

if (!pstate) {

816

reportError("Normxcorr_hw should be initialized first");

817

return;

818

}

819

557

820

/*

558

821

idMatrix = (mxArray*)prhs[0];

559

822

if ((mxGetClassID(idMatrix) != mxINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {

580

843

*/

581

844

}

582

845

846

583

847

// Clean request

584

848

if (nrhs == 1) {

585

849

reportMessage("Cleaning normxcorr_hw instance");

592

856

593

857

594

858

ps = pstate;

595

859

596

860

action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));

597

861

598

862

// reportMessage("Executing normxcorr_hw action: %u", action);

602

866

case ACTION_COMPUTE_FRAGMENT:

603

867

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

604

868

plhs[0] = fftCompute(ps, icp, prhs[3]);

869

//fftGetCorrections(TProcessingState *ps, mxArray *result)

870

break;

871

case ACTION_GET_CORRECTIONS:

872

plhs[0] = fftGetCorrections(ps);

605

873

break;

606

874

#endif /* VALIDATE_PEAK */

607

case ACTION_COMPUTE:

608

if (nrhs != 3) {

609

reportError("This action expects 1 argument, but %i is passed", nrhs - 2);

610

return;

611

}

612

613

input = prhs[2];

614

615

if (mxGetClassID(input) != mxUINT8_CLASS) {

616

reportError("Invalid type of image data, should be 8bit integers");

617

return;

618

}

619

620

for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {

621

err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);

622

}

623

624

break;

625

case ACTION_COMPUTE_BASE:

626

if ((nrhs != 4)

627

875

#ifdef VALIDATE_LSUM

628

&&(nrhs != 7)

629

#endif /* VALIDATE_LSUM */

630

) {

631

reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);

876

case ACTION_COMPUTE_BASE_FRAGMENT:

877

if ((nrhs != 4)&&(nrhs != 7)) {

878

reportError("ComputeBaseFragment action expects 2 arguments, but %i is passed", nrhs - 2);

632

879

return;

633

880

}

634

881

650

897

return;

651

898

}

652

899

653

iprop = ps->fft_size;

654

655

#ifdef VALIDATE_LSUM

656

900

if (nrhs == 7) {

901

iprop = ps->fft_size;

902

657

903

lsum = prhs[4];

658

904

denom = prhs[5];

659

905

nonzero = prhs[6];

675

921

denom = NULL;

676

922

nonzero = NULL;

677

923

}

678

#endif /* VALIDATE_LSUM */

679

924

680

925

fftUploadBaseData(ps, icp, base);

681

682

#ifdef VALIDATE_LSUM

683

926

local_sum_validate(ps, icp, lsum, denom);

927

break;

684

928

#endif /* VALIDATE_LSUM */

685

929

case ACTION_COMPUTE:

930

if (nrhs != 3) {

931

reportError("Compute action expects 1 argument, but %i is passed", nrhs - 2);

932

return;

933

}

934

935

input = prhs[2];

936

937

if (mxGetClassID(input) != mxUINT8_CLASS) {

938

reportError("Invalid type of image data, should be 8bit integers");

939

return;

940

}

941

942

for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {

943

err = fftLoadFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), input);

944

if (err) break;

945

}

946

947

break;

948

case ACTION_COMPUTE_BASE:

949

if (nrhs != 3) {

950

reportError("ComputeBase action expects 1 argument, but %i is passed", nrhs - 2);

951

return;

952

}

953

954

icp = (unsigned int)mxGetScalar(prhs[2]) - 1;

955

if (icp >= ps->ncp) {

956

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

957

return;

958

}

959

960

base = prhs[2];

961

962

if (mxGetNumberOfDimensions(base) != 2) {

963

reportError("Invalid dimensionality of base matrix, 2D matrix is expected");

964

return;

965

}

966

967

if (mxGetClassID(base) != mxUINT8_CLASS) {

968

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));

969

return;

970

}

971

972

width = mxGetN(base);

973

height = mxGetM(base);

974

975

size = 2 * ps->corr_size + 1;

976

size2 = size * size;

977

978

base_size = 4 * ps->corr_size + 1;

979

base_size2 = base_size * base_size;

980

981

if (width * height > ps->ncp * size2) {

982

ps->mode = 0;

983

} else {

984

ps->mode = 1;

985

}

986

987

// if not enoguh space for caching enable anyway ?

988

if (width * height > ps->ncp * base_size2) {

989

ps->base_mode = 0;

990

} else {

991

ps->base_mode = 1;

992

if (!ps->mode) {

993

ps->minx = 0;

994

ps->maxx = width - 1;

995

ps->miny = 0;

996

ps->maxy = height - 1;

997

}

998

}

999

1000

for (icp = 0; icp < ps->ncp; icp+=CP_BLOCK) {

1001

err = fftLoadBaseFragment(ps, icp, min2(CP_BLOCK, ps->ncp - icp), base);

1002

if (err) break;

1003

}

1004

1005

if ((ps->base_mode)&&(!ps->mode)) {

1006

// printf("%ux%u\n", width, height);

1007

1008

// Correcting difference of area size between base and data images

1009

ps->minx += ps->corr_size;

1010

ps->miny += ps->corr_size;

1011

ps->maxx -= ps->corr_size;

1012

ps->maxy -= ps->corr_size;

1013

1014

width = ceil(ps->maxx) - floor(ps->minx);

1015

height = ceil(ps->maxy) - floor(ps->miny);

1016

1017

// printf("%ux%u=%u %u\n", width, height, width*height, ps->ncp * size2);

1018

if (width * height < ps->ncp * size2) {

1019

ps->mode = 1;

1020

}

1021

}

1022

1023

if (ps->mode) {

1024

reportMessage("Running in the image mode");

1025

} else {

1026

reportMessage("Running in the fragment mode");

1027

}

1028

break;

1029

case ACTION_SET_BASE_POINTS:

1030

if (nrhs != 4) {

1031

reportError("SET_POINTS action expects two arrays with 'x' and 'y' coordinates of control points");

1032

return;

1033

}

1034

1035

x = prhs[2];

1036

y = prhs[3];

1037

1038

if ( (mxGetClassID(x) != mxSINGLE_CLASS)||

1039

(mxGetClassID(y) != mxSINGLE_CLASS)||

1040

(mxGetN(x)*mxGetM(x) != ps->ncp)||

1041

(mxGetN(y)*mxGetM(y) != ps->ncp)

1042

) {

1043

reportError("Invalid control points are specified");

1044

return;

1045

}

1046

1047

memcpy(ps->points, mxGetData(x), ps->ncp * sizeof(float));

1048

memcpy(ps->points + ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));

686

1049

break;

687

1050

case ACTION_SET_POINTS:

688

1051

if (nrhs != 4) {

701

1064

reportError("Invalid control points are specified");

702

1065

return;

703

1066

}

704

705

memcpy(ps->data_x, mxGetData(x), ps->ncp * sizeof(float));

706

memcpy(ps->data_y, mxGetData(y), ps->ncp * sizeof(float));

1067

1068

memcpy(ps->points + 2 * ps->ncp_alloc_size, mxGetData(x), ps->ncp * sizeof(float));

1069

memcpy(ps->points + 3 * ps->ncp_alloc_size, mxGetData(y), ps->ncp * sizeof(float));

1070

1071

ps->stored = 0;

707

1072

break;

708

1073

case ACTION_GET_POINTS:

709

1074

if (nrhs != 2) {

714

1079

reportError("GetPoints action returns a single matrix");

715

1080

return;

716

1081

}

717

718

points = mxCreateNumericMatrix(ps->ncp, 2, mxSINGLE_CLASS, mxREAL);

719

fftGetPoints(ps, points);

720

plhs[0] = points;

1082

1083

plhs[0] = fftGetPoints(ps);

721

1084

break;

722

1085

case ACTION_SETUP:

723

if (nrhs != 4) {

724

reportError("SETUP action expects 'ncp' and 'corrsize' parameters");

1086

if (nrhs != 5) {

1087

reportError("SETUP action expects 'ncp', 'corrsize', and 'precision' parameters");

725

1088

return;

726

1089

}

727

1090

1091

fftFree(ps);

1092

728

1093

ps->ncp = (int)mxGetScalar(prhs[2]);

729

1094

1095

ps->precision = (int)mxGetScalar(prhs[4]);

1096

730

1097

iprop = (int)mxGetScalar(prhs[3]);

731

1098

ps->corr_size = iprop;

732

1099

ps->fft_size = 6 * iprop + 1;

763

1130

default:

764

1131

reportError("Unknown request %i", action);

765

1132

}

766

767

1133

}