/normxcorr/trunk : revision 4

1

#if defined(_WIN32) || defined(_WIN64)

2

# include <windows.h>

3

typedef UINT8 uint8_t;

4

typedef UINT16 uint16_t;

5

typedef UINT32 uint32_t;

6

typedef INT8 int8_t;

7

typedef INT16 int16_t;

8

typedef INT32 int32_t;

9

#else

10

# include <stdint.h>

11

#endif

12

1

13

2

#include <stdio.h>

14

3

#include <stdlib.h>

15

4

16

#include <cuda.h>

17

#include <cuda_runtime.h>

18

19

#include <cublas.h>

20

#include <cufft.h>

21

22

5

#include <mex.h>

23

6

24

#define BLOCK_SIZE 64

7

#include "normxcorr_hw.h"

8

#include "local_sum.h"

25

9

26

//#include "normxcorr_hw_pack.h"

27

10

#include "normxcorr_hw_msg.h"

28

29

11

#include "normxcorr_hw_kernel.cu"

30

12

31

13

32

typedef enum {

33

ACTION_SETUP = 1,

34

ACTION_PREPARE = 2,

35

ACTION_COMPUTE_BASE = 10,

36

ACTION_COMPUTE_FRAGMENT = 11,

37

} TAction;

38

39

typedef enum {

40

ERROR_CUFFT = 1,

41

ERROR_CUDA_MALLOC = 2,

42

ERROR_MALLOC = 3

43

} TError;

44

45

struct STProcessingState {

46

cufftComplex *cuda_base_buffer; // Stored FFT's of the template image

47

cufftComplex *cuda_data_buffer; // Main computational buffer

48

cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs

49

cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs

50

float *cuda_final_buffer; // Ultimate output

51

uint8_t *cuda_input_buffer; // Input buffer

52

53

float *cuda_lsum_buffer;

54

float *cuda_denom_buffer;

55

56

int *grid_size;

57

uint16_t *cuda_nonzero_items;

58

uint16_t *cuda_nonzero_buffer;

59

60

int ncp; // Number of control points

61

int fft_size; // Matrix Size for FFT (base_size + input_size - 1)

62

int fft_size2; // size * size

63

int fft_alloc_size; // cuda optimized size2

64

int fft_inner_size; // size * (size/2 + 1), R2C/C2R

65

66

int fft_initialized; // Flag indicating if CUFFT plan is initialized

67

cufftHandle cufft_plan;

68

cufftHandle cufft_r2c_plan;

69

cufftHandle cufft_c2r_plan;

70

};

71

72

typedef struct STProcessingState TProcessingState;

73

14

static TProcessingState *pstate = NULL;

74

15

75

76

16

static void fftFree(TProcessingState *ps) {

77

if (ps->fft_initialized) {

78

cufftDestroy(ps->cufft_r2c_plan);

79

cufftDestroy(ps->cufft_c2r_plan);

80

// cufftDestroy(ps->cufft_plan);

81

// cublasShutdown();

82

ps->fft_initialized = false;

83

}

84

85

17

if (ps->cuda_base_buffer) {

18

/*

86

19

free(ps->grid_size);

87

20

free(ps->cuda_nonzero_items);

21

*/

22

23

cudaFree(ps->cuda_lsum_temp);

88

24

89

25

cudaFree(ps->cuda_lsum_buffer);

90

26

cudaFree(ps->cuda_denom_buffer);

27

/*

91

28

cudaFree(ps->cuda_nonzero_buffer);

29

*/

92

30

93

31

cudaFree(ps->cuda_temp_buffer);

94

32

cudaFree(ps->cuda_final_buffer);

99

37

100

38

ps->cuda_base_buffer = NULL;

101

39

}

40

41

// DS: Source of bug, that occasionaly can corrupt something ...

42

if (ps->cudpp_initialized) {

43

cudppDestroyPlan(ps->cudpp_plan);

44

ps->cudpp_initialized = false;

45

}

46

47

if (ps->fft_initialized) {

48

cufftDestroy(ps->cufft_r2c_plan);

49

cufftDestroy(ps->cufft_c2r_plan);

50

// cufftDestroy(ps->cufft_plan);

51

// cublasShutdown();

52

ps->fft_initialized = false;

53

}

54

102

55

}

103

56

104

57

#include <unistd.h>

105

58

static int fftInit(TProcessingState *ps) {

59

CUDPPConfiguration cudpp_config;

60

61

CUDPPResult cudpp_err;

106

62

cufftResult cufft_err;

107

63

cudaError cuda_err;

64

65

int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;

108

66

109

67

fftFree(ps);

110

68

126

84

127

85

ps->fft_initialized = true;

128

86

87

cudpp_config.algorithm = CUDPP_SCAN;

88

cudpp_config.options = CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE;

89

cudpp_config.op = CUDPP_ADD;

90

cudpp_config.datatype = CUDPP_FLOAT;

91

92

cudpp_err = cudppPlan(&ps->cudpp_plan, cudpp_config, ps->lsum_alloc_size, ps->lsum_alloc_size, ps->lsum_alloc_size);

93

if (cudpp_err != CUDPP_SUCCESS) {

94

reportError("Problem initializing CUDPP plan, cudpp code: %i", cudpp_err);

95

fftFree(ps);

96

return ERROR_CUDPP;

97

}

98

99

ps->cudpp_initialized = true;

100

129

101

cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

130

102

if (cuda_err) {

131

103

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_base_buffer is failed", ps->ncp, ps->fft_alloc_size);

170

142

}

171

143

cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

172

144

173

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

145

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

174

146

if (cuda_err) {

175

147

reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_buffer is failed", ps->ncp, ps->fft_alloc_size);

176

148

fftFree(ps);

177

149

return ERROR_CUDA_MALLOC;

178

150

}

179

151

180

cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

152

cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

181

153

if (cuda_err) {

182

154

reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_buffer is failed", ps->ncp, ps->fft_alloc_size);

183

155

fftFree(ps);

184

156

return ERROR_CUDA_MALLOC;

185

157

}

186

158

159

/*

187

160

cuda_err = cudaMalloc((void**)&ps->cuda_nonzero_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

188

161

if (cuda_err) {

189

162

reportError("Device memory allocation of %u*%u*uint16 bytes for cuda_nonzero_buffer is failed", ps->ncp, ps->fft_alloc_size);

191

164

return ERROR_CUDA_MALLOC;

192

165

}

193

166

cudaMemset((void*)ps->cuda_nonzero_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

194

167

195

168

ps->cuda_nonzero_items = (uint16_t*)malloc(ps->ncp * sizeof(uint16_t));

196

169

if (!ps->cuda_nonzero_items) {

197

170

reportError("Host memory allocation of %u*uint16 bytes for cuda_nonzero_items is failed", ps->ncp);

205

178

fftFree(ps);

206

179

return ERROR_MALLOC;

207

180

}

181

*/

182

183

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2 * sizeof(float));

184

if (cuda_err) {

185

reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);

186

fftFree(ps);

187

return ERROR_MALLOC;

188

}

189

// We need to zero temporary buffers as well, since we are not computing

190

// cumsum of complete matrix, but non-zero part of it

191

cudaMemset((void*)ps->cuda_lsum_temp, 0, 4 * lsum_alloc_size2 * sizeof(float));

192

208

193

209

194

return 0;

210

195

}

233

218

ps->ncp = 0;

234

219

ps->cuda_base_buffer = NULL;

235

220

ps->fft_initialized = false;

221

ps->cudpp_initialized = false;

236

222

}

223

237

224

return ps;

238

225

}

239

226

240

static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data, const mxArray *lsum, const mxArray *denom, const mxArray *nonzero) {

227

228

static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data) {

241

229

uint8_t *dataPtr;

242

230

243

231

if (!ps->fft_initialized) {

246

234

}

247

235

248

236

int size = ps->fft_size;

237

int alloc_size = ps->fft_alloc_size;

238

239

int N = mxGetM(data);

240

int N2 = N * N;

241

242

dim3 input_block_dim(N, 1, 1);

243

dim3 input_grid_dim(N, 1, 1);

244

245

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

246

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

247

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

248

249

dataPtr = (uint8_t*)mxGetData(data);

250

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

251

252

/*

249

253

int size2 = size*size;

250

int alloc_size = ps->fft_alloc_size;

251

252

int N = mxGetM(data);

253

int N2 = N * N;

254

255

dim3 input_block_dim(N, 1, 1);

256

dim3 input_grid_dim(N, 1, 1);

257

258

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

259

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

260

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

261

262

dataPtr = (uint8_t*)mxGetData(data);

263

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

254

255

This is a memory copy based code to fill the lsum and denom buffers

264

256

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);

265

257

266

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

267

268

258

// Loading various stuff

269

259

cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);

270

260

cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);

273

263

274

264

ps->cuda_nonzero_items[icp] = N;

275

265

276

if (N%BLOCK_SIZE) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE);

277

else ps->grid_size[icp] = N / BLOCK_SIZE;

266

if (N%BLOCK_SIZE_1D) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE_1D);

267

else ps->grid_size[icp] = N / BLOCK_SIZE_1D;

278

268

279

269

cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);

270

*/

271

272

273

float *lsum_temp = ps->cuda_lsum_temp;

274

int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

275

276

cudaMemset((void*)(ps->cuda_lsum_temp + 2*step), 0, size * ps->lsum_alloc_size * sizeof(float));

277

cudaMemset((void*)(ps->cuda_lsum_temp + 3*step), 0, size * ps->lsum_alloc_size * sizeof(float));

278

279

vecPackBase<<<input_grid_dim, input_block_dim>>>(

280

cudaInputPtr, N,

281

cudaRealPtr, size,

282

lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size

283

);

284

285

/* In general we should expect non-zero denominals, therefore the Nonzero array is not computed */

286

local_sum(ps,

287

ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,

288

lsum_temp + (2 * step), lsum_temp + (3 * step),

289

lsum_temp, lsum_temp + step);

290

291

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

280

292

281

293

return cudaPtr;

282

294

}

299

311

dim3 block_dim(size / 2 + 1, 1, 1);

300

312

dim3 grid_dim(size, 1, 1);

301

313

314

dim3 output_block_dim(size, 1, 1);

315

dim3 output_grid_dim(size, 1, 1);

316

302

317

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

303

318

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

304

319

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

316

331

cudaRealPtr = ps->cuda_result_buffer;

317

332

cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);

318

333

319

uint16_t nz_items = ps->cuda_nonzero_items[icp];

320

uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;

321

322

334

float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;

323

335

float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;

324

336

325

int grids;

326

327

// printf("%i (%i %i)\n", nz_items, icp, ps->ncp);

328

337

/*

329

338

if (nz_items) {

330

grids = ps->grid_size[icp];

339

int grids = ps->grid_size[icp];

340

uint16_t nz_items = ps->cuda_nonzero_items[icp];

341

uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;

331

342

332

dim3 output_block_dim(BLOCK_SIZE, 1, 1);

343

dim3 output_block_dim(BLOCK_SIZE_1D, 1, 1);

333

344

dim3 output_grid_dim(grids, 1, 1);

334

345

346

335

347

vecCompute<<<output_grid_dim, output_block_dim>>>(

336

348

nz, cudaResultPtr,

337

349

cudaRealPtr, 1./(size2 * (N2 - 1)),

339

351

cudaDenom, denom

340

352

);

341

353

}

354

*/

355

356

vecCompute<<<output_grid_dim, output_block_dim>>>(

357

cudaResultPtr,

358

cudaRealPtr, 1./(size2 * (N2 - 1)),

359

cudaLSum, sum / (N2 * (N2 - 1)),

360

cudaDenom, denom,

361

size

362

);

342

363

343

364

res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

344

365

ar = mxGetPr(res);

357

378

358

379

359

380

static void selfClean() {

360

reportMessage("Unloading normxcorr_hw");

361

362

381

if (pstate) {

382

reportMessage("Self-cleaning normxcorr_hw instance");

383

363

384

pstateFree(pstate);

364

385

pstate = NULL;

365

386

}

384

405

385

406

const mxArray *input;

386

407

const mxArray *base;

408

409

#ifdef VALIDATE_LSUM

387

410

const mxArray *lsum;

388

411

const mxArray *denom;

389

412

const mxArray *nonzero;

413

#endif /* VALIDATE_LSUM */

390

414

391

415

double input_sum;

392

416

double input_denom;

406

430

return;

407

431

}

408

432

409

410

433

// Initialising, for now a single client is supported only

434

411

435

idMatrix = mxCreateNumericMatrix(1, 1, mxINT32_CLASS, mxREAL);

412

436

if (!idMatrix) {

413

437

reportError("Initialization is failed");

414

438

return;

415

439

}

416

440

441

417

442

// Detecting cuda devices

443

418

444

cudaGetDeviceCount(&deviceCount);

419

445

if (deviceCount) {

420

446

cudaGetDeviceProperties(&deviceProp, 0);

426

452

} else { // No cuda device, using software

427

453

id = -1;

428

454

}

429

455

430

456

431

457

if (id > 0) {

432

458

pstate = pstateInit();

439

465

pstate = NULL;

440

466

}

441

467

468

442

469

idPtr = (int32_t*)mxGetData(idMatrix);

443

470

idPtr[0] = id;

444

471

480

507

481

508

pstateFree(pstate);

482

509

pstate = NULL;

510

483

511

return;

484

512

}

485

513

514

486

515

ps = pstate;

487

516

488

517

action = (TAction)int(mxGetScalar((mxArray*)prhs[1]));

527

556

plhs[0] = fftCompute(ps, icp, input, input_sum, input_denom);

528

557

break;

529

558

case ACTION_COMPUTE_BASE:

530

if (nrhs != 7) {

531

reportError("This action expects 5 arguments, but %i is passed", nrhs - 2);

559

if ((nrhs != 4)

560

#ifdef VALIDATE_LSUM

561

&&(nrhs != 7)

562

#endif /* VALIDATE_LSUM */

563

) {

564

reportError("This action expects 2 arguments, but %i is passed", nrhs - 2);

532

565

return;

533

566

}

534

567

550

583

return;

551

584

}

552

585

553

lsum = prhs[4];

554

denom = prhs[5];

555

nonzero = prhs[6];

556

557

586

iprop = ps->fft_size;

558

if (

559

(mxGetNumberOfDimensions(lsum) != 2)||

560

(mxGetNumberOfDimensions(denom) != 2)||

561

(mxGetClassID(lsum) != mxSINGLE_CLASS)||

562

(mxGetClassID(denom) != mxSINGLE_CLASS)||

563

(mxGetClassID(nonzero) != mxUINT16_CLASS)||

564

(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||

565

(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)

587

588

#ifdef VALIDATE_LSUM

589

if (nrhs == 7) {

590

lsum = prhs[4];

591

denom = prhs[5];

592

nonzero = prhs[6];

593

if (

594

(mxGetNumberOfDimensions(lsum) != 2)||

595

(mxGetNumberOfDimensions(denom) != 2)||

596

(mxGetClassID(lsum) != mxSINGLE_CLASS)||

597

(mxGetClassID(denom) != mxSINGLE_CLASS)||

598

(mxGetClassID(nonzero) != mxUINT16_CLASS)||

599

(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||

600

(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)

566

601

567

) {

568

reportError("Invalid properties for base initialization are specified");

569

return;

602

) {

603

reportError("Invalid properties for base initialization are specified");

604

return;

605

}

606

} else {

607

lsum = NULL;

608

denom = NULL;

609

nonzero = NULL;

570

610

}

571

572

fftUploadBaseData(ps, icp, base, lsum, denom, nonzero);

611

#endif /* VALIDATE_LSUM */

612

613

fftUploadBaseData(ps, icp, base);

614

615

#ifdef VALIDATE_LSUM

616

local_sum_validate(ps, icp, lsum, denom);

617

#endif /* VALIDATE_LSUM */

618

573

619

break;

574

620

case ACTION_SETUP:

575

621

if (nrhs != 4) {

581

627

ps->ncp = iprop + 1;

582

628

583

629

iprop = (int)mxGetScalar(prhs[3]);

630

ps->corr_size = iprop;

584

631

ps->fft_size = 6 * iprop + 1;

585

632

ps->fft_size2 = ps->fft_size * ps->fft_size;

586

633

ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);

587

634

588

if (ps->fft_size2 % BLOCK_SIZE) {

589

ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE) + 1) * BLOCK_SIZE;

635

if (ps->fft_size2 % BLOCK_SIZE_1D) {

636

ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE_1D) + 1) * BLOCK_SIZE_1D;

590

637

} else {

591

638

ps->fft_alloc_size = ps->fft_size2;

592

639

}

640

641

ps->subimage_size = ps->corr_size * 4 + 1;

642

ps->lsum_size = ps->corr_size * 2 + 1;

643

644

ps->lsum_temp_size = ps->subimage_size + 2*ps->lsum_size - 1;

645

646

if (ps->fft_size % BLOCK_SIZE_2D) {

647

ps->lsum_short_aligned_size = ((ps->fft_size / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

648

} else {

649

ps->lsum_short_aligned_size = ps->fft_size;

650

}

651

652

if ((ps->lsum_temp_size) % BLOCK_SIZE_2D) {

653

ps->lsum_aligned_size = (((ps->lsum_temp_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

654

} else {

655

ps->lsum_aligned_size = ps->lsum_temp_size;

656

}

657

658

if ((ps->lsum_temp_size + ps->lsum_size) % BLOCK_SIZE_2D) {

659

ps->lsum_alloc_size = (((ps->lsum_temp_size + ps->lsum_size) / BLOCK_SIZE_2D) + 1) * BLOCK_SIZE_2D;

660

} else {

661

ps->lsum_alloc_size = ps->lsum_temp_size + ps->lsum_size;

662

}

593

663

594

664

err = fftInit(ps);

595

665