/normxcorr/trunk : revision 29

4

5

6

static void fftFree(TProcessingState *ps) {

7

if (ps->banlist) free(ps->banlist);

7

if (ps->cuda_image) cudaFree(ps->cuda_image);

8

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

9

8

10

if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);

9

11

12

if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);

10

13

if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);

11

if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);

12

14

if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);

15

13

16

14

17

if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);

15

18

if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);

16

19

17

if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);

20

if (ps->input_buffer) cudaFreeHost(ps->input_buffer);

18

21

if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);

19

if (ps->input_buffer) cudaFreeHost(ps->input_buffer);

20

22

21

23

if (ps->cuda_points) cudaFree(ps->cuda_points);

22

24

if (ps->points) cudaFreeHost(ps->points);

23

25

26

if (ps->banlist) free(ps->banlist);

27

28

if (ps->cuda_temp_buffer) cudaFree(ps->cuda_temp_buffer);

29

24

30

if (ps->cudpp_initialized) {

25

31

cudppDestroyPlan(ps->cudpp_plan);

26

32

}

27

33

28

34

if (ps->fft_initialized) {

29

cufftDestroy(ps->cufft_r2c_plan);

30

35

cufftDestroy(ps->cufft_c2r_plan);

36

cufftDestroy(ps->cufft_r2c_plan);

31

37

}

32

38

33

39

if (ps->image_buf) {

34

40

dictImageFree(ps);

35

41

}

36

42

43

memset(ps, 0, ((char*)&(ps->matlab_mode)) - ((char*)ps));

44

45

/*

37

46

#ifdef DICT_HW_MEASURE_TIMINGS

38

47

memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));

39

#else /* DICT_HW_MEASURE_TIMINGS */

48

#else

40

49

memset(ps, 0, sizeof(TProcessingState));

41

#endif /* DICT_HW_MEASURE_TIMINGS */

42

50

#endif

51

*/

43

52

}

44

53

45

static int fftInit(TProcessingState *ps) {

54

static int fftInit(TProcessingState *ps, size_t device_memory) {

46

55

CUDPPConfiguration cudpp_config;

47

56

48

57

CUDPPResult cudpp_err;

53

62

int lsum_alloc_size2 = ps->lsum_alloc_size * ps->lsum_alloc_size;

54

63

int side_alloc_size2 = ps->side_alloc_size * ps->side_alloc_size;

55

64

65

int ncp_cache;

66

int cache_memory;

56

67

57

68

cufft_err = cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_real_size, ps->fft_real_size, CUFFT_R2C);

58

69

if (cufft_err) {

83

94

84

95

ps->cudpp_initialized = true;

85

96

86

cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

87

if (cuda_err) {

88

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);

89

fftFree(ps);

90

return DICT_ERROR_CUDA_MALLOC;

91

}

92

93

94

97

size = max3(

95

98

((1 + CP_BLOCK) * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */

96

99

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */

157

160

}

158

161

cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

159

162

160

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

163

cache_memory = size + // cuda_temp_buffer

164

2 * ps->ncp_alloc_size * sizeof(float) + // cuda_points

165

CP_BLOCK * side_alloc_size2 * sizeof(uint8_t) + // cuda_input_buffer

166

2 * CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal) + // cuda_base_buffer, cuda_data_buffer

167

4 * lsum_alloc_size2 * sizeof(float) + // cuda_lsum_temp

168

ps->ncp * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)); // caches

169

170

/*

171

printf("Temp buffer : %i\n", size/1024/1024);

172

printf("Points : %i\n", 2 * ps->ncp_alloc_size * sizeof(float) / 1024 / 1024);

173

printf("Input buffer: %i\n", CP_BLOCK * side_alloc_size2 * sizeof(uint8_t) / 1024 / 1024);

174

printf("Data buffer : 2 x %i\n", CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal) / 1024 / 1024);

175

printf("Lsum temp : %i\n", 4 * lsum_alloc_size2 * sizeof(float) / 1024 / 1024);

176

printf("Cache : %i\n", ps->ncp * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)) / 1024 / 1024);

177

printf("No Cache : %i\n", CP_BLOCK * ps->fft_alloc_size * (2 * sizeof(float) + sizeof(cufftComplex)) / 1024 / 1024);

178

*/

179

180

// Counting necessary memory, here is cache memory, 64MB is considered for other needs (base and current images)

181

if ((ps->use_cache)&&((cache_memory + 67108864) > device_memory)) ps->use_cache = 0;

182

// ps->use_cache = 0;

183

184

ncp_cache = ps->use_cache?ps->ncp:CP_BLOCK;

185

186

cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ncp_cache * ps->fft_alloc_size * sizeof(cufftComplex));

187

if (cuda_err) {

188

// Try to disable caching

189

if (ps->use_cache) {

190

ps->use_cache = 0;

191

192

ncp_cache = CP_BLOCK;

193

cuda_err = cudaMalloc((void**)&ps->cuda_fft_cache, ncp_cache * ps->fft_alloc_size * sizeof(cufftComplex));

194

}

195

if (cuda_err) {

196

reportError("Device memory allocation of %u*%u*cufftComplex bytes for cuda_fft_cache is failed", ps->ncp, ps->fft_alloc_size);

197

fftFree(ps);

198

return DICT_ERROR_CUDA_MALLOC;

199

}

200

}

201

// the data could be stored across the calls

202

// cudaMemset(ps->cuda_fft_cache, 0, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

203

204

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ncp_cache * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

161

205

if (cuda_err) {

162

206

reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);

163

207

fftFree(ps);

164

208

return DICT_ERROR_CUDA_MALLOC;

165

209

}

166

210

167

cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

211

cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ncp_cache * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

168

212

if (cuda_err) {

169

213

reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);

170

214

fftFree(ps);

185

229

}

186

230

187

231

232

int fftSetupDimensions(TProcessingState *ps) {

233

cudaError cuda_err;

234

int image_size = ps->width * ps->height;

235

236

if (ps->cuda_image) cudaFree(ps->cuda_image);

237

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

238

239

cuda_err = cudaMalloc((void**)&ps->cuda_base_image, image_size * sizeof(uint8_t));

240

if (cuda_err) {

241

reportError("Device memory allocation of %u*%u*uint8_t bytes for cuda_base_image is failed", ps->width, ps->height);

242

fftFree(ps);

243

return DICT_ERROR_CUDA_MALLOC;

244

}

245

246

cuda_err = cudaMalloc((void**)&ps->cuda_image, image_size * sizeof(uint8_t));

247

if (cuda_err) {

248

reportError("Device memory allocation of %u*%u*uint8_t bytes for cuda_image is failed", ps->width, ps->height);

249

fftFree(ps);

250

return DICT_ERROR_CUDA_MALLOC;

251

}

252

253

return 0;

254

}

255

256

188

257

void pstateFree(TProcessingState *ps) {

189

258

if (ps) {

190

259

fftFree(ps);

201

270

return ps;

202

271

}

203

272

273

static inline int fftLoadBaseImage(TProcessingState *ps, const unsigned char *fullimg) {

274

int i;

275

int width = ps->width;

276

int height = ps->height;

277

278

int offset;

279

int matlab_width;

280

int real_width, real_height;

281

282

float minx, miny, maxx, maxy;

283

284

int precision = ps->precision;

285

286

int half_size = 2 * ps->corr_size;

287

int size = 2 * half_size + 1;

288

289

int ncp = ps->ncp;

290

int ncp_alloc = ps->ncp_alloc_size;

291

292

uint8_t *banlist = ps->banlist;

293

294

float *data_x = ps->points;

295

float *data_y = data_x + ncp_alloc;

296

297

float *frac_x = ps->points + 4 * ncp_alloc;

298

float *frac_y = frac_x + ncp_alloc;

299

300

unsigned char *cuda_base_image = ps->cuda_base_image;

301

302

303

minx = width - 1;

304

maxx = 0;

305

miny = height - 1;

306

maxy = 0;

307

308

for (i = 0;i < ncp;i++) {

309

float x = data_x[i] - 1;

310

float y = data_y[i] - 1;

311

312

frac_x[i] = x - round(x * precision) / precision;

313

frac_y[i] = y - round(y * precision) / precision;

314

315

int xstart = roundf(x) - half_size;

316

int ystart = roundf(y) - half_size;

317

318

int xend = xstart + size;

319

int yend = ystart + size;

320

321

if ((xstart < 0)||(ystart < 0)||(xend > (width - 1))||(yend > (height - 1))) {

322

continue;

323

}

324

325

if (xstart < minx) minx = xstart;

326

if (ystart < miny) miny = ystart;

327

if (xend > maxx) maxx = xend;

328

if (yend > maxy) maxy = yend;

329

330

banlist[i] = 0;

331

}

332

333

ps->base_minx = minx;

334

ps->base_maxx = maxx;

335

ps->base_miny = miny;

336

ps->base_maxy = maxy;

337

338

// Correcting difference of area size between base and data images

339

ps->minx = minx + ps->corr_size;

340

ps->maxx = maxx - ps->corr_size;

341

ps->miny = miny + ps->corr_size;

342

ps->maxy = maxy - ps->corr_size;

343

344

if (ps->matlab_mode) {

345

offset = minx * height + miny;

346

real_width = ceil(maxy) - floor(miny) + 1;

347

real_height = ceil(maxx) - floor(minx) + 1;

348

349

matlab_width = height;

350

} else {

351

offset = miny * width + minx;

352

real_width = ceil(maxx) - floor(minx) + 1;

353

real_height = ceil(maxy) - floor(miny) + 1;

354

355

matlab_width = width;

356

}

357

358

if ((ps->use_cache)||(real_width * real_height < ps->ncp * size * size)) {

359

ps->base_mode = 1;

360

} else {

361

ps->base_mode = 0;

362

}

363

364

if (ps->base_mode) {

365

cudaMemcpy2D(

366

cuda_base_image + offset, matlab_width * sizeof(uint8_t),

367

fullimg + offset, matlab_width * sizeof(uint8_t),

368

real_width * sizeof(uint8_t), real_height,

369

cudaMemcpyHostToDevice

370

);

371

// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);

372

}

373

374

return 0;

375

}

376

377

204

378

static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {

379

int offset;

380

int matlab_width;

381

205

382

int width = ps->width;

206

383

int height = ps->height;

207

384

208

int check_mode = ((ps->base_mode)&&(!ps->mode));

209

float minx, miny, maxx, maxy;

210

211

int precision = ps->precision;

385

int image_mode = ps->base_mode;

212

386

213

387

int half_size = 2 * ps->corr_size;

214

388

int size = 2 * half_size + 1;

225

399

float *data_x = ps->points + icp;

226

400

float *data_y = data_x + ncp_alloc;

227

401

228

float *frac_x = ps->points + 4 * ncp_alloc + icp;

229

float *frac_y = frac_x + ncp_alloc;

230

231

uint8_t *img = ps->input_buffer;

232

402

unsigned char *cuda_base_image = ps->cuda_base_image;

403

233

404

float *lsum_temp = (float*)ps->cuda_lsum_temp;

234

405

int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

235

406

236

if (check_mode) {

237

minx = ps->minx;

238

maxx = ps->maxx;

239

miny = ps->miny;

240

maxy = ps->maxy;

241

}

242

243

407

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

244

408

cufftReal *cuda_base_buffer = ps->cuda_base_buffer;

245

cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;

246

float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;

247

float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;

409

410

int cache_icp = ps->use_cache?icp:0;

411

cufftComplex *cache = ps->cuda_fft_cache + cache_icp * alloc_size;

412

float *lsum_cache = ps->cuda_lsum_cache + cache_icp * alloc_size;

413

float *denom_cache = ps->cuda_denom_cache + cache_icp * alloc_size;

248

414

249

415

int blocks = calc_blocks(size, BLOCK_SIZE_1D);

250

416

int base_blocks = blocks * blocks * BLOCK_SIZE_1D;

252

418

int lsum_size = ps->lsum_size;

253

419

int lsum_alloc = ps->lsum_alloc_size;

254

420

255

421

int xstart, ystart;

422

256

423

for (int i = 0;i < ncp;i++) {

257

float x = data_x[i] - 1;

258

float y = data_y[i] - 1;

259

260

frac_x[i] = x - round(x * precision) / precision;

261

frac_y[i] = y - round(y * precision) / precision;

262

263

int xstart = roundf(x) - half_size;

264

int ystart = roundf(y) - half_size;

265

266

int xend = xstart + size;

267

int yend = xstart + size;

268

269

if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

270

continue;

271

}

272

273

if (check_mode) {

274

if (xstart < minx) minx = xstart;

275

if (ystart < miny) miny = ystart;

276

if (xend > maxx) maxx = xend;

277

if (yend > maxy) maxy = yend;

278

}

424

if (banlist[i]) continue;

425

426

xstart = roundf(data_x[i]) - half_size - 1;

427

ystart = roundf(data_y[i]) - half_size - 1;

428

429

if (ps->matlab_mode) {

430

offset = xstart * height + ystart;

431

matlab_width = height;

432

} else {

433

offset = ystart * width + xstart;

434

matlab_width = width;

435

}

436

437

if (image_mode) {

438

cudaMemcpy2D(

439

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

440

cuda_base_image + offset, matlab_width * sizeof(uint8_t),

441

size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice

442

);

443

} else {

444

cudaMemcpy2D(

445

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

446

fullimg + offset, matlab_width * sizeof(uint8_t),

447

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

448

);

449

}

450

451

/*

452

uint8_t *img = ps->input_buffer;

279

453

280

454

if (ps->matlab_mode) {

281

455

cudaMemcpy2D(

304

478

img + i * alloc_size, size * sizeof(uint8_t),

305

479

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

306

480

);

481

*/

307

482

308

banlist[i] = 0;

309

{

310

int j = i ;

311

483

312

484

if (ps->base_blocks_power < 0) {

313

485

vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(

314

cuda_input_buffer + j * side_alloc2, side_alloc,

315

cuda_base_buffer + j*alloc_size, fft_real_size,

486

cuda_input_buffer + i * side_alloc2, side_alloc,

487

cuda_base_buffer + i * alloc_size, fft_real_size,

316

488

lsum_temp + lsum_size * (lsum_alloc + 1),

317

489

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

318

490

lsum_alloc,

320

492

);

321

493

} else {

322

494

vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D>>>(

323

cuda_input_buffer + j * side_alloc2, side_alloc,

324

cuda_base_buffer + j*alloc_size, fft_real_size,

495

cuda_input_buffer + i * side_alloc2, side_alloc,

496

cuda_base_buffer + i * alloc_size, fft_real_size,

325

497

lsum_temp + lsum_size * (lsum_alloc + 1),

326

498

lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),

327

499

lsum_alloc,

331

503

332

504

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

333

505

local_sum(ps,

334

lsum_cache + j * alloc_size, denom_cache + j * alloc_size,

506

lsum_cache + i * alloc_size, denom_cache + i * alloc_size,

335

507

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

336

lsum_temp, lsum_temp + lsum_step, NULL);

337

338

// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);

339

}

340

}

341

342

for (int j = 0;j < ncp;j++) {

343

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);

344

}

345

346

if (check_mode) {

347

ps->minx = minx;

348

ps->maxx = maxx;

349

ps->miny = miny;

350

ps->maxy = maxy;

508

lsum_temp, lsum_temp + lsum_step);

509

510

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + i * alloc_size, cache + i * alloc_size);

511

}

512

513

return 0;

514

}

515

516

static inline int fftLoadImage(TProcessingState *ps, const unsigned char *fullimg) {

517

int ncp = ps->ncp;

518

int ncp_alloc = ps->ncp_alloc_size;

519

520

int width = ps->width;

521

int height = ps->height;

522

523

int half_size = ps->corr_size;

524

int size = 2 * half_size + 1;

525

526

int offset;

527

int matlab_width;

528

int real_width, real_height;

529

int xstart, ystart, xend, yend;

530

531

float minx = width - 1;

532

float maxx = 0;

533

float miny = height - 1;

534

float maxy = 0;

535

536

uint8_t *banlist = ps->banlist;

537

unsigned char *cuda_image = ps->cuda_image;

538

539

float *data_x, *data_y;

540

541

if (ps->stored) {

542

data_x = ps->res_x;

543

data_y = ps->res_y;

544

} else {

545

data_x = ps->points + 2 * ncp_alloc;

546

data_y = data_x + ncp_alloc;

547

}

548

549

for (int i = 0;i < ncp;i++) {

550

float x = data_x[i] - 1;

551

float y = data_y[i] - 1;

552

553

xstart = roundf(x) - half_size;

554

ystart = roundf(y) - half_size;

555

556

xend = xstart + size;

557

yend = ystart + size;

558

559

if (xstart < minx) minx = xstart;

560

if (ystart < miny) miny = ystart;

561

if (xend > maxx) maxx = xend;

562

if (yend > maxy) maxy = yend;

563

564

if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend > (width-1))||(yend > (height-1))) {

565

banlist[i] = 1;

566

}

567

}

568

569

ps->minx = minx;

570

ps->maxx = maxx;

571

ps->miny = miny;

572

ps->maxy = maxy;

573

574

if (ps->matlab_mode) {

575

offset = minx * height + miny;

576

real_width = ceil(maxy) - floor(miny) + 1;

577

real_height = ceil(maxx) - floor(minx) + 1;

578

579

matlab_width = height;

580

} else {

581

offset = miny * width + minx;

582

real_width = ceil(maxx) - floor(minx) + 1;

583

real_height = ceil(maxy) - floor(miny) + 1;

584

585

matlab_width = width;

586

}

587

588

if (real_width * real_height < ps->ncp * size * size) {

589

ps->mode = 1;

590

} else {

591

ps->mode = 0;

592

}

593

//ps->mode = 0;

594

595

if (ps->mode) {

596

cudaMemcpy2D(

597

cuda_image + offset, matlab_width * sizeof(uint8_t),

598

fullimg + offset, matlab_width * sizeof(uint8_t),

599

real_width * sizeof(uint8_t), real_height,

600

cudaMemcpyHostToDevice

601

);

602

// cudaMemcpy(cuda_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);

351

603

}

352

604

353

354

605

return 0;

355

606

}

356

607

357

358

static inline int fftCopyFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {

608

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *fullimg) {

609

int image_mode = ps->mode;

610

611

int offset;

612

int matlab_width;

613

359

614

int width = ps->width;

360

615

int height = ps->height;

361

616

364

619

int size2 = size * size;

365

620

int ncp_alloc = ps->ncp_alloc_size;

366

621

622

int side_alloc = ps->side_alloc_size;

623

int side_alloc2 = side_alloc * side_alloc;

624

625

unsigned char *cuda_image = ps->cuda_image;

626

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

627

367

628

float *data_x, *data_y;

368

629

if (ps->stored) {

369

630

data_x = ps->res_x + icp;

382

643

383

644

int xstart = roundf(x) - half_size;

384

645

int ystart = roundf(y) - half_size;

385

386

int xend = xstart + size;

387

int yend = ystart + size;

388

646

389

if ((banlist[i])||(xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {

390

banlist[i] = 1;

647

if (banlist[i]) {

648

//printf("Banned: %i\n", icp + i);

391

649

continue;

392

650

}

393

651

394

652

if (ps->matlab_mode) {

395

cudaMemcpy2D(

396

img + i * size2,//alloc_size,

397

size * sizeof(uint8_t),

398

fullimg + (xstart * height + ystart),

399

height * sizeof(uint8_t),

400

size * sizeof(uint8_t),

401

size,

402

cudaMemcpyHostToHost

403

);

404

} else {

405

cudaMemcpy2D(

406

img + i * size2,//alloc_size,

407

size * sizeof(uint8_t),

408

fullimg + (ystart * width + xstart),

409

width * sizeof(uint8_t),

410

size * sizeof(uint8_t),

411

size,

412

cudaMemcpyHostToHost

413

);

414

}

415

}

416

return 0;

417

}

418

419

static inline int fftLoadFragment(TProcessingState *ps, int icp, int ncp, const unsigned char *image, cudaStream_t stream0) {

420

int i;

421

int half_size = ps->corr_size;

422

int size = 2 * half_size + 1;

423

424

int side_alloc = ps->side_alloc_size;

425

426

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

427

uint8_t *img = ps->input_buffer;

428

429

430

for (i = 0;i < ncp;i++) {

431

cudaMemcpy2D(

432

cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),

433

img + i * size * size, size * sizeof(uint8_t),

434

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

435

);

436

}

437

653

offset = xstart * height + ystart;

654

matlab_width = height;

655

} else {

656

offset = ystart * width + xstart;

657

matlab_width = width;

658

}

659

660

if (image_mode) {

661

cudaMemcpy2D(

662

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

663

cuda_image + offset, matlab_width * sizeof(uint8_t),

664

size * sizeof(uint8_t), size, cudaMemcpyDeviceToDevice

665

);

666

} else {

667

#ifdef WIN32

668

cudaMemcpy2D(

669

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

670

fullimg + offset, matlab_width * sizeof(uint8_t),

671

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

672

);

673

#else /* WIN32 */

674

cudaMemcpy2D(

675

img + i * size2, size * sizeof(uint8_t),

676

fullimg + offset, matlab_width * sizeof(uint8_t),

677

size * sizeof(uint8_t), size,

678

cudaMemcpyHostToHost

679

);

438

680

/*

439

cudaMemcpy3DParms copy_params = { 0 };

440

441

copy_params.dstPtr = make_cudaPitchedPtr(

442

cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc

443

);

444

copy_params.srcPtr = make_cudaPitchedPtr(

445

img, size * sizeof(uint8_t), size, size

446

);

447

copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);

448

copy_params.kind = cudaMemcpyHostToDevice;

449

450

cudaMemcpy3DAsync(&copy_params, stream0);

681

cudaMemcpy2D(

682

cuda_input_buffer + i * side_alloc * side_alloc, side_alloc * sizeof(uint8_t),

683

img + i * size2, size * sizeof(uint8_t),

684

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

685

);

451

686

*/

452

687

#endif /* WIN32 */

688

}

689

690

}

691

692

#ifndef WIN32

693

if (!image_mode) {

694

cudaMemcpy3DParms copy_params = { 0 };

695

696

copy_params.dstPtr = make_cudaPitchedPtr(

697

cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc

698

);

699

copy_params.srcPtr = make_cudaPitchedPtr(

700

img, size * sizeof(uint8_t), size, size

701

);

702

copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);

703

copy_params.kind = cudaMemcpyHostToDevice;

704

705

cudaMemcpy3D(&copy_params);

706

}

707

#endif /* !WIN32 */

708

453

709

return 0;

454

710

}

455

711

712

456

713

static dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);

457

714

static dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);

458

715

525

782

526

783

float *sumbuf = ps->cuda_points + icp;

527

784

float *stdbuf = ps->cuda_points + ncp_alloc + icp;

785

786

int cache_icp = ps->use_cache?icp:0;

528

787

529

788

// Use real size everthere

530

789

// int fft2_blocks = calc_blocks(fft_size*fft_real_size, SIDE_BLOCK_SIZE);

531

790

// vecCompute<<<compute_grid_dim, block_side_cp,0,stream0>>>(

532

791

// cuda_final_buffer,

533

792

// cuda_result_buffer, 1./(fft_real_size * fft_real_size * (size2 - 1)),

534

// ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

535

// ps->cuda_denom_cache + icp*alloc_size, stdbuf,

793

// ps->cuda_lsum_cache + cache_icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

794

// ps->cuda_denom_cache + cache_icp*alloc_size, stdbuf,

536

795

// alloc_size

537

796

// );

538

797

543

802

vecCompute<<<compute_grid_dim, block_side_cp, 0, stream0>>>(

544

803

cuda_final_buffer, fft_size,

545

804

cuda_result_buffer, fft_real_size, 1./(fft_real_size * fft_real_size * (size2 - 1)),

546

ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

547

ps->cuda_denom_cache + icp*alloc_size, stdbuf,

805

ps->cuda_lsum_cache + cache_icp * alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

806

ps->cuda_denom_cache + cache_icp * alloc_size, stdbuf,

548

807

alloc_size, fft_blocks

549

808

);

550

809

577

836

float *cuda_data_buffer = ps->cuda_data_buffer;

578

837

579

838

int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);

839

int cache_icp = ps->use_cache?icp:0;

580

840

581

841

// Performing FFT's

582

842

cufftComplex *cuda_fft_buffer = ((cufftComplex*)ps->cuda_temp_buffer) + alloc_size;

591

851

592

852

int complex_blocks = calc_blocks(fft_real_size * (fft_real_size / 2 + 1), SIDE_BLOCK_SIZE);

593

853

dim3 complex_grid_dim(complex_blocks, cp_blocks, 1);

594

vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + icp*alloc_size, alloc_size, fft_real_size/2+1);

854

vecMul<<<complex_grid_dim,block_side_cp,0,stream0>>>(cuda_fft_buffer, ps->cuda_fft_cache + cache_icp * alloc_size, alloc_size, fft_real_size/2+1);

595

855

596

856

// First in-place transform for some reason is failing, therefore we

597

857

// have one alloc_size spacing between starts (see cuda_fft_buffer set above)

667

927

float *frac_y = frac_x + ncp_alloc;

668

928

uint8_t *banlist = ps->banlist;

669

929

930

670

931

for (int i = 0;i < ncp;i++) {

671

932

if (banlist[i]) {

672

933

res_x[i] = data_x[i];

673

934

res_y[i] = data_y[i];

674

continue;

675

}

676

677

frac = data_x[i] - round(data_x[i]*precision)/precision;

678

res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

679

680

frac = data_y[i] - round(data_y[i]*precision)/precision;

681

res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);

935

} else {

936

frac = data_x[i] - round(data_x[i]*precision)/precision;

937

res_x[i] = (data_x[i] - move_x[i]) + (frac_x[i] - frac);

938

939

frac = data_y[i] - round(data_y[i]*precision)/precision;

940

res_y[i] = (data_y[i] - move_y[i]) + (frac_y[i] - frac);

941

}

682

942

}

943

683

944

684

945

return 0;

685

946

}