/normxcorr/trunk : revision 11

8

#include "normxcorr_hw_msg.h"

9

#include "normxcorr_hw_kernel.cu"

10

11

#define max4(a,b,c,d) max2(max2(a,b),max2(c,d))

11

12

#define max3(a,b,c) max2(max2(a,b),c)

12

13

#define max2(a,b) (((a)>(b))?(a):(b))

13

14

#define min2(a,b) (((a)<(b))?(a):(b))

27

28

if (ps->banlist) free(ps->banlist);

28

29

if (ps->cuda_lsum_temp) cudaFree(ps->cuda_lsum_temp);

29

30

if (ps->cuda_lsum_buffer) cudaFree(ps->cuda_lsum_buffer);

31

if (ps->cuda_denom_buffer) cudaFree(ps->cuda_denom_buffer);

31

if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);

32

if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);

32

33

if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);

33

34

35

if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);

38

39

if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);

39

40

if (ps->input_buffer) cudaFreeHost(ps->input_buffer);

40

41

if (ps->cuda_cp) cudaFree(ps->cuda_cp);

42

if (ps->cuda_points) cudaFree(ps->cuda_points);

42

43

if (ps->points) cudaFreeHost(ps->points);

43

44

45

// DS: Source of bug, that occasionaly can corrupt something ...

108

109

size = max3(

109

110

(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */

110

111

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */

111

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */

112

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)), /* Max of correlation */

112

113

);

113

114

115

cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);

133

134

return ERROR_CUDA_MALLOC;

134

135

}

135

136

cuda_err = cudaMalloc((void**)&ps->cuda_cp, 2 * ps->ncp_alloc_size * sizeof(float));

137

cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));

137

138

if (cuda_err) {

138

139

reportError("Device memory allocation of %u*%u*float bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

139

140

fftFree(ps);

170

171

}

171

172

cudaMemset((void*)ps->cuda_data_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));

172

173

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

174

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

174

175

if (cuda_err) {

175

reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_buffer is failed", ps->ncp, ps->fft_alloc_size);

176

reportError("Device memory allocation of %u*%u*float bytes for cuda_lsum_cache is failed", ps->ncp, ps->fft_alloc_size);

176

177

fftFree(ps);

177

178

return ERROR_CUDA_MALLOC;

178

179

}

179

180

cuda_err = cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

181

cuda_err = cudaMalloc((void**)&ps->cuda_denom_cache, ps->ncp * ps->fft_alloc_size * sizeof(float) + lsum_alloc_size2 * sizeof(float));

181

182

if (cuda_err) {

182

reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_buffer is failed", ps->ncp, ps->fft_alloc_size);

183

reportError("Device memory allocation of %u*%u*float bytes for cuda_denom_cache is failed", ps->ncp, ps->fft_alloc_size);

183

184

fftFree(ps);

184

185

return ERROR_CUDA_MALLOC;

185

186

}

210

211

}

211

212

213

static void fftPrepare(TProcessingState *ps) {

213

/*

214

if (ps->fft_initialized) {

215

// Since template and current image have different neighbourhoud sizes

216

cudaMemset((void*)ps->cuda_data_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

217

}

218

*/

219

214

}

220

215

221

216

234

229

if (ps) memset(ps, 0, sizeof(TProcessingState));

235

230

return ps;

236

231

}

237

/*

238

static inline int fftCalibrate(TProcessingState *ps, const mxArray *image) {

239

int width = mxGetN(image);

240

int height = mxGetM(image);

241

242

int size = 2 * ps->corr_size + 1;

243

int size2 = size * size;

244

245

int base_size = 4 * ps->corr_size + 1;

246

int base_size2 = base_size * base_size;

247

248

// printf("%u %u %u\n", width*height, ps->ncp*size2, ps->ncp*base_size2);

249

250

if (width * height > ps->ncp * size2) {

251

ps->mode = 0;

252

} else {

253

ps->mode = 1;

254

}

255

256

// if not enoguh space for caching enable anyway ?

257

if (width * height > ps->ncp * base_size2) {

258

ps->base_mode = 0;

259

} else {

260

ps->base_mode = 1;

261

if (!ps->mode) {

262

ps->minx = 0;

263

ps->maxx = width - 1;

264

ps->miny = 0;

265

ps->maxy = height - 1;

266

}

267

}

268

269

return 0;

270

}

271

*/

272

232

273

233

static inline int fftLoadBaseFragment(TProcessingState *ps, int icp, int ncp, const mxArray *image) {

274

234

int width = mxGetN(image);

300

260

uint8_t *fullimg = ((uint8_t*)mxGetData(image));

301

261

uint8_t *img = ps->input_buffer;

302

262

303

float *lsum_temp = ps->cuda_lsum_temp;

263

float *lsum_temp = (float*)ps->cuda_lsum_temp;

304

264

int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

305

265

306

266

dim3 input_block_dim(size, 1, 1);

362

322

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

363

323

);

364

324

365

cudaMemset((void*)(ps->cuda_lsum_temp + 2*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));

366

cudaMemset((void*)(ps->cuda_lsum_temp + 3*lsum_step), 0, fft_size * ps->lsum_alloc_size * sizeof(float));

367

325

368

326

vecPackBase<<<input_grid_dim, input_block_dim>>>(

369

327

cudaInputPtr, side_alloc,

373

331

374

332

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

375

333

local_sum(ps,

376

ps->cuda_lsum_buffer + (icp + i) * alloc_size, ps->cuda_denom_buffer + (icp + i) * alloc_size,

334

ps->cuda_lsum_cache + (icp + i) * alloc_size, ps->cuda_denom_cache + (icp + i) * alloc_size,

377

335

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

378

336

lsum_temp, lsum_temp + lsum_step);

379

337

474

432

// Computing sum and std

475

433

int32_t *stat_buf = (int*)ps->cuda_temp_buffer;

476

434

477

float *sumbuf = ps->cuda_cp + icp;

478

float *stdbuf = ps->cuda_cp + ps->ncp_alloc_size + icp;

435

float *sumbuf = ps->cuda_points + icp;

436

float *stdbuf = ps->cuda_points + ps->ncp_alloc_size + icp;

479

437

480

438

dim3 stat_grid_dim(side_blocks, cp_blocks, 1);

481

439

stat1<<<stat_grid_dim, block_side_cp>>>(stat_buf, stat_buf + side_alloc * CP_BLOCK, ps->cuda_input_buffer, side_alloc2, side_alloc, size);

535

493

vecCompute<<<compute_grid_dim, block_side_cp>>>(

536

494

cuda_final_buffer,

537

495

cuda_result_buffer, 1./(fft_size2 * (size2 - 1)),

538

ps->cuda_lsum_buffer + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

539

ps->cuda_denom_buffer + icp*alloc_size, stdbuf,

496

ps->cuda_lsum_cache + icp*alloc_size, sumbuf, 1. / (size2 * (size2 - 1)),

497

ps->cuda_denom_cache + icp*alloc_size, stdbuf,

540

498

alloc_size, fft_size

541

499

);

542

500

584

542

585

543

cudaMemcpy2D(

586

544

move_x, ncp_alloc * sizeof(float),

587

ps->cuda_cp, ncp_alloc * sizeof(float),

545

ps->cuda_points, ncp_alloc * sizeof(float),

588

546

ps->ncp * sizeof(float), 2,

589

547

cudaMemcpyDeviceToHost

590

548

);

657

615

658

616

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

659

617

local_sum(ps,

660

ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,

618

ps->cuda_lsum_cache + icp * alloc_size, ps->cuda_denom_cache + icp * alloc_size,

661

619

lsum_temp + (2 * step), lsum_temp + (3 * step),

662

620

lsum_temp, lsum_temp + step);

663

621

698

656

699

657

cudaMemcpy2D(

700

658

move_x, ncp_alloc * sizeof(float),

701

ps->cuda_cp, ncp_alloc * sizeof(float),

659

ps->cuda_points, ncp_alloc * sizeof(float),

702

660

ps->ncp * sizeof(float), 2,

703

661

cudaMemcpyDeviceToHost

704

662

);