/normxcorr/trunk : revision 32

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/normxcorr_hw.cu.h

Committer: Suren A. Chilingaryan
Date: 2010-04-25 04:39:54 UTC
Revision ID: csa@dside.dyndns.org-20100425043954-v7xm2bzohickyl9z

Multi-GPU support

files added:
dict_hw/src/hw_sched.c

dict_hw/src/hw_sched.h

dict_hw/src/hw_thread.c

dict_hw/src/hw_thread.h

files modified:
dict_hw/CMakeLists.txt

dict_hw/README

dict_hw/cmake/FindGLIB2.cmake

dict_hw/src/CMakeLists.txt

dict_hw/src/dict_hw.cu

dict_hw/src/dict_hw.h

dict_hw/src/dict_hw_test.cpp

dict_hw/src/normxcorr_hw.cu.h

dict_hw/src/normxcorr_hw.h

dict_hw/src/validate.cpp

Show diffs side-by-side

added added

removed removed

dict_hw/src/normxcorr_hw.cu.h

static void fftFree(TProcessingState *ps) {

if (ps->base_image) free(ps->base_image);

if (ps->cuda_image) cudaFree(ps->cuda_image);

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

if (ps->cuda_denom_cache) cudaFree(ps->cuda_denom_cache);

if (ps->cuda_lsum_cache) cudaFree(ps->cuda_lsum_cache);

if (ps->cuda_fft_cache) cudaFree(ps->cuda_fft_cache);

if (ps->cuda_data_buffer) cudaFree(ps->cuda_data_buffer);

if (ps->cuda_base_buffer) cudaFree(ps->cuda_base_buffer);

if (ps->cuda_input_buffer) cudaFree(ps->cuda_input_buffer);

if (ps->cuda_points) cudaFree(ps->cuda_points);

if (ps->points) cudaFreeHost(ps->points);

if (ps->banlist) free(ps->banlist);

cufftDestroy(ps->cufft_r2c_plan);

}

if (ps->image_buf) {

dictImageFree(ps);

}

memset(ps, 0, ((char*)&(ps->matlab_mode)) - ((char*)ps));

#ifdef DICT_HW_MEASURE_TIMINGS

memset(ps, 0, sizeof(TProcessingState) - sizeof(ps->time));

#else

memset(ps, 0, sizeof(TProcessingState));

#endif

}

static int fftInit(TProcessingState *ps, size_t device_memory) {

116

101

}

117

102

memset(ps->banlist, 1, ps->ncp * sizeof(uint8_t));

118

103

119

cuda_err = cudaHostAlloc((void**)&ps->points, 8 * ps->ncp_alloc_size * sizeof(float), 0);

120

if (cuda_err) {

121

reportError("Page locked host memory allocation of 8*%u*float bytes for control points is failed", ps->ncp_alloc_size);

122

fftFree(ps);

123

return DICT_ERROR_CUDA_MALLOC;

124

}

125

126

104

cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));

127

105

if (cuda_err) {

128

106

reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);

179

157

180

158

// Counting necessary memory, here is cache memory, 64MB is considered for other needs (base and current images)

181

159

if ((ps->use_cache)&&((cache_memory + CUDA_EXTRA_MEMORY) > device_memory)) ps->use_cache = 0;

160

//ps->use_cache = 0;

182

161

183

162

ncp_cache = ps->use_cache?ps->ncp:CP_BLOCK;

184

163

186

165

if (cuda_err) {

187

166

// Try to disable caching

188

167

if (ps->use_cache) {

168

cudaGetLastError();

169

189

170

ps->use_cache = 0;

190

171

191

172

ncp_cache = CP_BLOCK;

232

213

cudaError cuda_err;

233

214

int image_size = ps->width * ps->height;

234

215

235

if (ps->base_image) free(ps->base_image);

236

216

if (ps->cuda_image) cudaFree(ps->cuda_image);

237

217

if (ps->cuda_base_image) cudaFree(ps->cuda_base_image);

238

218

250

230

return DICT_ERROR_CUDA_MALLOC;

251

231

}

252

232

253

ps->base_image = (unsigned char*)malloc(image_size * sizeof(uint8_t));

254

if (!ps->base_image) {

255

reportError("Memory allocation of %u*%u*uint8_t bytes for base_image is failed", ps->width, ps->height);

256

fftFree(ps);

257

return DICT_ERROR_MALLOC;

258

}

259

233

260

234

return 0;

261

235

}

294

268

int size = 2 * half_size + 1;

295

269

296

270

int ncp = ps->ncp;

297

int ncp_alloc = ps->ncp_alloc_size;

298

271

299

272

uint8_t *banlist = ps->banlist;

300

273

274

int points_alloc = ps->points_alloc_size;

275

301

276

float *data_x = ps->points;

302

float *data_y = data_x + ncp_alloc;

277

float *data_y = data_x + points_alloc;

303

278

304

float *frac_x = ps->points + 4 * ncp_alloc;

305

float *frac_y = frac_x + ncp_alloc;

279

float *frac_x = ps->points + 4 * points_alloc;

280

float *frac_y = frac_x + points_alloc;

306

281

307

282

unsigned char *cuda_base_image = ps->cuda_base_image;

308

283

376

351

cudaMemcpyHostToDevice

377

352

);

378

353

// cudaMemcpy(cuda_base_image, fullimg, width*height*sizeof(uint8_t), cudaMemcpyHostToDevice);

379

} else if (!ps->use_cache) {

380

memcpy(ps->base_image, fullimg, width * height * sizeof(uint8_t));

381

354

}

382

355

383

356

return 0;

399

372

400

373

int fft_real_size = ps->fft_real_size;

401

374

402

int ncp_alloc = ps->ncp_alloc_size;

403

375

int alloc_size = ps->fft_alloc_size;

404

376

int side_alloc = ps->side_alloc_size;

405

377

int side_alloc2 = side_alloc * side_alloc;

406

378

407

379

uint8_t *banlist = ps->banlist + icp;

408

380

381

int points_alloc = ps->points_alloc_size;

409

382

float *data_x = ps->points + icp;

410

float *data_y = data_x + ncp_alloc;

383

float *data_y = data_x + points_alloc;

411

384

412

385

unsigned char *cuda_base_image = ps->cuda_base_image;

413

386

430

403

int lsum_alloc = ps->lsum_alloc_size;

431

404

432

405

int xstart, ystart;

433

406

434

407

for (int i = 0;i < ncp;i++) {

435

408

if (banlist[i]) continue;

436

409

445

418

matlab_width = width;

446

419

}

447

420

421

422

if (!(i+icp)) {

423

printf("lbf0: %i %i - %i\n", xstart, ystart, fullimg[offset + size*matlab_width/2 + size/2]);

424

}

425

426

448

427

if (image_mode) {

449

428

cudaMemcpy2D(

450

429

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

516

495

517

496

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + i * alloc_size, cache + i * alloc_size);

518

497

}

498

519

499

520

500

return 0;

521

501

}

522

502

523

503

static inline int fftLoadImage(TProcessingState *ps, const unsigned char *fullimg) {

524

504

int ncp = ps->ncp;

525

int ncp_alloc = ps->ncp_alloc_size;

526

505

527

506

int width = ps->width;

528

507

int height = ps->height;

545

524

546

525

float *data_x, *data_y;

547

526

527

int points_alloc = ps->points_alloc_size;

528

548

529

if (ps->stored) {

549

530

data_x = ps->res_x;

550

531

data_y = ps->res_y;

551

532

} else {

552

data_x = ps->points + 2 * ncp_alloc;

553

data_y = data_x + ncp_alloc;

533

data_x = ps->points + 2 * points_alloc;

534

data_y = data_x + points_alloc;

554

535

}

555

536

556

537

for (int i = 0;i < ncp;i++) {

623

604

int half_size = ps->corr_size;

624

605

int size = 2 * half_size + 1;

625

606

int size2 = size * size;

626

int ncp_alloc = ps->ncp_alloc_size;

627

607

628

608

int side_alloc = ps->side_alloc_size;

629

609

int side_alloc2 = side_alloc * side_alloc;

631

611

unsigned char *cuda_image = ps->cuda_image;

632

612

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

633

613

614

int points_alloc = ps->points_alloc_size;

634

615

float *data_x, *data_y;

635

616

if (ps->stored) {

636

617

data_x = ps->res_x + icp;

637

618

data_y = ps->res_y + icp;

638

619

} else {

639

data_x = ps->points + 2 * ncp_alloc + icp;

640

data_y = data_x + ncp_alloc;

620

data_x = ps->points + 2 * points_alloc + icp;

621

data_y = data_x + points_alloc;

641

622

}

642

623

643

624

uint8_t *img = ps->input_buffer;

857

838

}

858

839

859

840

860

static inline int fftGetCurrentPoints(DICTContext ps) {

841

static inline int fftGetCurrentPoints(TProcessingState *ps) {

861

842

int ncp = ps->ncp;

862

843

int ncp_alloc = ps->ncp_alloc_size;

863

844

int precision = ps->precision;

872

853

// Therefore, it is better to use matlab mode until the computation

873

854

// code is changed (this implementation is just done to accept

874

855

// images from user apps without transposing)

856

int points_alloc = ps->points_alloc_size;

875

857

if (ps->matlab_mode) {

876

move_x = ps->points + 6 * ncp_alloc;

877

move_y = move_x + ncp_alloc;

858

move_x = ps->points + 6 * points_alloc;

859

move_y = move_x + points_alloc;

878

860

879

861

cudaMemcpy2D(

880

move_x, ncp_alloc * sizeof(float),

862

move_x, points_alloc * sizeof(float),

881

863

ps->cuda_points, ncp_alloc * sizeof(float),

882

864

ps->ncp * sizeof(float), 2,

883

865

cudaMemcpyDeviceToHost

884

866

);

867

868

printf("%f %f\n", move_x[0], move_x[1]);

885

869

} else {

886

move_y = ps->points + 6 * ncp_alloc;

887

move_x = move_y + ncp_alloc;

870

move_y = ps->points + 6 * points_alloc;

871

move_x = move_y + points_alloc;

888

872

889

873

cudaMemcpy2D(

890

move_y, ncp_alloc * sizeof(float),

874

move_y, points_alloc * sizeof(float),

891

875

ps->cuda_points, ncp_alloc * sizeof(float),

892

876

ps->ncp * sizeof(float), 2,

893

877

cudaMemcpyDeviceToHost

899

883

data_x = ps->res_x;

900

884

data_y = ps->res_y;

901

885

} else {

902

data_x = ps->points + 2 * ncp_alloc;

903

data_y = data_x + ncp_alloc;

886

data_x = ps->points + 2 * points_alloc;

887

data_y = data_x + points_alloc;

904

888

}

905

889

906

890

float *res_x, *res_y;

915

899

}

916

900

917

901

float frac;

918

float *frac_x = ps->points + 4 * ncp_alloc;

919

float *frac_y = frac_x + ncp_alloc;

902

float *frac_x = ps->points + 4 * points_alloc;

903

float *frac_y = frac_x + points_alloc;

920

904

uint8_t *banlist = ps->banlist;

921

905

922

906

Older »