/normxcorr/trunk : revision 1

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-01-15 13:50:29 UTC
Revision ID: csa@dside.dyndns.org-20090115135029-wleapicg9a4593tp

Initial import

files added:

Markerplotting.m

RTCorrCode.m

automate_image.m

cuda

cuda/Makefile

cuda/normxcorr_hw.cu

cuda/normxcorr_hw_kernel.cu

cuda/normxcorr_hw_msg.h

cuda/nvmex

cuda/nvopts.sh

dic_calc_rects.m

dic_cpcorr.m

dic_local_sum.m

displacement.m

docs

docs/Correlation_Guide_2.htm

docs/Correlation_Guide_2_files

docs/Correlation_Guide_2_files/filelist.xml

docs/Correlation_Guide_2_files/image001.jpg

docs/Correlation_Guide_2_files/image002.jpg

docs/Correlation_Guide_2_files/image003.jpg

docs/Correlation_Guide_2_files/image004.jpg

docs/Correlation_Guide_2_files/image005.jpg

docs/Correlation_Guide_2_files/image006.jpg

docs/Correlation_Guide_2_files/image007.jpg

docs/Correlation_Guide_2_files/image008.jpg

docs/Correlation_Guide_2_files/image009.jpg

docs/Correlation_Guide_2_files/image010.jpg

docs/Correlation_Guide_2_files/image011.jpg

docs/Correlation_Guide_2_files/image012.jpg

docs/Correlation_Guide_2_files/image013.jpg

docs/Correlation_Guide_2_files/image014.jpg

docs/Correlation_Guide_2_files/image015.jpg

docs/Correlation_Guide_2_files/image016.jpg

docs/Correlation_Guide_2_files/image017.jpg

docs/Correlation_Guide_2_files/image018.jpg

docs/Correlation_Guide_2_files/image019.jpg

docs/Correlation_Guide_2_files/image020.jpg

docs/Correlation_Guide_2_files/image021.jpg

docs/Correlation_Guide_2_files/image022.jpg

docs/Correlation_Guide_2_files/image023.jpg

docs/Correlation_Guide_2_files/image024.jpg

docs/Correlation_Guide_2_files/image025.jpg

docs/Correlation_Guide_2_files/image026.jpg

docs/Correlation_Guide_2_files/image027.jpg

docs/Correlation_Guide_2_files/image028.jpg

docs/Correlation_Guide_2_files/image029.jpg

docs/Correlation_Guide_2_files/image030.jpg

docs/Correlation_Guide_2_files/image031.jpg

docs/Correlation_Guide_2_files/image032.jpg

docs/Correlation_Guide_2_files/image033.jpg

docs/Correlation_Guide_2_files/image034.jpg

docs/Correlation_Guide_2_files/image035.jpg

docs/Correlation_Guide_2_files/image036.jpg

docs/Correlation_Guide_2_files/image037.jpg

filelist_generator.m

findpeak.m

gauss_onepk.m

gauss_twopk.m

grid_generator.m

jobskript.m

line_visual.m

linearfit.m

misc

misc/VERSION

misc/release

peak_labelling.m

pickpeak.m

ppselection_func.m

quality_check.m

resume_automate_image.m

sortvalidpoints.m

strain_lineprofile_marker.m

stress_strainmatch.m

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

#include <stdio.h>

#include <stdint.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <cublas.h>

#include <cufft.h>

#include <mex.h>

#define BLOCK_SIZE 64

//#include "normxcorr_hw_pack.h"

#include "normxcorr_hw_msg.h"

#include "normxcorr_hw_kernel.cu"

typedef enum {

ACTION_SETUP = 1,

ACTION_PREPARE = 2,

ACTION_COMPUTE_BASE = 10,

ACTION_COMPUTE_FRAGMENT = 11,

} TAction;

struct STProcessingState {

cufftComplex *cuda_base_buffer; // Stored FFT's of the template image

cufftComplex *cuda_data_buffer; // Main computational buffer

cufftReal *cuda_temp_buffer; // Temporary buffer for FFT inputs

cufftReal *cuda_result_buffer; // Temporary buffer for FFT outputs

float *cuda_final_buffer; // Ultimate output

uint8_t *cuda_input_buffer; // Input buffer

float *cuda_lsum_buffer;

float *cuda_denom_buffer;

int *grid_size;

uint16_t *cuda_nonzero_items;

uint16_t *cuda_nonzero_buffer;

int ncp; // Number of control points

int fft_size; // Matrix Size for FFT (base_size + input_size - 1)

int fft_size2; // size * size

int fft_alloc_size; // cuda optimized size2

int fft_inner_size; // size * (size/2 + 1), R2C/C2R

int fft_initialized; // Flag indicating if CUFFT plan is initialized

cufftHandle cufft_plan;

cufftHandle cufft_r2c_plan;

cufftHandle cufft_c2r_plan;

};

typedef struct STProcessingState TProcessingState;

static TProcessingState *pstate = NULL;

static void fftFree(TProcessingState *ps) {

if (ps->fft_initialized) {

cufftDestroy(ps->cufft_r2c_plan);

cufftDestroy(ps->cufft_c2r_plan);

// cufftDestroy(ps->cufft_plan);

// cublasShutdown();

ps->fft_initialized = false;

}

if (ps->cuda_base_buffer) {

free(ps->grid_size);

free(ps->cuda_nonzero_items);

cudaFree(ps->cuda_lsum_buffer);

cudaFree(ps->cuda_denom_buffer);

cudaFree(ps->cuda_nonzero_buffer);

cudaFree(ps->cuda_temp_buffer);

cudaFree(ps->cuda_final_buffer);

cudaFree(ps->cuda_result_buffer);

cudaFree(ps->cuda_data_buffer);

cudaFree(ps->cuda_base_buffer);

cudaFree(ps->cuda_input_buffer);

ps->cuda_base_buffer = NULL;

}

static void fftInit(TProcessingState *ps) {

fftFree(ps);

// cublasInit();

// cufftPlan2d(&ps->cufft_plan, ps->fft_size, ps->fft_size, CUFFT_C2C);

cufftPlan2d(&ps->cufft_r2c_plan, ps->fft_size, ps->fft_size, CUFFT_R2C);

cufftPlan2d(&ps->cufft_c2r_plan, ps->fft_size, ps->fft_size, CUFFT_C2R);

ps->fft_initialized = true;

cudaMalloc((void**)&ps->cuda_base_buffer, ps->ncp * ps->fft_alloc_size * sizeof(cufftComplex));

100

cudaMalloc((void**)&ps->cuda_data_buffer, ps->fft_alloc_size * sizeof(cufftComplex));

101

cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t));

102

cudaMalloc((void**)&ps->cuda_final_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

103

cudaMemset((void*)ps->cuda_final_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(float));

104

cudaMalloc((void**)&ps->cuda_result_buffer, ps->fft_alloc_size * sizeof(cufftReal));

105

cudaMalloc((void**)&ps->cuda_temp_buffer, ps->fft_alloc_size * sizeof(cufftReal));

106

cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

107

108

cudaMalloc((void**)&ps->cuda_lsum_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

109

cudaMalloc((void**)&ps->cuda_denom_buffer, ps->ncp * ps->fft_alloc_size * sizeof(float));

110

cudaMalloc((void**)&ps->cuda_nonzero_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

111

cudaMemset((void*)ps->cuda_nonzero_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

112

113

ps->cuda_nonzero_items = (uint16_t*)malloc(ps->ncp * sizeof(uint16_t));

114

115

ps->grid_size = (int*)malloc(ps->ncp*sizeof(int));

116

}

117

118

static void fftPrepare(TProcessingState *ps) {

119

if (ps->fft_initialized) {

120

// Since template and current image have different neighbourhoud sizes

121

cudaMemset((void*)ps->cuda_temp_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));

122

}

123

}

124

125

126

127

void pstateFree(TProcessingState *ps) {

128

if (ps) {

129

fftFree(ps);

130

free(ps);

131

}

132

}

133

134

TProcessingState *pstateInit() {

135

TProcessingState *ps;

136

137

ps = (TProcessingState*)malloc(sizeof(TProcessingState));

138

if (ps) {

139

ps->ncp = 0;

140

ps->cuda_base_buffer = NULL;

141

ps->fft_initialized = false;

142

}

143

return ps;

144

}

145

146

static inline void *fftUploadBaseData(TProcessingState *ps, int icp, const mxArray *data, const mxArray *lsum, const mxArray *denom, const mxArray *nonzero) {

147

uint8_t *dataPtr;

148

149

if (!ps->fft_initialized) {

150

reportError("cuFFT engine is not initialized yet");

151

return NULL;

152

}

153

154

int size = ps->fft_size;

155

int size2 = size*size;

156

int alloc_size = ps->fft_alloc_size;

157

158

int N = mxGetM(data);

159

int N2 = N * N;

160

161

dim3 input_block_dim(N, 1, 1);

162

dim3 input_grid_dim(N, 1, 1);

163

164

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

165

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

166

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

167

168

dataPtr = (uint8_t*)mxGetData(data);

169

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

170

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);

171

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

172

173

// Loading various stuff

174

cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);

175

cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);

176

177

N = mxGetM(nonzero);

178

179

ps->cuda_nonzero_items[icp] = N;

180

181

if (N%BLOCK_SIZE) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE);

182

else ps->grid_size[icp] = N / BLOCK_SIZE;

183

184

cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);

185

186

return cudaPtr;

187

}

188

189

static inline mxArray *fftCompute(TProcessingState *ps, int icp, const mxArray *data, float sum, float denom) {

190

uint8_t *dataPtr;

191

double *ar;

192

mxArray *res;

193

194

int size = ps->fft_size;

195

int size2 = size * size;

196

int alloc_size = ps->fft_alloc_size;

197

198

int N = mxGetM(data);

199

int N2 = N * N;

200

201

dim3 input_block_dim(N, 1, 1);

202

dim3 input_grid_dim(N, 1, 1);

203

204

dim3 block_dim(size / 2 + 1, 1, 1);

205

dim3 grid_dim(size, 1, 1);

206

207

uint8_t *cudaInputPtr = ps->cuda_input_buffer + icp * alloc_size;

208

cufftComplex *cudaPtr = ps->cuda_base_buffer + icp * alloc_size;

209

cufftReal *cudaRealPtr = ps->cuda_temp_buffer;

210

cufftComplex *cudaDataPtr = ps->cuda_data_buffer;

211

float *cudaResultPtr = ps->cuda_final_buffer;

212

213

dataPtr = (uint8_t*)mxGetData(data);

214

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

215

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);

216

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaDataPtr);

217

218

vecMul<<<grid_dim,block_dim>>>(cudaDataPtr, cudaPtr, size/2+1);

219

220

cudaRealPtr = ps->cuda_result_buffer;

221

cufftExecC2R(ps->cufft_c2r_plan, cudaDataPtr, cudaRealPtr);

222

223

uint16_t nz_items = ps->cuda_nonzero_items[icp];

224

uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;

225

226

float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;

227

float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;

228

229

int grids;

230

231

// printf("%i (%i %i)\n", nz_items, icp, ps->ncp);

232

233

if (nz_items) {

234

grids = ps->grid_size[icp];

235

236

dim3 output_block_dim(BLOCK_SIZE, 1, 1);

237

dim3 output_grid_dim(grids, 1, 1);

238

239

vecCompute<<<output_grid_dim, output_block_dim>>>(

240

nz, cudaResultPtr,

241

cudaRealPtr, 1./(size2 * (N2 - 1)),

242

cudaLSum, sum / (N2 * (N2 - 1)),

243

cudaDenom, denom

244

);

245

}

246

247

res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

248

ar = mxGetPr(res);

249

250

cudaMemcpy(ar, cudaResultPtr, size2*sizeof(cufftReal), cudaMemcpyDeviceToHost);

251

252

return res;

253

}

254

255

256

257

static inline double fftDownloadData(TProcessingState *ps, mxArray *data) {

258

cudaMemcpy( input_single, rhs_complex_d, sizeof(cufftComplex)*N*M, cudaMemcpyDeviceToHost);

259

}

260

261

262

263

static void selfClean() {

264

reportMessage("Unloading normxcorr_hw");

265

266

if (pstate) {

267

pstateFree(pstate);

268

pstate = NULL;

269

}

270

}

271

272

273

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

274

// int i;

275

276

mxArray *idMatrix;

277

uint32_t id, *idPtr;

278

279

TProcessingState *ps;

280

281

TAction action;

282

283

int iprop;

284

285

const mxArray *input;

286

const mxArray *base;

287

const mxArray *lsum;

288

const mxArray *denom;

289

const mxArray *nonzero;

290

291

double input_sum;

292

double input_denom;

293

294

unsigned int icp;

295

296

if (!nrhs) {

297

reportMessage("Initializing normxcorr_hw instance");

298

299

if (nlhs != 1) {

300

reportError("You should accept a single result from initialization call");

301

return;

302

}

303

304

if (pstate) {

305

reportError("Only a single calculation process is supported at the moment");

306

return;

307

}

308

309

310

// Initialising, for now a single client is supported only

311

idMatrix = mxCreateNumericMatrix(1, 1, mxUINT32_CLASS, mxREAL);

312

if (!idMatrix) {

313

reportError("Initialization is failed");

314

return;

315

}

316

317

pstate = pstateInit();

318

if (!pstate) {

319

mxDestroyArray(idMatrix);

320

reportError("State structure initialization is failed");

321

return;

322

}

323

324

id = 1;

325

326

idPtr = (uint32_t*)mxGetData(idMatrix);

327

idPtr[0] = id;

328

329

plhs[0] = idMatrix;

330

331

mexAtExit(selfClean);

332

333

return;

334

} else {

335

/* idMatrix = (mxArray*)prhs[0];

336

if ((mxGetClassID(idMatrix) != mxUINT32_CLASS)||(mxGetM(idMatrix) != 1)||(mxGetN(idMatrix) != 1)) {

337

reportError("Invalid parameter is supplied in place of process identificator");

338

return;

339

}

340

341

idPtr = (uint32_t*)mxGetData(idMatrix);

342

if (!idPtr) {

343

reportError("Mex is not able to obtain process identificator");

344

return;

345

}

346

347

id = *idPtr;

348

if (id != 1) {

349

reportError("Invalid process identificator is supplied");

350

return;

351

}

352

353

if (!pstate) {

354

reportError("The interface is not initialized");

355

return;

356

}*/

357

}

358

359

// Clean request

360

if (nrhs == 1) {

361

reportMessage("Cleaning normxcorr_hw instance");

362

363

pstateFree(pstate);

364

pstate = NULL;

365

return;

366

}

367

368

ps = pstate;

369

370

action = (TAction)mxGetScalar((mxArray*)prhs[1]);

371

372

// reportMessage("Executing normxcorr_hw action: %u", action);

373

374

switch (action) {

375

case ACTION_COMPUTE_FRAGMENT:

376

377

if (nrhs != 6) {

378

reportError("This action expects 4 arguments, but %i is passed", nrhs - 2);

379

return;

380

}

381

382

if (nlhs != 1) {

383

reportError("This action expects single ouput, but %i is passed", nlhs);

384

}

385

386

387

icp = (unsigned int)mxGetScalar(prhs[2]);

388

/* if (icp >= ps->ncp) {

389

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

390

return;

391

}

392

393

input = prhs[3];

394

395

if (mxGetNumberOfDimensions(input) != 2) {

396

reportError("Invalid dimensionality of input matrix, 2D matrix is expected");

397

return;

398

}

399

400

if (mxGetClassID(input) != mxUINT8_CLASS) {

401

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(input));

402

return;

403

}

404

405

406

input_sum = mxGetScalar(prhs[4]);

407

input_denom = mxGetScalar(prhs[5]);

408

409

plhs[0] = fftCompute(ps, icp, input, input_sum, input_denom);

410

break;

411

case ACTION_COMPUTE_BASE:

412

if (nrhs != 7) {

413

reportError("This action expects 5 arguments, but %i is passed", nrhs - 2);

414

return;

415

}

416

417

icp = (unsigned int)mxGetScalar(prhs[2]);

418

if (icp >= ps->ncp) {

419

reportError("The control point (%i) is out of range (0-%u)", icp, ps->ncp - 1);

420

return;

421

}

422

423

base = prhs[3];

424

425

if (mxGetNumberOfDimensions(base) != 2) {

426

reportError("Invalid dimensionality of base matrix, 2D matrix is expected");

427

return;

428

}

429

430

if (mxGetClassID(base) != mxUINT8_CLASS) {

431

reportError("Invalid matrix. The data type (%s) is not supported", mxGetClassName(base));

432

return;

433

}

434

435

lsum = prhs[4];

436

denom = prhs[5];

437

nonzero = prhs[6];

438

439

iprop = ps->fft_size;

440

if (

441

(mxGetNumberOfDimensions(lsum) != 2)||

442

(mxGetNumberOfDimensions(denom) != 2)||

443

(mxGetClassID(lsum) != mxSINGLE_CLASS)||

444

(mxGetClassID(denom) != mxSINGLE_CLASS)||

445

(mxGetClassID(nonzero) != mxUINT16_CLASS)||

446

(mxGetN(lsum) != iprop)||(mxGetM(lsum) != iprop)||

447

(mxGetN(denom) != iprop)||(mxGetM(denom) != iprop)

448

449

) {

450

reportError("Invalid properties for base initialization are specified");

451

return;

452

}

453

454

fftUploadBaseData(ps, icp, base, lsum, denom, nonzero);

455

break;

456

case ACTION_SETUP:

457

if (nrhs != 4) {

458

reportError("SETUP action expects 'ncp' and 'corrsize' parameters");

459

return;

460

}

461

462

iprop = (int)mxGetScalar(prhs[2]);

463

ps->ncp = iprop + 1;

464

465

iprop = (int)mxGetScalar(prhs[3]);

466

ps->fft_size = 6 * iprop + 1;

467

ps->fft_size2 = ps->fft_size * ps->fft_size;

468

ps->fft_inner_size = ps->fft_size * (ps->fft_size / 2 + 1);

469

470

if (ps->fft_size2 % BLOCK_SIZE) {

471

ps->fft_alloc_size = ((ps->fft_size2 / BLOCK_SIZE) + 1) * BLOCK_SIZE;

472

} else {

473

ps->fft_alloc_size = ps->fft_size2;

474

}

475

476

fftInit(ps);

477

break;

478

case ACTION_PREPARE:

479

fftPrepare(ps);

480

break;

481

default:

482

reportError("Unknown request %i", action);

483

}

484

485

}

Older »