/normxcorr/trunk : revision 5

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-02 05:10:57 UTC
Revision ID: csa@dside.dyndns.org-20091202051057-ratvvyegrglzov4u

Removal of non-zero comments

files modified:
cuda/normxcorr_hw.cu

cuda/normxcorr_hw.h

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

static void fftFree(TProcessingState *ps) {

if (ps->cuda_base_buffer) {

free(ps->grid_size);

free(ps->cuda_nonzero_items);

cudaFree(ps->cuda_lsum_temp);

cudaFree(ps->cuda_lsum_buffer);

cudaFree(ps->cuda_denom_buffer);

cudaFree(ps->cuda_nonzero_buffer);

cudaFree(ps->cuda_temp_buffer);

cudaFree(ps->cuda_final_buffer);

156

149

return ERROR_CUDA_MALLOC;

157

150

}

158

151

159

160

cuda_err = cudaMalloc((void**)&ps->cuda_nonzero_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

161

if (cuda_err) {

162

reportError("Device memory allocation of %u*%u*uint16 bytes for cuda_nonzero_buffer is failed", ps->ncp, ps->fft_alloc_size);

163

fftFree(ps);

164

return ERROR_CUDA_MALLOC;

165

}

166

cudaMemset((void*)ps->cuda_nonzero_buffer, 0, ps->ncp * ps->fft_alloc_size * sizeof(uint16_t));

167

168

ps->cuda_nonzero_items = (uint16_t*)malloc(ps->ncp * sizeof(uint16_t));

169

if (!ps->cuda_nonzero_items) {

170

reportError("Host memory allocation of %u*uint16 bytes for cuda_nonzero_items is failed", ps->ncp);

171

fftFree(ps);

172

return ERROR_MALLOC;

173

}

174

175

ps->grid_size = (int*)malloc(ps->ncp*sizeof(int));

176

if (!ps->grid_size) {

177

reportError("Host memory allocation of %u*int bytes for grid_size is failed", ps->ncp);

178

fftFree(ps);

179

return ERROR_MALLOC;

180

}

181

182

183

152

cuda_err = cudaMalloc((void**)&ps->cuda_lsum_temp, 4 * lsum_alloc_size2 * sizeof(float));

184

153

if (cuda_err) {

185

154

reportError("Device memory allocation of 4*%u*float bytes for lsum temporary buffer is failed", lsum_alloc_size2);

249

218

dataPtr = (uint8_t*)mxGetData(data);

250

219

cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);

251

220

252

253

int size2 = size*size;

254

255

This is memory-copy-based code that fills the lsum and denom buffers

256

vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);

257

258

// Loading various stuff

259

cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);

260

cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);

261

262

N = mxGetM(nonzero);

263

264

ps->cuda_nonzero_items[icp] = N;

265

266

if (N%BLOCK_SIZE_1D) ps->grid_size[icp] = 1 + (N / BLOCK_SIZE_1D);

267

else ps->grid_size[icp] = N / BLOCK_SIZE_1D;

268

269

cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);

270

271

272

273

221

float *lsum_temp = ps->cuda_lsum_temp;

274

222

int step = ps->lsum_alloc_size * ps->lsum_alloc_size;

275

223

282

230

lsum_temp, lsum_temp + step, ps->lsum_alloc_size, ps->lsum_size

283

231

);

284

232

285

/* In general we should expect non-zero denominators, therefore the Nonzero array is not computed */

233

// In general we should expect non-zero denominators, therefore the Nonzero array is not computed

286

234

local_sum(ps,

287

235

ps->cuda_lsum_buffer + icp * alloc_size, ps->cuda_denom_buffer + icp * alloc_size,

288

236

lsum_temp + (2 * step), lsum_temp + (3 * step),

334

282

float *cudaDenom = ps->cuda_denom_buffer + icp*alloc_size;

335

283

float *cudaLSum = ps->cuda_lsum_buffer + icp*alloc_size;

336

284

337

338

if (nz_items) {

339

int grids = ps->grid_size[icp];

340

uint16_t nz_items = ps->cuda_nonzero_items[icp];

341

uint16_t *nz = ps->cuda_nonzero_buffer + icp*alloc_size;

342

343

dim3 output_block_dim(BLOCK_SIZE_1D, 1, 1);

344

dim3 output_grid_dim(grids, 1, 1);

345

346

347

vecCompute<<<output_grid_dim, output_block_dim>>>(

348

nz, cudaResultPtr,

349

cudaRealPtr, 1./(size2 * (N2 - 1)),

350

cudaLSum, sum / (N2 * (N2 - 1)),

351

cudaDenom, denom

352

);

353

}

354

355

356

vecCompute<<<output_grid_dim, output_block_dim>>>(

285

vecCompute<<<output_grid_dim, output_block_dim>>>(

357

286

cudaResultPtr,

358

287

cudaRealPtr, 1./(size2 * (N2 - 1)),

359

288

cudaLSum, sum / (N2 * (N2 - 1)),

360

289

cudaDenom, denom,

361

290

size

362

);

291

);

363

292

364

293

res = mxCreateNumericMatrix(size, size, mxSINGLE_CLASS, mxREAL);

365

294

ar = mxGetPr(res);

Older »