/normxcorr/trunk : revision 12

To get this branch, use:

bzr branch
http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

Committer: Suren A. Chilingaryan
Date: 2009-12-10 04:03:42 UTC
Revision ID: csa@dside.dyndns.org-20091210040342-w07qnwn3tog4q0xj

Reduce memory allocation

files modified:
cuda/normxcorr_hw.cu

Show diffs side-by-side

added added

removed removed

cuda/normxcorr_hw.cu

109

size = max3(

110

(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */

111

2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */

112

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)), /* Max of correlation */

112

CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */

113

);

114

115

cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);

116

if (cuda_err) {

117

reportError("Device memory allocation of %u*cufftComplex bytes for cuda_temp_buffer is failed", ps->fft_alloc_size);

117

reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);

118

fftFree(ps);

119

return ERROR_CUDA_MALLOC;

120

}

121

122

ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));

123

if (!ps->banlist) {

124

reportError("Host memory allocation of %u*float bytes for banlist of control points is failed", ps->ncp);

124

reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);

125

fftFree(ps);

126

return ERROR_MALLOC;

127

}

136

137

cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));

138

if (cuda_err) {

139

reportError("Device memory allocation of %u*%u*float bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

140

fftFree(ps);

141

return ERROR_CUDA_MALLOC;

142

}

143

144

cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * side_alloc_size2 * sizeof(uint8_t));

145

if (cuda_err) {

146

reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);

147

fftFree(ps);

148

return ERROR_CUDA_MALLOC;

149

}

150

151

cuda_err = cudaHostAlloc((void**)&ps->input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);

152

if (cuda_err) {

153

reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", ps->ncp, ps->fft_alloc_size);

139

reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);

140

fftFree(ps);

141

return ERROR_CUDA_MALLOC;

142

}

143

144

cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));

145

if (cuda_err) {

146

reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);

147

fftFree(ps);

148

return ERROR_CUDA_MALLOC;

149

}

150

151

cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);

152

if (cuda_err) {

153

reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);

154

fftFree(ps);

155

return ERROR_CUDA_MALLOC;

156

}

263

float *lsum_temp = (float*)ps->cuda_lsum_temp;

264

int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;

265

266

dim3 input_block_dim(size, 1, 1);

267

dim3 input_grid_dim(size, 1, 1);

268

269

cufftReal *cudaRealPtr = ps->cuda_base_buffer;

270

271

266

if (check_mode) {

272

267

minx = ps->minx;

273

268

maxx = ps->maxx;

275

270

maxy = ps->maxy;

276

271

}

277

272

273

uint8_t *cuda_input_buffer = ps->cuda_input_buffer;

274

278

275

for (int i = 0;i < ncp;i++) {

279

276

float x = data_x[i] - 1;

280

277

float y = data_y[i] - 1;

309

306

cudaMemcpyHostToHost

310

307

);

311

308

312

// Somehow check for constancy

313

// sum(sub_base(:)) == sub_base(1)*numel(sub_base)

314

// The values of TEMPLATE cannot all be the same

315

316

uint8_t *cudaInputPtr = ps->cuda_input_buffer + (icp + i) * side_alloc2;

317

cufftComplex *cudaPtr = ps->cuda_fft_cache + (icp + i) * alloc_size;

318

319

309

cudaMemcpy2D(

320

cudaInputPtr, side_alloc * sizeof(uint8_t),

310

cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),

321

311

img, size * sizeof(uint8_t),

322

312

size * sizeof(uint8_t), size, cudaMemcpyHostToDevice

323

313

);

324

314

325

315

banlist[i] = 0;

316

}

317

318

dim3 input_block_dim(size, 1, 1);

319

dim3 input_grid_dim(size, 1, 1);

320

321

cufftReal *cuda_base_buffer = ps->cuda_base_buffer;

322

cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;

323

float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;

324

float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;

325

326

for (int i = 0;i < ncp;i++) {

327

if (banlist[i]) continue;

328

326

329

vecPackBase<<<input_grid_dim, input_block_dim>>>(

327

cudaInputPtr, side_alloc,

328

cudaRealPtr, fft_size,

330

cuda_input_buffer + i * side_alloc2, side_alloc,

331

cuda_base_buffer, fft_size,

329

332

lsum_temp, lsum_temp + lsum_step, ps->lsum_alloc_size, ps->lsum_size

330

333

);

331

334

332

335

// In general we should expect non-zero denominals, therefore the Nonzero array is not computed

333

336

local_sum(ps,

334

ps->cuda_lsum_cache + (icp + i) * alloc_size, ps->cuda_denom_cache + (icp + i) * alloc_size,

337

lsum_cache + i * alloc_size, denom_cache + i * alloc_size,

335

338

lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),

336

339

lsum_temp, lsum_temp + lsum_step);

337

340

338

cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);

339

340

banlist[i] = 0;

341

cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + i * alloc_size);

341

342

}

342

343

344

if (check_mode) {

Older »