110
110
(1 + CP_BLOCK * ps->fft_alloc_size) * sizeof(cufftComplex), /* FFT multiplication */
111
111
2 * CP_BLOCK * ps->side_alloc_size * sizeof(int32_t), /* Sum, Std computations */
112
CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)), /* Max of correlation */
112
CP_BLOCK * ps->side_alloc_size * (sizeof(int32_t) + sizeof(float)) /* Max of correlation */
115
115
cuda_err = cudaMalloc((void**)&ps->cuda_temp_buffer, size);
117
reportError("Device memory allocation of %u*cufftComplex bytes for cuda_temp_buffer is failed", ps->fft_alloc_size);
117
reportError("Device memory allocation of %u bytes for cuda_temp_buffer is failed", size);
119
119
return ERROR_CUDA_MALLOC;
122
122
ps->banlist = (uint8_t*)malloc(ps->ncp * sizeof(uint8_t));
123
123
if (!ps->banlist) {
124
reportError("Host memory allocation of %u*float bytes for banlist of control points is failed", ps->ncp);
124
reportError("Host memory allocation of %u*uint8 bytes for banlist of control points is failed", ps->ncp);
126
126
return ERROR_MALLOC;
137
137
cuda_err = cudaMalloc((void**)&ps->cuda_points, 2 * ps->ncp_alloc_size * sizeof(float));
139
reportError("Device memory allocation of %u*%u*float bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);
141
return ERROR_CUDA_MALLOC;
144
cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, ps->ncp * side_alloc_size2 * sizeof(uint8_t));
146
reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", ps->ncp, side_alloc_size2);
148
return ERROR_CUDA_MALLOC;
151
cuda_err = cudaHostAlloc((void**)&ps->input_buffer, ps->ncp * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
153
reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", ps->ncp, ps->fft_alloc_size);
139
reportError("Device memory allocation of 2*%u*float bytes for cuda_input_buffer is failed", ps->ncp_alloc_size);
141
return ERROR_CUDA_MALLOC;
144
cuda_err = cudaMalloc((void**)&ps->cuda_input_buffer, CP_BLOCK * side_alloc_size2 * sizeof(uint8_t));
146
reportError("Device memory allocation of %u*%u*uint8 bytes for cuda_input_buffer is failed", CP_BLOCK, side_alloc_size2);
148
return ERROR_CUDA_MALLOC;
151
cuda_err = cudaHostAlloc((void**)&ps->input_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(uint8_t), cudaHostAllocWriteCombined);
153
reportError("Host memory allocation of %u*%u*uint8 bytes for input_buffer is failed", CP_BLOCK, ps->fft_alloc_size);
155
155
return ERROR_CUDA_MALLOC;
263
263
float *lsum_temp = (float*)ps->cuda_lsum_temp;
264
264
int lsum_step = ps->lsum_alloc_size * ps->lsum_alloc_size;
266
dim3 input_block_dim(size, 1, 1);
267
dim3 input_grid_dim(size, 1, 1);
269
cufftReal *cudaRealPtr = ps->cuda_base_buffer;
271
266
if (check_mode) {
309
306
cudaMemcpyHostToHost
312
// Somehow check for constancy
313
// sum(sub_base(:)) == sub_base(1)*numel(sub_base)
314
// The values of TEMPLATE cannot all be the same
316
uint8_t *cudaInputPtr = ps->cuda_input_buffer + (icp + i) * side_alloc2;
317
cufftComplex *cudaPtr = ps->cuda_fft_cache + (icp + i) * alloc_size;
320
cudaInputPtr, side_alloc * sizeof(uint8_t),
310
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
321
311
img, size * sizeof(uint8_t),
322
312
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
318
dim3 input_block_dim(size, 1, 1);
319
dim3 input_grid_dim(size, 1, 1);
321
cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
322
cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;
323
float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
324
float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;
326
for (int i = 0;i < ncp;i++) {
327
if (banlist[i]) continue;
326
329
vecPackBase<<<input_grid_dim, input_block_dim>>>(
327
cudaInputPtr, side_alloc,
328
cudaRealPtr, fft_size,
330
cuda_input_buffer + i * side_alloc2, side_alloc,
331
cuda_base_buffer, fft_size,
329
332
lsum_temp, lsum_temp + lsum_step, ps->lsum_alloc_size, ps->lsum_size
332
335
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
334
ps->cuda_lsum_cache + (icp + i) * alloc_size, ps->cuda_denom_cache + (icp + i) * alloc_size,
337
lsum_cache + i * alloc_size, denom_cache + i * alloc_size,
335
338
lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
336
339
lsum_temp, lsum_temp + lsum_step);
338
cufftExecR2C(ps->cufft_r2c_plan, cudaRealPtr, cudaPtr);
341
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + i * alloc_size);
343
344
if (check_mode) {