318
dim3 input_block_dim(size, 1, 1);
319
dim3 input_grid_dim(size, 1, 1);
321
319
cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
322
320
cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;
323
321
float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
324
322
float *denom_cache = ps->cuda_denom_cache + icp * alloc_size;
324
int blocks = calc_blocks(size, BLOCK_SIZE_1D);
325
int base_blocks = blocks * blocks * BLOCK_SIZE_1D;
328
if ((blocks&(blocks-1))) {
331
blocks_power = debruijn[((uint32_t)blocks * 0x077CB531) >> 27];
334
int lsum_size = ps->lsum_size;
335
int lsum_alloc = ps->lsum_alloc_size;
326
337
for (int i = 0;i < ncp;i++) {
327
338
if (banlist[i]) continue;
329
vecPackBase<<<input_grid_dim, input_block_dim>>>(
330
cuda_input_buffer + i * side_alloc2, side_alloc,
331
cuda_base_buffer, fft_size,
332
lsum_temp, lsum_temp + lsum_step, ps->lsum_alloc_size, ps->lsum_size
340
if (blocks_power < 0) {
341
vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(
342
cuda_input_buffer + i * side_alloc2, side_alloc,
343
cuda_base_buffer, fft_size,
344
lsum_temp + lsum_size * (lsum_alloc + 1),
345
lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
350
vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D>>>(
351
cuda_input_buffer + i * side_alloc2, side_alloc,
352
cuda_base_buffer, fft_size,
353
lsum_temp + lsum_size * (lsum_alloc + 1),
354
lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
335
360
// In general we expect non-zero denominators, therefore the Nonzero array is not computed