155
155
return ERROR_CUDA_MALLOC;
158
cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, ps->fft_alloc_size * sizeof(cufftReal));
158
cuda_err = cudaMalloc((void**)&ps->cuda_base_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
160
160
reportError("Device memory allocation of %u*cufftReal bytes for cuda_base_buffer is failed", ps->fft_alloc_size);
162
162
return ERROR_CUDA_MALLOC;
164
cudaMemset((void*)ps->cuda_base_buffer, 0, ps->fft_alloc_size * sizeof(cufftReal));
164
cudaMemset((void*)ps->cuda_base_buffer, 0, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
166
166
cuda_err = cudaMalloc((void**)&ps->cuda_data_buffer, CP_BLOCK * ps->fft_alloc_size * sizeof(cufftReal));
273
273
uint8_t *cuda_input_buffer = ps->cuda_input_buffer;
275
for (int i = 0;i < ncp;i++) {
276
float x = data_x[i] - 1;
277
float y = data_y[i] - 1;
279
frac_x[i] = x - round(x * precision) / precision;
280
frac_y[i] = y - round(y * precision) / precision;
282
int xstart = roundf(x) - half_size;
283
int ystart = roundf(y) - half_size;
285
int xend = xstart + size;
286
int yend = xstart + size;
288
if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
293
if (xstart < minx) minx = xstart;
294
if (ystart < miny) miny = ystart;
295
if (xend > maxx) maxx = xend;
296
if (yend > maxy) maxy = yend;
300
img + i * alloc_size,
301
size * sizeof(uint8_t),
302
fullimg + (xstart * height + ystart),
303
height * sizeof(uint8_t),
304
size * sizeof(uint8_t),
310
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
311
img + i * alloc_size, size * sizeof(uint8_t),
312
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
319
274
cufftReal *cuda_base_buffer = ps->cuda_base_buffer;
320
275
cufftComplex *cache = ps->cuda_fft_cache + icp * alloc_size;
321
276
float *lsum_cache = ps->cuda_lsum_cache + icp * alloc_size;
334
289
int lsum_size = ps->lsum_size;
335
290
int lsum_alloc = ps->lsum_alloc_size;
337
for (int i = 0;i < ncp;i++) {
338
if (banlist[i]) continue;
292
cudaStream_t stream[2];
293
for (int i = 0; i < 2; ++i) {
294
cudaStreamCreate(&stream[i]);
297
for (int i = 0;i <= ncp;i++) {
299
float x = data_x[i] - 1;
300
float y = data_y[i] - 1;
302
frac_x[i] = x - round(x * precision) / precision;
303
frac_y[i] = y - round(y * precision) / precision;
305
int xstart = roundf(x) - half_size;
306
int ystart = roundf(y) - half_size;
308
int xend = xstart + size;
309
int yend = xstart + size;
311
if ((xstart < 0)||(ystart < 0)||(xend >= width)||(yend >= height)) {
316
if (xstart < minx) minx = xstart;
317
if (ystart < miny) miny = ystart;
318
if (xend > maxx) maxx = xend;
319
if (yend > maxy) maxy = yend;
323
img + i * alloc_size,
324
size * sizeof(uint8_t),
325
fullimg + (xstart * height + ystart),
326
height * sizeof(uint8_t),
327
size * sizeof(uint8_t),
333
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
334
img + i * alloc_size, size * sizeof(uint8_t),
335
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice,
340
344
if (blocks_power < 0) {
341
vecBasePack<<<base_blocks, BLOCK_SIZE_1D>>>(
342
cuda_input_buffer + i * side_alloc2, side_alloc,
343
cuda_base_buffer, fft_size,
345
vecBasePack<<<base_blocks, BLOCK_SIZE_1D, 0, stream[j%2]>>>(
346
cuda_input_buffer + j * side_alloc2, side_alloc,
347
cuda_base_buffer + j*alloc_size, fft_size,
344
348
lsum_temp + lsum_size * (lsum_alloc + 1),
345
349
lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
350
vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D>>>(
351
cuda_input_buffer + i * side_alloc2, side_alloc,
352
cuda_base_buffer, fft_size,
354
vecBasePackFast<<<base_blocks, BLOCK_SIZE_1D, stream[j%2]>>>(
355
cuda_input_buffer + j * side_alloc2, side_alloc,
356
cuda_base_buffer + j*alloc_size, fft_size,
353
357
lsum_temp + lsum_size * (lsum_alloc + 1),
354
358
lsum_temp + lsum_step + lsum_size * (lsum_alloc + 1),
360
364
// In general we should expect non-zero denominals, therefore the Nonzero array is not computed
362
lsum_cache + i * alloc_size, denom_cache + i * alloc_size,
366
lsum_cache + j * alloc_size, denom_cache + j * alloc_size,
363
367
lsum_temp + (2 * lsum_step), lsum_temp + (3 * lsum_step),
364
lsum_temp, lsum_temp + lsum_step);
366
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + i * alloc_size);
368
lsum_temp, lsum_temp + lsum_step,
371
// cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer, cache + j * alloc_size);
375
for (int j = 0;j < ncp;j++) {
376
cufftExecR2C(ps->cufft_r2c_plan, cuda_base_buffer + j * alloc_size, cache + j * alloc_size);
379
for (int i = 0; i < 2; ++i) {
380
cudaStreamDestroy(stream[i]);
369
385
if (check_mode) {