310
310
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
311
img, size * sizeof(uint8_t),
311
img + i * alloc_size, size * sizeof(uint8_t),
312
312
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
429
img + i * size2,//alloc_size,
430
430
size * sizeof(uint8_t),
431
431
fullimg + (xstart * height + ystart),
432
432
height * sizeof(uint8_t),
435
435
cudaMemcpyHostToHost
439
439
cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
440
img, size * sizeof(uint8_t),
440
img + i * size2, size * sizeof(uint8_t),
441
441
size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
446
cudaMemcpy3DParms copy_params = { 0 };
447
copy_params.dstPtr = make_cudaPitchedPtr(
448
cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
450
copy_params.srcPtr = make_cudaPitchedPtr(
451
img, size * sizeof(uint8_t), size, size
453
copy_params.extent = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
454
copy_params.kind = cudaMemcpyHostToDevice;
455
cudaMemcpy3D(©_params);
445
458
dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
446
459
dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
448
//int input_blocks = calc_blocks(size2, BLOCK_SIZE_2D);
451
461
int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
452
462
int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
453
463
int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
454
464
int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
455
465
int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
458
467
// Computing sum and std
459
468
int32_t *stat_buf = (int*)ps->cuda_temp_buffer;