13
13
int local_sum(TProcessingState *ps,
14
14
float *lsum, float *denom,
15
15
float *tmp1, float *tmp2,
16
float *in1, float *in2) {
16
float *in1, float *in2,
17
cudaStream_t stream) {
18
19
int size = ps->subimage_size;
19
20
int lsize = ps->lsum_size;
39
40
// as is for simplicity
40
41
dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
41
42
dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
42
transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
43
transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
43
transpose1<<<grid_dim1,block_dim,0,stream>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
44
transpose1<<<grid_dim1,block_dim,0,stream>>>(denom, tmp2, alloc_size, alloc_size, lsize);
45
46
cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
46
47
cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
48
49
dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
49
transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);
50
transpose2<<<grid_dim2,block_dim,0,stream>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);