5
5
static inline int local_sum(TProcessingState *ps,
6
6
float *lsum, float *denom,
7
7
float *tmp1, float *tmp2,
8
float *in1, float *in2,
8
float *in1, float *in2) {
11
10
int size = ps->subimage_size;
12
11
int lsize = ps->lsum_size;
34
33
// as is for simplicity
35
34
dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
36
35
dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
37
transpose1<<<grid_dim1,block_dim,0,stream>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
38
transpose1<<<grid_dim1,block_dim,0,stream>>>(denom, tmp2, alloc_size, alloc_size, lsize);
36
transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
37
transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
40
39
cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
41
40
cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
43
42
dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
44
transpose2<<<grid_dim2,block_dim,0,stream>>>(lsum, denom, tmp1, tmp2, alloc_size, fft_size, lsize);
43
transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, fft_size, lsize);