/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/local_sum.cu

  • Committer: Suren A. Chilingaryan
  • Date: 2009-12-10 13:09:44 UTC
  • Revision ID: csa@dside.dyndns.org-20091210130944-cppzkxvshdvg03ig
First attempt with CUDA streams

Show diffs side-by-side

Legend: "added" marks lines added in this revision

Legend: "removed" marks lines removed in this revision

Lines of Context:
13
13
int local_sum(TProcessingState *ps, 
14
14
    float *lsum, float *denom,
15
15
    float *tmp1, float *tmp2,
16
 
    float *in1, float *in2) {
 
16
    float *in1, float *in2,
 
17
    cudaStream_t stream) {
17
18
 
18
19
    int size = ps->subimage_size;
19
20
    int lsize = ps->lsum_size;
39
40
        // as is for simplicity
40
41
    dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
41
42
    dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
42
 
    transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
43
 
    transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
 
43
    transpose1<<<grid_dim1,block_dim,0,stream>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
 
44
    transpose1<<<grid_dim1,block_dim,0,stream>>>(denom, tmp2, alloc_size, alloc_size, lsize);
44
45
 
45
46
    cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
46
47
    cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
47
48
 
48
49
    dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
49
 
    transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);
 
50
    transpose2<<<grid_dim2,block_dim,0,stream>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);
50
51
 
51
52
    return 0;
52
53
}