/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to dict_hw/src/local_sum.cu.h

  • Committer: Suren A. Chilingaryan
  • Date: 2010-04-22 13:42:41 UTC
  • Revision ID: csa@dside.dyndns.org-20100422134241-fv5m2ufk8n2tc9h5
Implementation of image and fragment modes, support for non-cacheable grids

Show diffs side-by-side

added

removed

Lines of Context:
5
5
static inline int local_sum(TProcessingState *ps, 
6
6
    float *lsum, float *denom,
7
7
    float *tmp1, float *tmp2,
8
 
    float *in1, float *in2,
9
 
    cudaStream_t stream) {
 
8
    float *in1, float *in2) {
10
9
 
11
10
    int size = ps->subimage_size;
12
11
    int lsize = ps->lsum_size;
34
33
        // as is for simplicity
35
34
    dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
36
35
    dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
37
 
    transpose1<<<grid_dim1,block_dim,0,stream>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
38
 
    transpose1<<<grid_dim1,block_dim,0,stream>>>(denom, tmp2, alloc_size, alloc_size, lsize);
 
36
    transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
 
37
    transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
39
38
 
40
39
    cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
41
40
    cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
42
41
 
43
42
    dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
44
 
    transpose2<<<grid_dim2,block_dim,0,stream>>>(lsum, denom, tmp1, tmp2, alloc_size, fft_size, lsize);
 
43
    transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, fft_size, lsize);
45
44
 
46
45
    return 0;
47
46
}