/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/local_sum_kernel.cu

  • Committer: Suren A. Chilingaryan
  • Date: 2009-12-10 03:15:47 UTC
  • Revision ID: csa@dside.dyndns.org-20091210031547-sdiuf234u3gqfyi4
Avoid bank conflicts in local sum kernels

Show diffs side-by-side

added

removed

Lines of Context:
1
1
__global__ void transpose1(float *dst, float *src, int width, int height, int step) {
2
 
    __shared__ float data[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
 
2
        // The + 1 padding resolves bank conflicts: sequential threads within the same
 
3
        // warp access locations that are exactly BLOCK_SIZE_2D floats apart, which
 
4
        // causes bank conflicts if BLOCK_SIZE_2D is a multiple of 16
 
5
    __shared__ float data[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];
3
6
 
4
7
    int dx = blockIdx.x * blockDim.x;
5
8
    int dy = blockIdx.y * blockDim.y;
20
23
 
21
24
 
22
25
__global__ void transpose2(float *lsum, float *denom, float *src1, float *src2, int width, int height, int step) {
23
 
    __shared__ float data1[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
24
 
    __shared__ float data2[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
 
26
    __shared__ float data1[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];
 
27
    __shared__ float data2[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];
25
28
 
26
29
    int dx = blockIdx.x * blockDim.x;
27
30
    int dy = blockIdx.y * blockDim.y;