/*
 * transpose1 - horizontal difference of `src` written out transposed.
 *
 * For every source element (x, y) the kernel computes
 *     src[x + step + y*width] - src[x + y*width]
 * and stores it at dst[y + x*height], i.e. `dst` is indexed with a row
 * stride of `height`.
 *
 * Parameters:
 *   dst    - output buffer, written with row stride `height`.
 *   src    - input buffer, read with row stride `width`.
 *   width  - number of source columns processed (also the source row stride).
 *   height - number of source rows processed (also the output row stride).
 *   step   - horizontal offset between the two samples that are subtracted.
 *
 * Launch requirements:
 *   - 2D grid of 2D blocks with blockDim.x == blockDim.y == BLOCK_SIZE_2D
 *     (the tile is indexed with both threadIdx.x and threadIdx.y in either
 *     position, so the block must be square and no larger than the tile).
 *   - Static shared memory: one BLOCK_SIZE_2D x (BLOCK_SIZE_2D + 1) float tile.
 *
 * NOTE(review): for the last `step` columns of a row, x + step indexes past
 * the end of that row (and past width*height on the last row). The caller
 * must supply a `src` buffer for which that read is valid - confirm.
 */
__global__ void transpose1(float *dst, float *src, int width, int height, int step) {
    // One extra column of padding so that the column-wise tile write below
    // does not map a whole warp onto the same shared-memory bank.
    __shared__ float data[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];

    // Origin of this block's tile in the source.
    const int dx = blockIdx.x * blockDim.x;
    const int dy = blockIdx.y * blockDim.y;

    // Source coordinates handled by this thread.
    const int x = threadIdx.x + dx;
    const int y = threadIdx.y + dy;

    // Grid-tail guard: the grid rarely divides width/height evenly.
    if (x < width && y < height) {
        data[threadIdx.x][threadIdx.y] =
            src[x + step + y*width] - src[x + y*width];
    }

    // The element read below was written by a different thread of the block,
    // so every thread must reach this barrier (it is deliberately outside
    // the guards).
    __syncthreads();

    // Output coordinates: block offsets are swapped so that consecutive
    // threadIdx.x values write consecutive addresses of `dst`.
    const int tx = threadIdx.x + dy;   // output column == source row
    const int ty = threadIdx.y + dx;   // output row    == source column

    // This is the same condition under which data[threadIdx.y][threadIdx.x]
    // was filled, so no uninitialised tile element is ever read.
    if (tx < height && ty < width) {
        dst[tx + ty*height] = data[threadIdx.y][threadIdx.x];
    }
}
/*
 * transpose2 - horizontal differences of two inputs, combined and written
 * out transposed.
 *
 * For every source element (x, y) the kernel computes
 *     val1 = src1[x + step + y*width] - src1[x + y*width]
 *     val2 = src2[x + step + y*width] - src2[x + y*width]
 * and stores, at transposed position (index y + x*height):
 *     lsum  = val1
 *     denom = sqrt(max((val2 - val1*val1/coef) / (coef - 1), 0)),
 * with coef = step*step.
 * NOTE(review): the denom expression has the form of a sample standard
 * deviation over coef samples, with src1/src2 presumably holding running
 * sums of values and of squared values - confirm against the caller.
 *
 * Parameters:
 *   lsum   - output buffer for val1, written with row stride `height`.
 *   denom  - output buffer for the square-root term, same layout as `lsum`.
 *   src1   - first input buffer, read with row stride `width`.
 *   src2   - second input buffer, same layout as `src1`.
 *   width  - number of source columns processed (also the source row stride).
 *   height - number of source rows processed (also the output row stride).
 *   step   - horizontal offset between subtracted samples; also defines coef.
 *
 * Launch requirements:
 *   - 2D grid of 2D blocks with blockDim.x == blockDim.y == BLOCK_SIZE_2D.
 *   - Static shared memory: two BLOCK_SIZE_2D x (BLOCK_SIZE_2D + 1) float tiles.
 *
 * NOTE(review): for the last `step` columns of a row, x + step indexes past
 * the end of that row (and past width*height on the last row). The caller
 * must supply source buffers for which that read is valid - confirm.
 * NOTE(review): step == 1 makes (coef - 1) zero, so the quotient is
 * +/-inf or NaN before the clamp - confirm callers never pass step == 1.
 */
__global__ void transpose2(float *lsum, float *denom, float *src1, float *src2, int width, int height, int step) {
    // One extra column of padding per tile so that the column-wise tile
    // writes below do not map a whole warp onto the same shared-memory bank.
    __shared__ float data1[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];
    __shared__ float data2[BLOCK_SIZE_2D][BLOCK_SIZE_2D + 1];

    // Origin of this block's tile in the sources.
    const int dx = blockIdx.x * blockDim.x;
    const int dy = blockIdx.y * blockDim.y;

    // Source coordinates handled by this thread.
    const int x = threadIdx.x + dx;
    const int y = threadIdx.y + dy;

    // Grid-tail guard: the grid rarely divides width/height evenly.
    if (x < width && y < height) {
        data1[threadIdx.x][threadIdx.y] =
            src1[x + step + y*width] - src1[x + y*width];
        data2[threadIdx.x][threadIdx.y] =
            src2[x + step + y*width] - src2[x + y*width];
    }

    // The elements read below were written by a different thread of the
    // block, so every thread must reach this barrier (it is deliberately
    // outside the guards).
    __syncthreads();

    // Output coordinates: block offsets are swapped so that consecutive
    // threadIdx.x values write consecutive addresses of the outputs.
    const int tx = threadIdx.x + dy;   // output column == source row
    const int ty = threadIdx.y + dx;   // output row    == source column

    // This is the same condition under which the tile elements read here
    // were filled, so no uninitialised shared memory is ever read.
    if (tx < height && ty < width) {
        float val1 = data1[threadIdx.y][threadIdx.x];
        float val2 = data2[threadIdx.y][threadIdx.x];
        float coef = step * step;

        lsum[tx + ty*height] = val1;
        // Clamp at zero before the square root: rounding can push the
        // quotient slightly negative.
        denom[tx + ty*height] = sqrtf(fmaxf((val2 - val1*val1/coef)/(coef - 1), 0.0f));
    }
}