1
static __global__ void vecMul(cuComplex *a, cuComplex *b, int size) {
1
static __global__ void vecMul(cuComplex *a, cuComplex *b, int pitch, int size) {
4
int i = threadIdx.x + blockIdx.x*size;
4
int point = blockIdx.y * blockDim.y + threadIdx.y;
5
int i = threadIdx.x + blockIdx.x * blockDim.x + point*pitch;
6
7
tmp = a[i].x * b[i].x - a[i].y * b[i].y;
7
8
a[i].y = a[i].x * b[i].y + a[i].y * b[i].x;
12
static __global__ void vecPack(uint8_t *b, int bsize, cufftReal *a, int asize, int size) {
13
// int i = threadIdx.x + blockIdx.x*bsize;
14
// int i = bsize - threadIdx.x - 1 + (bsize - blockIdx.x - 1)*bsize;
16
// Invalid memory access (possibly)
17
// int i = (bsize - blockIdx.x)*bsize - threadIdx.x - 1;
18
int i = size - threadIdx.x - 1 + (size - blockIdx.x - 1)*bsize;
19
a[threadIdx.x + asize*blockIdx.x] = b[i];
12
static __global__ void vecPack(uint8_t *b, int bpitch, int bsize, cufftReal *a, int apitch, int asize, int size, int blocks_size) {
13
// Includes rotation on 180 grad
15
int point = blockIdx.y * blockDim.y + threadIdx.y;
17
//__fdiv_rz(pos / bsize)
18
int y = __float2int_rz(__fdividef(blockIdx.x, blocks_size));
19
int x = (blockIdx.x - y * blocks_size) * blockDim.x + threadIdx.x ;
22
int pos = blockIdx.x * blockDim.x + threadIdx.x;
23
int y = pos / (blocks_size * blockDim.x);
24
int x = pos - (y * blocks_size * blockDim.x);
27
if ((x < size)&&(y < size)) {
28
int i = (size - y - 1)*bsize + size - x - 1;
29
a[point * apitch + y * asize + x] = b[point * bpitch + i];
34
static __global__ void vecPackFast(uint8_t *b, int bpitch, int bsize, cufftReal *a, int apitch, int asize, int size, int blocks_shift) {
35
__shared__ float data[CP_BLOCK_SIZE][SIDE_BLOCK_SIZE + 1];
37
int point = blockIdx.y * blockDim.y + threadIdx.y;
39
int y = blockIdx.x>>blocks_shift;
40
int bx = (blockIdx.x - (y<<blocks_shift)) * blockDim.x ;
41
int x = bx + threadIdx.x;
43
// int x = (blockIdx.x - (y<<blocks_shift)) * blockDim.x + threadIdx.x ;
45
// threadIdx.x depends only on x
46
data[threadIdx.y][threadIdx.x] = b[point * bpitch + y * bsize + x];
50
int pos = size - bx - blockDim.x + threadIdx.x;
52
if ((pos>=0)&&(y < size)) {
53
a[point * apitch + (size - y - 1) * asize + pos] =
54
data[threadIdx.y][blockDim.x - threadIdx.x - 1];
57
// if ((x < size)&&(y < size)) {
58
// int i = (size - y - 1)*bsize + size - x - 1;
59
// a[point * apitch + y * asize + x] = b[point * bpitch + i];
22
65
static __global__ void vecPackBase(
23
66
uint8_t *b, int bsize,
87
130
cufftReal *corr, float corr_scale,
88
131
float *lsum, float *lsum_scale_ptr, float lsum_mult,
89
132
float *denom, float *denom_scale_ptr,
92
int pos = threadIdx.x + blockIdx.x*size;
94
float lsum_scale = (*lsum_scale_ptr) * lsum_mult;
95
float denom_scale = (*denom_scale_ptr);
135
// int pos = threadIdx.x + blockIdx.x*size;
137
int point = blockIdx.y * blockDim.y + threadIdx.y;
138
int pos = threadIdx.x + blockIdx.x * blockDim.x + point * pitch;
141
float lsum_scale = lsum_scale_ptr[point] * lsum_mult;
142
float denom_scale = denom_scale_ptr[point];
97
144
if (denom[pos]&&denom_scale) {
98
145
res[pos] = (corr[pos] * corr_scale - lsum[pos]*lsum_scale) / (denom[pos] * denom_scale);