diff options
Diffstat (limited to 'cuda/2d')
| -rw-r--r-- | cuda/2d/algo.cu | 10 | ||||
| -rw-r--r-- | cuda/2d/arith.cu | 182 | ||||
| -rw-r--r-- | cuda/2d/arith.h | 37 | ||||
| -rw-r--r-- | cuda/2d/astra.cu | 44 | ||||
| -rw-r--r-- | cuda/2d/cgls.cu | 52 | ||||
| -rw-r--r-- | cuda/2d/darthelper.cu | 62 | ||||
| -rw-r--r-- | cuda/2d/dataop.cu | 16 | ||||
| -rw-r--r-- | cuda/2d/em.cu | 44 | ||||
| -rw-r--r-- | cuda/2d/fan_bp.cu | 28 | ||||
| -rw-r--r-- | cuda/2d/fan_fp.cu | 26 | ||||
| -rw-r--r-- | cuda/2d/fft.cu | 8 | ||||
| -rw-r--r-- | cuda/2d/fft.h | 4 | ||||
| -rw-r--r-- | cuda/2d/par_bp.cu | 28 | ||||
| -rw-r--r-- | cuda/2d/par_fp.cu | 38 | ||||
| -rw-r--r-- | cuda/2d/sart.cu | 60 | ||||
| -rw-r--r-- | cuda/2d/sirt.cu | 72 | ||||
| -rw-r--r-- | cuda/2d/util.cu | 46 | ||||
| -rw-r--r-- | cuda/2d/util.h | 4 | 
18 files changed, 365 insertions, 396 deletions
| diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu index f04607f..71cbfb3 100644 --- a/cuda/2d/algo.cu +++ b/cuda/2d/algo.cu @@ -214,11 +214,11 @@ bool ReconAlgo::setMaxConstraint(float fMax)  bool ReconAlgo::allocateBuffers()  {  	bool ok; -	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	if (!ok)  		return false; -	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); +	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);  	if (!ok) {  		cudaFree(D_volumeData);  		D_volumeData = 0; @@ -226,7 +226,7 @@ bool ReconAlgo::allocateBuffers()  	}  	if (useVolumeMask) { -		ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch); +		ok = allocateVolume(D_maskData, dims.iVolWidth, dims.iVolHeight, maskPitch);  		if (!ok) {  			cudaFree(D_volumeData);  			cudaFree(D_sinoData); @@ -237,7 +237,7 @@ bool ReconAlgo::allocateBuffers()  	}  	if (useSinogramMask) { -		ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch); +		ok = allocateVolume(D_smaskData, dims.iProjDets, dims.iProjAngles, smaskPitch);  		if (!ok) {  			cudaFree(D_volumeData);  			cudaFree(D_sinoData); @@ -271,7 +271,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit  		return false;  	// rescale sinogram to adjust for pixel size -	processVol<opMul,SINO>(D_sinoData, fSinogramScale, +	processVol<opMul>(D_sinoData, fSinogramScale,  	                       //1.0f/(fPixelSize*fPixelSize),  	                       sinoPitch,  	                       dims.iProjDets, dims.iProjAngles); diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index 1ee02ca..42c2c98 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -144,14 +144,14 @@ struct opMulMask { -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns  	}  } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned  	}  } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns  	}  } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned  	}  } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u  	}  } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat>  __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return;  	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; -	unsigned int off = (y+padY)*pitch+x+padX; +	unsigned int off = y*pitch+x;  	for (unsigned int i = 0; i < repeat; ++i) {  		if (y >= height)  			break; @@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, unsigned int width, unsigned int height)  {  	float* D_out;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	processVol<op, t>(D_out, pitch, width, height); +	processVol<op>(D_out, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch);  	cudaFree(D_out);  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, float param, unsigned int width, unsigned int height)  {  	float* D_out;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	processVol<op, t>(D_out, param, pitch, width, height); +	processVol<op>(D_out, param, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch);  	cudaFree(D_out);  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)  {  	float* D_out1;  	float* D_out2;  	unsigned int pitch; -	allocateVolume(D_out1, width+2, height+2, pitch); +	allocateVolume(D_out1, width, height, pitch);  	copyVolumeToDevice(out1, width, width, height, D_out1, pitch); -	allocateVolume(D_out2, width+2, height+2, pitch); +	allocateVolume(D_out2, width, height, pitch);  	copyVolumeToDevice(out2, width, width, height, D_out2, pitch); -	processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height); +	processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height);  	copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);  	copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); @@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)  {  	float* D_out;  	float* D_in;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	allocateVolume(D_in, width+2, height+2, pitch); +	allocateVolume(D_in, width, height, pitch);  	copyVolumeToDevice(in, width, width, height, D_in, pitch); -	processVol<op, t>(D_out, D_in, pitch, width, height); +	processVol<op>(D_out, D_in, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in  	cudaFree(D_in);  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)  {  	float* D_out;  	float* D_in;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	allocateVolume(D_in, width+2, height+2, pitch); +	allocateVolume(D_in, width, height, pitch);  	copyVolumeToDevice(in, width, width, height, D_in, pitch); -	processVol<op, t>(D_out, D_in, param, pitch, width, height); +	processVol<op>(D_out, D_in, param, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width  	cudaFree(D_in);  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)  {  	float* D_out; @@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int  	float* D_in2;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	allocateVolume(D_in1, width+2, height+2, pitch); +	allocateVolume(D_in1, width, height, pitch);  	copyVolumeToDevice(in1, width, width, height, D_in1, pitch); -	allocateVolume(D_in2, width+2, height+2, pitch); +	allocateVolume(D_in2, width, height, pitch);  	copyVolumeToDevice(in2, width, width, height, D_in2, pitch); -	processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height); +	processVol<op>(D_out, D_in1, D_in2, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int  	cudaFree(D_in2);  } -template<typename op, VolType t> +template<typename op>  void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)  {  	float* D_out; @@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,  	float* D_in2;  	unsigned int pitch; -	allocateVolume(D_out, width+2, height+2, pitch); +	allocateVolume(D_out, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_out, pitch); -	allocateVolume(D_in1, width+2, height+2, pitch); +	allocateVolume(D_in1, width, height, pitch);  	copyVolumeToDevice(in1, width, width, height, D_in1, pitch); -	allocateVolume(D_in2, width+2, height+2, pitch); +	allocateVolume(D_in2, width, height, pitch);  	copyVolumeToDevice(in2, width, width, height, D_in2, pitch); -	processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height); +	processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_out, pitch); @@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+511)/512); -	devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); +	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); +	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); +	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); +	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); +	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); +	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);  	cudaTextForceKernelsCompletion();  } -template<typename op, VolType t> +template<typename op>  void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)  {  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); +	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);  	cudaTextForceKernelsCompletion();  } @@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  	} @@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  	} @@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo  	unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut1 += step;  		pfOut2 += step;  	} @@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  		pfIn += step;  	} @@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  		pfIn += step;  	} @@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  		pfIn1 += step;  		pfIn2 += step; @@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc  	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;  	for (unsigned int i = 0; i < dims.iVolZ; ++i) { -		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); +		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);  		pfOut += step;  		pfIn1 += step;  		pfIn2 += step; @@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  	} @@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  	} @@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl  	unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut1 += step;  		pfOut2 += step;  	} @@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  		pfIn += step;  	} @@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  		pfIn += step;  	} @@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  		pfIn1 += step;  		pfIn2 += step; @@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit  	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;  	for (unsigned int i = 0; i < dims.iProjV; ++i) { -		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); +		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);  		pfOut += step;  		pfIn1 += step;  		pfIn2 += step; @@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit  #define INST_DFtoD(name) \ -  template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);  #define INST_DtoD(name) \ -  template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);  #define INST_DDtoD(name) \ -  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);  #define INST_DDFtoD(name) \ -  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);  #define INST_toD(name) \ -  template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);  #define INST_FtoD(name) \ -  template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);  #define INST_FFtoDD(name) \ -  template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ -  template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ -  template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ -  template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ +  template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ +  template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \    template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \    template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h index c8c7b41..d745aef 100644 --- a/cuda/2d/arith.h +++ b/cuda/2d/arith.h @@ -55,28 +55,21 @@ struct opSetMaskedValues;  struct opMulMask; - -enum VolType { -  SINO = 0, -  VOL = 1 -}; - - -template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); - -template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, float param, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); +template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); + +template<typename op> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);  template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);  template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu index 2240629..1c2e623 100644 --- a/cuda/2d/astra.cu +++ b/cuda/2d/astra.cu @@ -206,13 +206,13 @@ bool AstraFBP::init(int iGPUIndex)  		}  	} -	bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch); +	bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth, pData->dims.iVolHeight, pData->volumePitch);  	if (!ok)  	{  		return false;  	} -	ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch); +	ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets, pData->dims.iProjAngles, pData->sinoPitch);  	if (!ok)  	{  		cudaFree(pData->D_volumeData); @@ -241,7 +241,7 @@ bool AstraFBP::setSinogram(const float* pfSinogram,  		return false;  	// rescale sinogram to adjust for pixel size -	processVol<opMul,SINO>(pData->D_sinoData, +	processVol<opMul>(pData->D_sinoData,  	                       1.0f/(pData->fPixelSize*pData->fPixelSize),  	                       pData->sinoPitch,  	                       pData->dims.iProjDets, pData->dims.iProjAngles); @@ -270,7 +270,7 @@ bool AstraFBP::run()  		return false;  	} -	zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2); +	zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth, pData->dims.iVolHeight);  	bool ok = false; @@ -283,11 +283,11 @@ bool AstraFBP::run()  		allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); -		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); +		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);  		applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter); -		runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); +		runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);  		freeComplexOnDevice(pDevComplexSinogram); @@ -299,7 +299,7 @@ bool AstraFBP::run()  		return false;  	} -	processVol<opMul,VOL>(pData->D_volumeData, +	processVol<opMul>(pData->D_volumeData,  	                      (M_PI / 2.0f) / (float)pData->dims.iProjAngles,  	                      pData->volumePitch,  	                      pData->dims.iVolWidth, pData->dims.iVolHeight); @@ -443,7 +443,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =  			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);  			delete[] pfHostRealFilter; -			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); +			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);  			cudaFree(pfDevRealFilter); @@ -478,7 +478,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =  			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);  			delete[] pfHostRealFilter; -			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); +			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);  			cudaFree(pfDevRealFilter); @@ -515,7 +515,7 @@ bool BPalgo::init()  bool BPalgo::iterate(unsigned int)  {  	// TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant -	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);  	callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);  	return true;  } @@ -525,12 +525,12 @@ float BPalgo::computeDiffNorm()  	float *D_projData;  	unsigned int projPitch; -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice);  	callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); -	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	cudaFree(D_projData); @@ -579,14 +579,14 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,  	float* D_volumeData;  	unsigned int volumePitch; -	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	if (!ok)  		return false;  	float* D_sinoData;  	unsigned int sinoPitch; -	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); +	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);  	if (!ok) {  		cudaFree(D_volumeData);  		return false; @@ -601,7 +601,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,  		return false;  	} -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f);  	if (!ok) {  		cudaFree(D_volumeData); @@ -666,14 +666,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,  	float* D_volumeData;  	unsigned int volumePitch; -	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	if (!ok)  		return false;  	float* D_sinoData;  	unsigned int sinoPitch; -	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); +	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);  	if (!ok) {  		cudaFree(D_volumeData);  		return false; @@ -688,7 +688,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,  		return false;  	} -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	// TODO: Turn this geometry conversion into a util function  	SFanProjection* projs = new SFanProjection[dims.iProjAngles]; @@ -777,14 +777,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,  	float* D_volumeData;  	unsigned int volumePitch; -	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	if (!ok)  		return false;  	float* D_sinoData;  	unsigned int sinoPitch; -	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); +	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);  	if (!ok) {  		cudaFree(D_volumeData);  		return false; @@ -799,7 +799,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,  		return false;  	} -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f); diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu index 5b1cf46..df8db0b 100644 --- a/cuda/2d/cgls.cu +++ b/cuda/2d/cgls.cu @@ -73,16 +73,16 @@ void CGLS::reset()  bool CGLS::init()  {  	// Lifetime of z: within an iteration -	allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch); +	allocateVolume(D_z, dims.iVolWidth, dims.iVolHeight, zPitch);  	// Lifetime of p: full algorithm -	allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch); +	allocateVolume(D_p, dims.iVolWidth, dims.iVolHeight, pPitch);  	// Lifetime of r: full algorithm -	allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch); +	allocateVolume(D_r, dims.iProjDets, dims.iProjAngles, rPitch);  	// Lifetime of w: within an iteration -	allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch); +	allocateVolume(D_w, dims.iProjDets, dims.iProjAngles, wPitch);  	// TODO: check if allocations succeeded  	return true; @@ -120,13 +120,13 @@ bool CGLS::iterate(unsigned int iterations)  	if (!sliceInitialized) {  		// copy sinogram -		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  		// r = sino - A*x  		if (useVolumeMask) {  			// Use z as temporary storage here since it is unused -			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); +			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);  			callFP(D_z, zPitch, D_r, rPitch, -1.0f);  		} else {  			callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f); @@ -134,13 +134,13 @@ bool CGLS::iterate(unsigned int iterations)  		// p = A'*r -		zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		zeroVolume(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);  		callBP(D_p, pPitch, D_r, rPitch);  		if (useVolumeMask) -			processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opMul>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); -		gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); +		gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);  		sliceInitialized = true;  	} @@ -150,32 +150,32 @@ bool CGLS::iterate(unsigned int iterations)  	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {  		// w = A*p -		zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles); +		zeroVolume(D_w, wPitch, dims.iProjDets, dims.iProjAngles);  		callFP(D_p, pPitch, D_w, wPitch, 1.0f);  		// alpha = gamma / <w,w> -		float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +		float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);  		float alpha = gamma / ww;  		// x += alpha*p -		processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opAddScaled>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);  		// r -= alpha*w -		processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); +		processVol<opAddScaled>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);  		// z = A'*r -		zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		zeroVolume(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);  		callBP(D_z, zPitch, D_r, rPitch);  		if (useVolumeMask) -			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);  		float beta = 1.0f / gamma; -		gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); +		gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);  		beta *= gamma;  		// p = z + beta*p -		processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opScaleAndAdd>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);  	} @@ -189,12 +189,12 @@ float CGLS::computeDiffNorm()  	// used outside of iterations.  	// copy sinogram to w -	cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); +			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);  			callFP(D_z, zPitch, D_w, wPitch, -1.0f);  	} else {  			callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f); @@ -202,7 +202,7 @@ float CGLS::computeDiffNorm()  	// compute norm of D_w -	float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);  	return sqrt(s);  } @@ -264,12 +264,12 @@ int main()  	dims.iRaysPerDet = 1;  	unsigned int volumePitch, sinoPitch; -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); -	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); +	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	printf("pitch: %u\n", sinoPitch);  	unsigned int y, x; diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu index 768d14e..b61eaae 100644 --- a/cuda/2d/darthelper.cu +++ b/cuda/2d/darthelper.cu @@ -33,7 +33,7 @@ $Id$  namespace astraCUDA {  // CUDA function for the selection of ROI -__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height)  {  	float x = (float)(threadIdx.x + 16*blockIdx.x);  	float y = (float)(threadIdx.y + 16*blockIdx.y); @@ -44,7 +44,7 @@ __global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsign  	if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f)   	{  		float* d = (float*)in; -		unsigned int o = (y+padY)*pitch+x+padX;  +		unsigned int o = y*pitch+x;   		d[o] = 0.0f;  	}  } @@ -54,12 +54,12 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height  	float* D_data;  	unsigned int pitch; -	allocateVolume(D_data, width+2, height+2, pitch); +	allocateVolume(D_data, width, height, pitch);  	copyVolumeToDevice(out, width, width, height, D_data, pitch);  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16); -	devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1); +	devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_data, pitch); @@ -70,7 +70,7 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height  // CUDA function for the masking of DART with a radius == 1 -__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -80,7 +80,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns  		float* d = (float*)in;  		float* m = (float*)mask; -		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. +		unsigned int o2 = y*pitch+x; // On this row.  		unsigned int o1 = o2 - pitch; // On previous row.  		unsigned int o3 = o2 + pitch; // On next row. @@ -101,7 +101,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns  // CUDA function for the masking of DART with a radius > 1 -__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -115,13 +115,13 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con  		int r = radius;  		// o2: index of the current center pixel -		int o2 = (y+padY)*pitch+x+padX; +		int o2 = y*pitch+x;  		if (conn == 8) // 8-connected  		{  			for (int row = -r; row <= r; row++)   			{ -				int o1 = (y+padY+row)*pitch+x+padX;  +				int o1 = (y+row)*pitch+x;   				for (int col = -r; col <= r; col++)   				{  					if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} @@ -131,7 +131,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con  		else if (conn == 4) // 4-connected  		{  			// horizontal -			unsigned int o1 = (y+padY)*pitch+x+padX;  +			unsigned int o1 = y*pitch+x;   			for (int col = -r; col <= r; col++)   			{  				if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} @@ -140,7 +140,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con  			// vertical  			for (int row = -r; row <= r; row++)   			{ -				unsigned int o1 = (y+padY+row)*pitch+x+padX;  +				unsigned int o1 = (y+row)*pitch+x;   				if (d[o1] != d[o2]) {m[o2] = 1.0f; return;}  			}  		} @@ -149,7 +149,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con  // CUDA function for the masking of ADART with a radius == 1 -__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -159,7 +159,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un  		float* d = (float*)in;  		float* m = (float*)mask; -		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. +		unsigned int o2 = y*pitch+x; // On this row.  		unsigned int o1 = o2 - pitch; // On previous row.  		unsigned int o3 = o2 + pitch; // On next row. @@ -186,7 +186,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un  // CUDA function for the masking of ADART with a radius > 1 -__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -199,13 +199,13 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co  		int r = radius; -		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. +		unsigned int o2 = y*pitch+x; // On this row.  		if (conn == 8)  		{  			for (int row = -r; row <= r; row++)   			{ -				unsigned int o1 = (y+padY+row)*pitch+x+padX;  +				unsigned int o1 = (y+row)*pitch+x;   				for (int col = -r; col <= r; col++)   				{  					if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} @@ -223,7 +223,7 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co  			// vertical  			for (int row = -r; row <= r; row++)   			{ -				unsigned int o1 = (y+padY+row)*pitch+x+padX;  +				unsigned int o1 = (y+row)*pitch+x;   				if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}  			}  		} @@ -237,23 +237,23 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne  	float* D_maskData;  	unsigned int pitch; -	allocateVolume(D_segmentationData, width+2, height+2, pitch); +	allocateVolume(D_segmentationData, width, height, pitch);  	copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch); -	allocateVolume(D_maskData, width+2, height+2, pitch); -	zeroVolume(D_maskData, pitch, width+2, height+2); +	allocateVolume(D_maskData, width, height, pitch); +	zeroVolume(D_maskData, pitch, width, height);  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16);  	if (threshold == 1 && radius == 1) -		devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1); +		devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height);  	else if (threshold > 1 && radius == 1) -		devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); +		devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height);  	else if (threshold == 1 && radius > 1) -		devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); +		devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height);  	else  -		devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); +		devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height);  	copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch); @@ -263,7 +263,7 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne  } -__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -274,13 +274,13 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns  		float* d = (float*)in;  		float* m = (float*)out; -		unsigned int o2 = (y+padY)*pitch+x+padX; +		unsigned int o2 = y*pitch+x;  		int r = radius;  		float res = -d[o2];  		for (int row = -r; row < r; row++)   		{ -			unsigned int o1 = (y+padY+row)*pitch+x+padX;  +			unsigned int o1 = (y+row)*pitch+x;   			for (int col = -r; col <= r; col++)   			{  				res += d[o1+col]; @@ -295,7 +295,7 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns  } -__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	unsigned int y = threadIdx.y + 16*blockIdx.y; @@ -305,7 +305,7 @@ __global__ void devDartSmoothing(float* out, const float* in, float b, unsigned  		float* d = (float*)in;  		float* m = (float*)out; -		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. +		unsigned int o2 = y*pitch+x; // On this row.  		unsigned int o1 = o2 - pitch; // On previous row.  		unsigned int o3 = o2 + pitch; // On next row. @@ -329,9 +329,9 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un  	dim3 blockSize(16,16);  	dim3 gridSize((width+15)/16, (height+15)/16);  	if (radius == 1) -		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1); +		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height);  	else -		devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1); +		devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height);  	copyVolumeFromDevice(out, width, width, height, D_outData, pitch); diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu index 68573b2..9dc193e 100644 --- a/cuda/2d/dataop.cu +++ b/cuda/2d/dataop.cu @@ -39,10 +39,10 @@ void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigne  	float* D_data2;  	unsigned int pitch; -	allocateVolume(D_data1, width+2, height+2, pitch); +	allocateVolume(D_data1, width, height, pitch);  	copyVolumeToDevice(data1, width, width, height, D_data1, pitch); -	allocateVolume(D_data2, width+2, height+2, pitch); +	allocateVolume(D_data2, width, height, pitch);  	copyVolumeToDevice(data2, width, width, height, D_data2, pitch);  	processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height); @@ -59,10 +59,10 @@ void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsig  	float* D_mask;  	unsigned int pitch; -	allocateVolume(D_data, width+2, height+2, pitch); +	allocateVolume(D_data, width, height, pitch);  	copyVolumeToDevice(data, width, width, height, D_data, pitch); -	allocateVolume(D_mask, width+2, height+2, pitch); +	allocateVolume(D_mask, width, height, pitch);  	copyVolumeToDevice(mask, width, width, height, D_mask, pitch);  	processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height); @@ -79,7 +79,7 @@ void operationVolumeMultScalar(float* data, float scalar, unsigned int width, un  	float* D_data;  	unsigned int pitch; -	allocateVolume(D_data, width+2, height+2, pitch); +	allocateVolume(D_data, width, height, pitch);  	copyVolumeToDevice(data, width, width, height, D_data, pitch);  	processVol<opMul, VOL>(D_data, scalar, pitch, width, height); @@ -96,10 +96,10 @@ void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned  	float* D_data2;  	unsigned int pitch; -	allocateVolume(D_data1, width+2, height+2, pitch); +	allocateVolume(D_data1, width, height, pitch);  	copyVolumeToDevice(data1, width, width, height, D_data1, pitch); -	allocateVolume(D_data2, width+2, height+2, pitch); +	allocateVolume(D_data2, width, height, pitch);  	copyVolumeToDevice(data2, width, width, height, D_data2, pitch);  	processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height); @@ -116,7 +116,7 @@ void operationVolumeAddScalar(float* data, float scalar, unsigned int width, uns  	float* D_data;  	unsigned int pitch; -	allocateVolume(D_data, width+2, height+2, pitch); +	allocateVolume(D_data, width, height, pitch);  	copyVolumeToDevice(data, width, width, height, D_data, pitch);  	processVol<opAdd, VOL>(D_data, scalar, pitch, width, height); diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu index 74d1bbf..cf3f10c 100644 --- a/cuda/2d/em.cu +++ b/cuda/2d/em.cu @@ -73,14 +73,14 @@ void EM::reset()  bool EM::init()  { -	allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); -	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch); +	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); -	allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); -	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); +	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); -	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); +	zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	// We can't precompute pixelWeights when using a volume mask  #if 0  @@ -94,22 +94,22 @@ bool EM::init()  bool EM::precomputeWeights()  { -	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);  #if 0  	if (useSinogramMask) {  		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);  	} else  #endif  	{ -		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); +		processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);  		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);  	} -	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); +	processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);  #if 0  	if (useVolumeMask) {  		// scale pixel weights with mask to zero out masked pixels -		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);  	}  #endif @@ -129,18 +129,18 @@ bool EM::iterate(unsigned int iterations)  	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {  		// Do FP of volumeData  -		zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +		zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  		callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f);  		// Divide sinogram by FP (into projData) -		processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); +		processVol<opDividedBy>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);  		// Do BP of projData into tmpData -		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  		callBP(D_tmpData, tmpPitch, D_projData, projPitch);  		// Multiply volumeData with tmpData divided by pixel weights -		processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opMul2>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);  	} @@ -150,12 +150,12 @@ bool EM::iterate(unsigned int iterations)  float EM::computeDiffNorm()  {  	// copy sinogram to projection data -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); +			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else {  			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -164,7 +164,7 @@ float EM::computeDiffNorm()  	// compute norm of D_projData -	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	return sqrt(s);  } @@ -218,12 +218,12 @@ int main()  	dims.iRaysPerDet = 1;  	unsigned int volumePitch, sinoPitch; -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); -	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); +	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	printf("pitch: %u\n", sinoPitch);  	unsigned int y, x; diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu index 12086c0..9bec25d 100644 --- a/cuda/2d/fan_bp.cu +++ b/cuda/2d/fan_bp.cu @@ -61,12 +61,12 @@ __constant__ float gC_DetUX[g_MaxAngles];  __constant__ float gC_DetUY[g_MaxAngles]; -static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)  {  	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); -	gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp; -	gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp; +	gT_FanProjTexture.addressMode[0] = mode; +	gT_FanProjTexture.addressMode[1] = mode;  	gT_FanProjTexture.filterMode = cudaFilterModeLinear;  	gT_FanProjTexture.normalized = false; @@ -116,12 +116,12 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s  		const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;  		const float fDen = fDetUX * fYD - fDetUY * fXD; -		const float fT = fNum / fDen + 1.0f; +		const float fT = fNum / fDen;  		fVal += tex2D(gT_FanProjTexture, fT, fA);  		fA += 1.0f;  	} -	volData[(Y+1)*volPitch+X+1] += fVal; +	volData[Y*volPitch+X] += fVal;  }  // supersampling version @@ -171,7 +171,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in  				const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;  				const float fDen = fDetUX * fYD - fDetUY * fXD; -				const float fT = fNum / fDen + 1.0f; +				const float fT = fNum / fDen;  				fVal += tex2D(gT_FanProjTexture, fT, fA);  				fY -= fSubStep;  			} @@ -180,7 +180,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in  		fA += 1.0f;  	} -	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);  } @@ -222,7 +222,7 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi  	const float fT = fNum / fDen;  	const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f); -	volData[(Y+1)*volPitch+X+1] += fVal; +	volData[Y*volPitch+X] += fVal;  }  // Weighted BP for use in fan beam FBP @@ -268,12 +268,12 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un  		const float fWeight = fXD*fXD + fYD*fYD; -		const float fT = fNum / fDen + 1.0f; +		const float fT = fNum / fDen;  		fVal += tex2D(gT_FanProjTexture, fT, fA) / fWeight;  		fA += 1.0f;  	} -	volData[(Y+1)*volPitch+X+1] += fVal; +	volData[Y*volPitch+X] += fVal;  } @@ -331,7 +331,7 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,  	// TODO: process angles block by block  	assert(dims.iProjAngles <= g_MaxAngles); -	bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	// transfer angles to constant memory  	float* tmp = new float[dims.iProjAngles]; @@ -376,7 +376,7 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,                  const SDimensions& dims, const SFanProjection* angles)  {  	// only one angle -	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); +	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);  	// transfer angle to constant memory  #define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) @@ -442,10 +442,10 @@ int main()  #undef ROTATE0 -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);  	printf("pitch: %u\n", projPitch);  	unsigned int y, x; diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu index cf9f352..b24029c 100644 --- a/cuda/2d/fan_fp.cu +++ b/cuda/2d/fan_fp.cu @@ -67,8 +67,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i  	cudaMallocArray(&dataArray, &channelDesc, width, height);  	cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); -	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp; -	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp; +	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeBorder; +	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeBorder;  	gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;  	gT_FanVolumeTexture.normalized = false; @@ -129,8 +129,8 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig  		// intersect ray with first slice -		float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f; -		float fX = startSlice + 1.5f; +		float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 0.5f; +		float fX = startSlice + 0.5f;  		int endSlice = startSlice + g_blockSlices;  		if (endSlice > dims.iVolWidth) @@ -148,7 +148,7 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig  	} -	projData[angle*projPitch+detector+1] += fVal; +	projData[angle*projPitch+detector] += fVal;  } @@ -201,8 +201,8 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne  		// intersect ray with first slice -		float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f; -		float fY = startSlice + 1.5f; +		float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 0.5f; +		float fY = startSlice + 0.5f;  		int endSlice = startSlice + g_blockSlices;  		if (endSlice > dims.iVolHeight) @@ -221,7 +221,7 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne  	} -	projData[angle*projPitch+detector+1] += fVal; +	projData[angle*projPitch+detector] += fVal;  }  bool FanFP(float* D_volumeData, unsigned int volumePitch, @@ -233,7 +233,7 @@ bool FanFP(float* D_volumeData, unsigned int volumePitch,  	assert(dims.iProjAngles <= g_MaxAngles);  	cudaArray* D_dataArray; -	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);  	// transfer angles to constant memory  	float* tmp = new float[dims.iProjAngles]; @@ -325,10 +325,10 @@ int main()  #undef ROTATE0 -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);  	printf("pitch: %u\n", projPitch);  	unsigned int y, x; @@ -358,8 +358,8 @@ int main()  			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];  	printf("cpu norm: %f\n", s); -	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); -	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); +	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	printf("gpu norm: %f\n", s);  	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu index 79e4be7..75bd92c 100644 --- a/cuda/2d/fft.cu +++ b/cuda/2d/fft.cu @@ -209,7 +209,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,  }  bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, -                int _iSourcePitch, int _iSourcePadX, int _iProjDets, +                int _iSourcePitch, int _iProjDets,                  int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,                  cufftComplex * _pDevTargetComplex)  { @@ -221,7 +221,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,  	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)  	{ -		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX; +		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch;  		float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;  		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); @@ -241,7 +241,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,  bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,                   float * _pfRealTarget, -                 int _iTargetPitch, int _iTargetPadX, int _iProjDets, +                 int _iTargetPitch, int _iProjDets,                   int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)  {  	float * pfDevRealFFTTarget = NULL; @@ -264,7 +264,7 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,  	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)  	{  		const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; -		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX; +		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch;  		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));  	} diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h index 55324e5..d4428a5 100644 --- a/cuda/2d/fft.h +++ b/cuda/2d/fft.h @@ -45,13 +45,13 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,                                  cufftComplex * _pDevComplexTarget);  bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, -                int _iSourcePitch, int _iSourcePadX, int _iProjDets, +                int _iSourcePitch, int _iProjDets,                  int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,                  cufftComplex * _pDevTargetComplex);  bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,                   float * _pfRealTarget, -                 int _iTargetPitch, int _iTargetPadX, int _iProjDets, +                 int _iTargetPitch, int _iProjDets,                   int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);  void applyFilter(int _iProjectionCount, int _iFreqBinCount, diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu index d202341..fc99102 100644 --- a/cuda/2d/par_bp.cu +++ b/cuda/2d/par_bp.cu @@ -57,12 +57,12 @@ __constant__ float gC_angle_sin[g_MaxAngles];  __constant__ float gC_angle_cos[g_MaxAngles];  __constant__ float gC_angle_offset[g_MaxAngles]; -static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)  {  	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); -	gT_projTexture.addressMode[0] = cudaAddressModeClamp; -	gT_projTexture.addressMode[1] = cudaAddressModeClamp; +	gT_projTexture.addressMode[0] = mode; +	gT_projTexture.addressMode[1] = mode;  	gT_projTexture.filterMode = cudaFilterModeLinear;  	gT_projTexture.normalized = false; @@ -94,7 +94,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star  	float fVal = 0.0f;  	float fA = startAngle + 0.5f; -	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; +	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;  	if (offsets) { @@ -123,7 +123,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star  	} -	volData[(Y+1)*volPitch+X+1] += fVal; +	volData[Y*volPitch+X] += fVal;  }  // supersampling version @@ -150,7 +150,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s  	float fVal = 0.0f;  	float fA = startAngle + 0.5f; -	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; +	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;  	if (offsets) { @@ -196,7 +196,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s  	} -	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);  }  __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims) @@ -218,7 +218,7 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset  	const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;  	const float fVal = tex2D(gT_projTexture, fT, 0.5f); -	D_volData[(Y+1)*volPitch+X+1] += fVal; +	D_volData[Y*volPitch+X] += fVal;  } @@ -232,7 +232,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,  	float* angle_sin = new float[dims.iProjAngles];  	float* angle_cos = new float[dims.iProjAngles]; -	bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {  		angle_sin[i] = sinf(angles[i]); @@ -302,8 +302,10 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,               unsigned int angle, const SDimensions& dims,               const float* angles, const float* TOffsets)  { -	// only one angle -	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); +	// Only one angle. +	// We need to Clamp to the border pixels instead of to zero, because +	// SART weights with ray length. +	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);  	float angle_sin = sinf(angles[angle]);  	float angle_cos = cosf(angles[angle]); @@ -346,10 +348,10 @@ int main()  	unsigned int volumePitch, projPitch; -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);  	printf("pitch: %u\n", projPitch);  	unsigned int y, x; diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu index f811f98..097122b 100644 --- a/cuda/2d/par_fp.cu +++ b/cuda/2d/par_fp.cu @@ -74,8 +74,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i  	cudaMallocArray(&dataArray, &channelDesc, width, height);  	cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); -	gT_volumeTexture.addressMode[0] = cudaAddressModeClamp; -	gT_volumeTexture.addressMode[1] = cudaAddressModeClamp; +	gT_volumeTexture.addressMode[0] = cudaAddressModeBorder; +	gT_volumeTexture.addressMode[1] = cudaAddressModeBorder;  	gT_volumeTexture.filterMode = cudaFilterModeLinear;  	gT_volumeTexture.normalized = false; @@ -153,8 +153,8 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned  	float fVal = 0.0f;  	// project detector on slice -	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; -	float fS = startSlice + 1.5f; +	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; +	float fS = startSlice + 0.5f;  	int endSlice = startSlice + g_blockSlices;  	if (endSlice > dims.iVolWidth)  		endSlice = dims.iVolWidth; @@ -189,7 +189,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned  	} -	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +	D_projData[angle*projPitch+detector] += fVal * fDistCorr;  }  // projection for angles that are roughly vertical @@ -255,8 +255,8 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i  	fDistCorr *= outputScale;  	float fVal = 0.0f; -	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; -	float fS = startSlice+1.5f; +	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; +	float fS = startSlice+0.5f;  	int endSlice = startSlice + g_blockSlices;  	if (endSlice > dims.iVolHeight)  		endSlice = dims.iVolHeight; @@ -290,7 +290,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i  	} -	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +	D_projData[angle*projPitch+detector] += fVal * fDistCorr;  }  // projection for angles that are roughly horizontal @@ -331,8 +331,8 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u  	float fVal = 0.0f;  	// project detector on slice -	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; -	float fS = startSlice + 1.5f; +	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f; +	float fS = startSlice + 0.5f;  	int endSlice = startSlice + g_blockSlices;  	if (endSlice > dims.iVolWidth)  		endSlice = dims.iVolWidth; @@ -367,7 +367,7 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u  	} -	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +	D_projData[angle*projPitch+detector] += fVal * fDistCorr;  } @@ -408,8 +408,8 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns  	fDistCorr *= outputScale;  	float fVal = 0.0f; -	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; -	float fS = startSlice+1.5f; +	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f; +	float fS = startSlice+0.5f;  	int endSlice = startSlice + g_blockSlices;  	if (endSlice > dims.iVolHeight)  		endSlice = dims.iVolHeight; @@ -443,7 +443,7 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns  	} -	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +	D_projData[angle*projPitch+detector] += fVal * fDistCorr;  } @@ -457,7 +457,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,  	assert(dims.iProjAngles <= g_MaxAngles);  	cudaArray* D_dataArray; -	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);  	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); @@ -579,7 +579,7 @@ bool FP(float* D_volumeData, unsigned int volumePitch,  	assert(dims.fDetScale > 0.9999f);  	cudaArray* D_dataArray; -	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);  	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); @@ -682,10 +682,10 @@ int main()  	dims.iRaysPerDet = 1;  	unsigned int volumePitch, projPitch; -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);  	printf("pitch: %u\n", projPitch);  	unsigned int y, x; @@ -715,7 +715,7 @@ int main()  			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];  	printf("cpu norm: %f\n", s); -	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);  	printf("gpu norm: %f\n", s); diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index a40176d..7f499ce 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -39,14 +39,13 @@ $Id$  namespace astraCUDA { - +// FIXME: Remove these functions. (Outdated)  __global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)  {  	unsigned int x = threadIdx.x + 16*blockIdx.x;  	if (x >= width) return; -	// Copy result down and left one pixel. -	pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1]; +	pfOut[x] *= pfIn[x];  }  void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) @@ -106,18 +105,15 @@ void SART::reset()  bool SART::init()  {  	if (useVolumeMask) { -		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); -		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); +		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  	} -	// HACK: D_projData consists of two lines. The first is used padded, -	// the second unpadded. This is to satisfy the alignment requirements -	// of resp. FP and BP_SART. -	allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch); -	zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1); +	allocateVolume(D_projData, dims.iProjDets, 1, projPitch); +	zeroVolume(D_projData, projPitch, dims.iProjDets, 1); -	allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); -	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch); +	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	// We can't precompute lineWeights when using a mask  	if (!useVolumeMask) @@ -142,23 +138,23 @@ bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount)  bool SART::precomputeWeights()  { -	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); +	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	if (useVolumeMask) {  		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);  	} else {  		// Allocate tmpData temporarily -		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); -		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); +		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); -		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);  		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);  		cudaFree(D_tmpData);  		D_tmpData = 0;  	} -	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); +	processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	return true;  } @@ -181,12 +177,12 @@ bool SART::iterate(unsigned int iterations)  		}  		// copy one line of sinogram to projection data -		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice); +		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), 1, cudaMemcpyDeviceToDevice);  		// do FP, subtracting projection from sinogram  		if (useVolumeMask) { -				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -				processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); +				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  				callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f);  		} else {  				callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f); @@ -197,17 +193,17 @@ bool SART::iterate(unsigned int iterations)  		if (useVolumeMask) {  			// BP, mask, and add back  			// TODO: Try putting the masking directly in the BP -			zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +			zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  			callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle); -			processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opAddMul>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);  		} else {  			callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);  		}  		if (useMinConstraint) -			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);  		if (useMaxConstraint) -			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);  		iteration++; @@ -220,16 +216,16 @@ float SART::computeDiffNorm()  {  	unsigned int pPitch;  	float *D_p; -	allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch); -	zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_p, dims.iProjDets, dims.iProjAngles, pPitch); +	zeroVolume(D_p, pPitch, dims.iProjDets, dims.iProjAngles);  	// copy sinogram to D_p -	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); +			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else {  			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -237,7 +233,7 @@ float SART::computeDiffNorm()  	// compute norm of D_p -	float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles);  	cudaFree(D_p); @@ -267,11 +263,11 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,  {  	if (angles) {  		assert(!fanProjs); -		return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, +		return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,  		               angle, dims, angles, TOffsets);  	} else {  		assert(fanProjs); -		return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, +		return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch,  		                  angle, dims, fanProjs);  	} diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu index 31954e4..eb65962 100644 --- a/cuda/2d/sirt.cu +++ b/cuda/2d/sirt.cu @@ -88,17 +88,17 @@ void SIRT::reset()  bool SIRT::init()  { -	allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); -	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch); +	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); -	allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); -	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch); +	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight); -	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); -	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch); +	zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); -	allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); -	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch); +	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	// We can't precompute lineWeights and pixelWeights when using a mask  	if (!useVolumeMask && !useSinogramMask) @@ -110,33 +110,33 @@ bool SIRT::init()  bool SIRT::precomputeWeights()  { -	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); +	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	if (useVolumeMask) {  		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);  	} else { -		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);  		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);  	} -	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); +	processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);  	if (useSinogramMask) {  		// scale line weights with sinogram mask to zero out masked sinogram pixels -		processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); +		processVol<opMul>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);  	} -	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);  	if (useSinogramMask) {  		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);  	} else { -		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); +		processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);  		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);  	} -	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); +	processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);  	if (useVolumeMask) {  		// scale pixel weights with mask to zero out masked pixels -		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);  	}  	return true; @@ -160,7 +160,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD  	freeMinMaxMasks = true;  	bool ok = true;  	if (pfMinMaskData) { -		allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch); +		allocateVolume(D_minMaskData, dims.iVolWidth, dims.iVolHeight, minMaskPitch);  		ok = copyVolumeToDevice(pfMinMaskData, iPitch,  		                        dims.iVolWidth, dims.iVolHeight,  		                        D_minMaskData, minMaskPitch); @@ -169,7 +169,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD  		return false;  	if (pfMaxMaskData) { -		allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch); +		allocateVolume(D_maxMaskData, dims.iVolWidth, dims.iVolHeight, maxMaskPitch);  		ok = copyVolumeToDevice(pfMaxMaskData, iPitch,  		                        dims.iVolWidth, dims.iVolHeight,  		                        D_maxMaskData, maxMaskPitch); @@ -191,33 +191,33 @@ bool SIRT::iterate(unsigned int iterations)  	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {  		// copy sinogram to projection data -		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  		// do FP, subtracting projection from sinogram  		if (useVolumeMask) { -				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -				processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); +				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  				callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  		} else {  				callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);  		} -		processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); +		processVol<opMul>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); -		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); +		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  		callBP(D_tmpData, tmpPitch, D_projData, projPitch); -		processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); +		processVol<opAddMul>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);  		if (useMinConstraint) -			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);  		if (useMaxConstraint) -			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);  		if (D_minMaskData) -			processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMinMask>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);  		if (D_maxMaskData) -			processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); +			processVol<opClampMaxMask>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);  	}  	return true; @@ -226,12 +226,12 @@ bool SIRT::iterate(unsigned int iterations)  float SIRT::computeDiffNorm()  {  	// copy sinogram to projection data -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); -			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); +			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else {  			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -240,7 +240,7 @@ float SIRT::computeDiffNorm()  	// compute norm of D_projData -	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); +	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);  	return sqrt(s);  } @@ -300,12 +300,12 @@ int main()  	dims.iRaysPerDet = 1;  	unsigned int volumePitch, sinoPitch; -	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); -	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); +	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch); +	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);  	printf("pitch: %u\n", volumePitch); -	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); -	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); +	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch); +	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);  	printf("pitch: %u\n", sinoPitch);  	unsigned int y, x; diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 06f6714..8bb2f2f 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -36,11 +36,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,  		unsigned int width, unsigned int height,  		float* outD_data, unsigned int out_pitch)  { -	// TODO: a full memset isn't necessary. Only the edges.  	cudaError_t err; -	err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2); -	ASTRA_CUDA_ASSERT(err); -	err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); +	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);  	ASTRA_CUDA_ASSERT(err);  	assert(err == cudaSuccess);  	return true; @@ -50,7 +47,7 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,  		unsigned int width, unsigned int height,  		float* inD_data, unsigned int in_pitch)  { -	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); +	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);  	ASTRA_CUDA_ASSERT(err);  	return true;  } @@ -60,7 +57,7 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,  		unsigned int width, unsigned int height,  		float* inD_data, unsigned int in_pitch)  {    -	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); +	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);  	ASTRA_CUDA_ASSERT(err);  	return true;  } @@ -69,11 +66,8 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,  		unsigned int width, unsigned int height,  		float* outD_data, unsigned int out_pitch)  {    -	// TODO: a full memset isn't necessary. Only the edges.  	cudaError_t err; -	err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height); -	ASTRA_CUDA_ASSERT(err); -	err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); +	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);  	ASTRA_CUDA_ASSERT(err);  	return true;  } @@ -132,8 +126,7 @@ __global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)  __global__ void reduce2D(float *g_idata, float *g_odata,                           unsigned int pitch, -                         unsigned int nx, unsigned int ny, -                         unsigned int padX, unsigned int padY) +                         unsigned int nx, unsigned int ny)  {  	extern __shared__ float sdata[];  	const unsigned int tidx = threadIdx.x; @@ -145,11 +138,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,  	sdata[tid] = 0; -	if (x >= padX && x < padX + nx) { +	if (x < nx) { -		while (y < padY + ny) { -			if (y >= padY) -				sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); +		while (y < ny) { +			sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);  			y += 16 * gridDim.y;  		} @@ -180,11 +172,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,  }  float dotProduct2D(float* D_data, unsigned int pitch, -                   unsigned int width, unsigned int height, -                   unsigned int padX, unsigned int padY) +                   unsigned int width, unsigned int height)  { -	unsigned int bx = ((width+padX) + 15) / 16; -	unsigned int by = ((height+padY) + 127) / 128; +	unsigned int bx = (width + 15) / 16; +	unsigned int by = (height + 127) / 128;  	unsigned int shared_mem2 = sizeof(float) * 16 * 16;  	dim3 dimBlock2(16, 16); @@ -192,26 +183,27 @@ float dotProduct2D(float* D_data, unsigned int pitch,  	float* D_buf;  	cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) ); +	float* D_res = D_buf + (bx*by);  	// Step 1: reduce 2D from image to a single vector, taking sum of squares -	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY); +	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);  	cudaTextForceKernelsCompletion();  	// Step 2: reduce 1D: add up elements in vector  	if (bx * by > 512) -		reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by); +		reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_res, bx*by);  	else if (bx * by > 128) -		reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by); +		reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_res, bx*by);  	else if (bx * by > 32) -		reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by); +		reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_res, bx*by);  	else if (bx * by > 8) -		reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by); +		reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_res, bx*by);  	else -		reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by); +		reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_res, bx*by);  	float x; -	cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost); +	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);  	cudaTextForceKernelsCompletion(); diff --git a/cuda/2d/util.h b/cuda/2d/util.h index d31e2eb..ed4a45c 100644 --- a/cuda/2d/util.h +++ b/cuda/2d/util.h @@ -82,8 +82,8 @@ void reportCudaError(cudaError_t err);  float dotProduct2D(float* D_data, unsigned int pitch, -                   unsigned int width, unsigned int height, -                   unsigned int padX, unsigned int padY); +                   unsigned int width, unsigned int height); +  } | 
