18 files changed, 365 insertions, 396 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index f04607f..71cbfb3 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -214,11 +214,11 @@ bool ReconAlgo::setMaxConstraint(float fMax)
 bool ReconAlgo::allocateBuffers()
 {
 	bool ok;
-	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	if (!ok)
 		return false;
 
-	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
 	if (!ok) {
 		cudaFree(D_volumeData);
 		D_volumeData = 0;
@@ -226,7 +226,7 @@ bool ReconAlgo::allocateBuffers()
 	}
 
 	if (useVolumeMask) {
-		ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch);
+		ok = allocateVolume(D_maskData, dims.iVolWidth, dims.iVolHeight, maskPitch);
 		if (!ok) {
 			cudaFree(D_volumeData);
 			cudaFree(D_sinoData);
@@ -237,7 +237,7 @@ bool ReconAlgo::allocateBuffers()
 	}
 
 	if (useSinogramMask) {
-		ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch);
+		ok = allocateVolume(D_smaskData, dims.iProjDets, dims.iProjAngles, smaskPitch);
 		if (!ok) {
 			cudaFree(D_volumeData);
 			cudaFree(D_sinoData);
@@ -271,7 +271,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit
 		return false;
 
 	// rescale sinogram to adjust for pixel size
-	processVol<opMul,SINO>(D_sinoData, fSinogramScale,
+	processVol<opMul>(D_sinoData, fSinogramScale,
 	                       //1.0f/(fPixelSize*fPixelSize),
 	                       sinoPitch,
 	                       dims.iProjDets, dims.iProjAngles);
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index 1ee02ca..42c2c98 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -144,14 +144,14 @@ struct opMulMask {
 
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -161,14 +161,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -178,14 +178,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -197,14 +197,14 @@ __global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fPa
 
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -214,14 +214,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -231,14 +231,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -248,14 +248,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -280,51 +280,51 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
 
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, unsigned int width, unsigned int height)
 {
 	float* D_out;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
 
-	processVol<op, t>(D_out, pitch, width, height);
+	processVol<op>(D_out, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
 	cudaFree(D_out);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
 
-	processVol<op, t>(D_out, param, pitch, width, height);
+	processVol<op>(D_out, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
 	cudaFree(D_out);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
 {
 	float* D_out1;
 	float* D_out2;
 
 	unsigned int pitch;
-	allocateVolume(D_out1, width+2, height+2, pitch);
+	allocateVolume(D_out1, width, height, pitch);
 	copyVolumeToDevice(out1, width, width, height, D_out1, pitch);
-	allocateVolume(D_out2, width+2, height+2, pitch);
+	allocateVolume(D_out2, width, height, pitch);
 	copyVolumeToDevice(out2, width, width, height, D_out2, pitch);
 
-	processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+	processVol<op>(D_out1, D_out2, param1, param2, pitch, width, height);
 
 	copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
 	copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
@@ -334,19 +334,19 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign
 }
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
 {
 	float* D_out;
 	float* D_in;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in, width+2, height+2, pitch);
+	allocateVolume(D_in, width, height, pitch);
 	copyVolumeToDevice(in, width, width, height, D_in, pitch);
 
-	processVol<op, t>(D_out, D_in, pitch, width, height);
+	processVol<op>(D_out, D_in, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -354,19 +354,19 @@ void processVolCopy(float* out, const float* in, unsigned int width, unsigned in
 	cudaFree(D_in);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
 	float* D_in;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in, width+2, height+2, pitch);
+	allocateVolume(D_in, width, height, pitch);
 	copyVolumeToDevice(in, width, width, height, D_in, pitch);
 
-	processVol<op, t>(D_out, D_in, param, pitch, width, height);
+	processVol<op>(D_out, D_in, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -374,7 +374,7 @@ void processVolCopy(float* out, const float* in, float param, unsigned int width
 	cudaFree(D_in);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
 {
 	float* D_out;
@@ -382,14 +382,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
 	float* D_in2;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in1, width+2, height+2, pitch);
+	allocateVolume(D_in1, width, height, pitch);
 	copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
-	allocateVolume(D_in2, width+2, height+2, pitch);
+	allocateVolume(D_in2, width, height, pitch);
 	copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
 
-	processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+	processVol<op>(D_out, D_in1, D_in2, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -398,7 +398,7 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int
 	cudaFree(D_in2);
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
 {
 	float* D_out;
@@ -406,14 +406,14 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
 	float* D_in2;
 
 	unsigned int pitch;
-	allocateVolume(D_out, width+2, height+2, pitch);
+	allocateVolume(D_out, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_out, pitch);
-	allocateVolume(D_in1, width+2, height+2, pitch);
+	allocateVolume(D_in1, width, height, pitch);
 	copyVolumeToDevice(in1, width, width, height, D_in1, pitch);
-	allocateVolume(D_in2, width+2, height+2, pitch);
+	allocateVolume(D_in2, width, height, pitch);
 	copyVolumeToDevice(in2, width, width, height, D_in2, pitch);
 
-	processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+	processVol<op>(D_out, D_in1, D_in2, param, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_out, pitch);
 
@@ -430,80 +430,80 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param,
 
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+511)/512);
 
-	devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
-	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
@@ -533,7 +533,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -549,7 +549,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -566,7 +566,7 @@ void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, flo
 	unsigned int step = out1.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut1 += step;
 		pfOut2 += step;
 	}
@@ -585,7 +585,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -603,7 +603,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -622,7 +622,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -642,7 +642,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -672,7 +672,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -688,7 +688,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -705,7 +705,7 @@ void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, fl
 	unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut1 += step;
 		pfOut2 += step;
 	}
@@ -724,7 +724,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -742,7 +742,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -761,7 +761,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -781,7 +781,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -808,59 +808,45 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 
 
 #define INST_DFtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
 
 #define INST_DtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
 
 #define INST_DDtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
 
 #define INST_DDFtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
 
 
 #define INST_toD(name) \
-  template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
 
 #define INST_FtoD(name) \
-  template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out, float param, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
 
 #define INST_FFtoDD(name) \
-  template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
-  template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
-  template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVolCopy<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+  template void processVol<name>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
 
diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h
index c8c7b41..d745aef 100644
--- a/cuda/2d/arith.h
+++ b/cuda/2d/arith.h
@@ -55,28 +55,21 @@ struct opSetMaskedValues;
 struct opMulMask;
 
 
-
-enum VolType {
-  SINO = 0,
-  VOL = 1
-};
-
-
-template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
-
-template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
-template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
+template<typename op> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
+
+template<typename op> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
 
 template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
 template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index 2240629..1c2e623 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -206,13 +206,13 @@ bool AstraFBP::init(int iGPUIndex)
 		}
 	}
 
-	bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch);
+	bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth, pData->dims.iVolHeight, pData->volumePitch);
 	if (!ok)
 	{
 		return false;
 	}
 
-	ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch);
+	ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets, pData->dims.iProjAngles, pData->sinoPitch);
 	if (!ok)
 	{
 		cudaFree(pData->D_volumeData);
@@ -241,7 +241,7 @@ bool AstraFBP::setSinogram(const float* pfSinogram,
 		return false;
 
 	// rescale sinogram to adjust for pixel size
-	processVol<opMul,SINO>(pData->D_sinoData,
+	processVol<opMul>(pData->D_sinoData,
 	                       1.0f/(pData->fPixelSize*pData->fPixelSize),
 	                       pData->sinoPitch,
 	                       pData->dims.iProjDets, pData->dims.iProjAngles);
@@ -270,7 +270,7 @@ bool AstraFBP::run()
 		return false;
 	}
 
-	zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2);
+	zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth, pData->dims.iVolHeight);
 
 	bool ok = false;
 
@@ -283,11 +283,11 @@ bool AstraFBP::run()
 
 		allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram);
 
-		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
+		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
 
 		applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter);
 
-		runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
+		runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
 
 		freeComplexOnDevice(pDevComplexSinogram);
 
@@ -299,7 +299,7 @@ bool AstraFBP::run()
 		return false;
 	}
 
-	processVol<opMul,VOL>(pData->D_volumeData,
+	processVol<opMul>(pData->D_volumeData,
 	                      (M_PI / 2.0f) / (float)pData->dims.iProjAngles,
 	                      pData->volumePitch,
 	                      pData->dims.iVolWidth, pData->dims.iVolHeight);
@@ -443,7 +443,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =
 			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
 			delete[] pfHostRealFilter;
 
-			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
 
 			cudaFree(pfDevRealFilter);
 
@@ -478,7 +478,7 @@ bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* =
 			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
 			delete[] pfHostRealFilter;
 
-			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
 
 			cudaFree(pfDevRealFilter);
 
@@ -515,7 +515,7 @@ bool BPalgo::init()
 bool BPalgo::iterate(unsigned int)
 {
 	// TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 	callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);
 	return true;
 }
@@ -525,12 +525,12 @@ float BPalgo::computeDiffNorm()
 	float *D_projData;
 	unsigned int projPitch;
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
 
-	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice);
 	callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
 
-	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	cudaFree(D_projData);
 
@@ -579,14 +579,14 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,
 	float* D_volumeData;
 	unsigned int volumePitch;
 
-	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	if (!ok)
 		return false;
 
 	float* D_sinoData;
 	unsigned int sinoPitch;
 
-	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
 	if (!ok) {
 		cudaFree(D_volumeData);
 		return false;
@@ -601,7 +601,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,
 		return false;
 	}
 
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 	ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f);
 	if (!ok) {
 		cudaFree(D_volumeData);
@@ -666,14 +666,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 	float* D_volumeData;
 	unsigned int volumePitch;
 
-	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	if (!ok)
 		return false;
 
 	float* D_sinoData;
 	unsigned int sinoPitch;
 
-	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
 	if (!ok) {
 		cudaFree(D_volumeData);
 		return false;
@@ -688,7 +688,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 		return false;
 	}
 
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 
 	// TODO: Turn this geometry conversion into a util function
 	SFanProjection* projs = new SFanProjection[dims.iProjAngles];
@@ -777,14 +777,14 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 	float* D_volumeData;
 	unsigned int volumePitch;
 
-	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	ok = allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	if (!ok)
 		return false;
 
 	float* D_sinoData;
 	unsigned int sinoPitch;
 
-	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	ok = allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
 	if (!ok) {
 		cudaFree(D_volumeData);
 		return false;
@@ -799,7 +799,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 		return false;
 	}
 
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 
 	ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f);
 
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index 5b1cf46..df8db0b 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -73,16 +73,16 @@ void CGLS::reset()
 bool CGLS::init()
 {
 	// Lifetime of z: within an iteration
-	allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch);
+	allocateVolume(D_z, dims.iVolWidth, dims.iVolHeight, zPitch);
 
 	// Lifetime of p: full algorithm
-	allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch);
+	allocateVolume(D_p, dims.iVolWidth, dims.iVolHeight, pPitch);
 
 	// Lifetime of r: full algorithm
-	allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch);
+	allocateVolume(D_r, dims.iProjDets, dims.iProjAngles, rPitch);
 	
 	// Lifetime of w: within an iteration
-	allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch);
+	allocateVolume(D_w, dims.iProjDets, dims.iProjAngles, wPitch);
 
 	// TODO: check if allocations succeeded
 	return true;
@@ -120,13 +120,13 @@ bool CGLS::iterate(unsigned int iterations)
 	if (!sliceInitialized) {
 
 		// copy sinogram
-		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 		// r = sino - A*x
 		if (useVolumeMask) {
 			// Use z as temporary storage here since it is unused
-			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
 			callFP(D_z, zPitch, D_r, rPitch, -1.0f);
 		} else {
 			callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f);
@@ -134,13 +134,13 @@ bool CGLS::iterate(unsigned int iterations)
 
 
 		// p = A'*r
-		zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		zeroVolume(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);
 		callBP(D_p, pPitch, D_r, rPitch);
 		if (useVolumeMask)
-			processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opMul>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
 
 
-		gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+		gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight);
 
 		sliceInitialized = true;
 	}
@@ -150,32 +150,32 @@ bool CGLS::iterate(unsigned int iterations)
 	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
 
 		// w = A*p
-		zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles);
+		zeroVolume(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
 		callFP(D_p, pPitch, D_w, wPitch, 1.0f);
 
 		// alpha = gamma / <w,w>
-		float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+		float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
 		float alpha = gamma / ww;
 
 		// x += alpha*p
-		processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opAddScaled>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 		// r -= alpha*w
-		processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opAddScaled>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
 
 
 		// z = A'*r
-		zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		zeroVolume(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);
 		callBP(D_z, zPitch, D_r, rPitch);
 		if (useVolumeMask)
-			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
 
 		float beta = 1.0f / gamma;
-		gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+		gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight);
 		beta *= gamma;
 
 		// p = z + beta*p
-		processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opScaleAndAdd>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
 
 	}
 
@@ -189,12 +189,12 @@ float CGLS::computeDiffNorm()
 	// used outside of iterations.
 
 	// copy sinogram to w
-	cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 	// do FP, subtracting projection from sinogram
 	if (useVolumeMask) {
-			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+			processVol<opMul>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
 			callFP(D_z, zPitch, D_w, wPitch, -1.0f);
 	} else {
 			callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f);
@@ -202,7 +202,7 @@ float CGLS::computeDiffNorm()
 
 	// compute norm of D_w
 
-	float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles);
 
 	return sqrt(s);
 }
@@ -264,12 +264,12 @@ int main()
 	dims.iRaysPerDet = 1;
 	unsigned int volumePitch, sinoPitch;
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 	printf("pitch: %u\n", sinoPitch);
 	
 	unsigned int y, x;
diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu
index 768d14e..b61eaae 100644
--- a/cuda/2d/darthelper.cu
+++ b/cuda/2d/darthelper.cu
@@ -33,7 +33,7 @@ $Id$
 namespace astraCUDA {
 
 // CUDA function for the selection of ROI
-__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	float x = (float)(threadIdx.x + 16*blockIdx.x);
 	float y = (float)(threadIdx.y + 16*blockIdx.y);
@@ -44,7 +44,7 @@ __global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsign
 	if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f) 
 	{
 		float* d = (float*)in;
-		unsigned int o = (y+padY)*pitch+x+padX; 
+		unsigned int o = y*pitch+x; 
 		d[o] = 0.0f;
 	}
 }
@@ -54,12 +54,12 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height
 	float* D_data;
 
 	unsigned int pitch;
-	allocateVolume(D_data, width+2, height+2, pitch);
+	allocateVolume(D_data, width, height, pitch);
 	copyVolumeToDevice(out, width, width, height, D_data, pitch);
 
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
-	devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1);
+	devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_data, pitch);
 
@@ -70,7 +70,7 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height
 
 
 // CUDA function for the masking of DART with a radius == 1
-__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -80,7 +80,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns
 		float* d = (float*)in;
 		float* m = (float*)mask;
 
-		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+		unsigned int o2 = y*pitch+x; // On this row.
 		unsigned int o1 = o2 - pitch; // On previous row.
 		unsigned int o3 = o2 + pitch; // On next row.
 
@@ -101,7 +101,7 @@ __global__ void devDartMask(float* mask, const float* in, unsigned int conn, uns
 
 
 // CUDA function for the masking of DART with a radius > 1
-__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -115,13 +115,13 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
 		int r = radius;
 
 		// o2: index of the current center pixel
-		int o2 = (y+padY)*pitch+x+padX;
+		int o2 = y*pitch+x;
 
 		if (conn == 8) // 8-connected
 		{
 			for (int row = -r; row <= r; row++) 
 			{
-				int o1 = (y+padY+row)*pitch+x+padX; 
+				int o1 = (y+row)*pitch+x; 
 				for (int col = -r; col <= r; col++) 
 				{
 					if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
@@ -131,7 +131,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
 		else if (conn == 4) // 4-connected
 		{
 			// horizontal
-			unsigned int o1 = (y+padY)*pitch+x+padX; 
+			unsigned int o1 = y*pitch+x; 
 			for (int col = -r; col <= r; col++) 
 			{
 				if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
@@ -140,7 +140,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
 			// vertical
 			for (int row = -r; row <= r; row++) 
 			{
-				unsigned int o1 = (y+padY+row)*pitch+x+padX; 
+				unsigned int o1 = (y+row)*pitch+x; 
 				if (d[o1] != d[o2]) {m[o2] = 1.0f; return;}
 			}
 		}
@@ -149,7 +149,7 @@ __global__ void devDartMaskRadius(float* mask, const float* in, unsigned int con
 
 
 // CUDA function for the masking of ADART with a radius == 1
-__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -159,7 +159,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un
 		float* d = (float*)in;
 		float* m = (float*)mask;
 
-		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+		unsigned int o2 = y*pitch+x; // On this row.
 		unsigned int o1 = o2 - pitch; // On previous row.
 		unsigned int o3 = o2 + pitch; // On next row.
 
@@ -186,7 +186,7 @@ __global__ void devADartMask(float* mask, const float* in, unsigned int conn, un
 
 
 // CUDA function for the masking of ADART with a radius > 1
-__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -199,13 +199,13 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co
 	
 		int r = radius;
 
-		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+		unsigned int o2 = y*pitch+x; // On this row.
 
 		if (conn == 8)
 		{
 			for (int row = -r; row <= r; row++) 
 			{
-				unsigned int o1 = (y+padY+row)*pitch+x+padX; 
+				unsigned int o1 = (y+row)*pitch+x; 
 				for (int col = -r; col <= r; col++) 
 				{
 					if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
@@ -223,7 +223,7 @@ __global__ void devADartMaskRadius(float* mask, const float* in, unsigned int co
 			// vertical
 			for (int row = -r; row <= r; row++) 
 			{
-				unsigned int o1 = (y+padY+row)*pitch+x+padX; 
+				unsigned int o1 = (y+row)*pitch+x; 
 				if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
 			}
 		}
@@ -237,23 +237,23 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne
 	float* D_maskData;
 
 	unsigned int pitch;
-	allocateVolume(D_segmentationData, width+2, height+2, pitch);
+	allocateVolume(D_segmentationData, width, height, pitch);
 	copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch);
 
-	allocateVolume(D_maskData, width+2, height+2, pitch);
-	zeroVolume(D_maskData, pitch, width+2, height+2);
+	allocateVolume(D_maskData, width, height, pitch);
+	zeroVolume(D_maskData, pitch, width, height);
 
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 
 	if (threshold == 1 && radius == 1)
-		devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1);
+		devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height);
 	else if (threshold > 1 && radius == 1)
-		devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+		devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height);
 	else if (threshold == 1 && radius > 1)
-		devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+		devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height);
 	else 
-		devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+		devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height);
 
 	copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch);
 
@@ -263,7 +263,7 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne
 }
 
 
-__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -274,13 +274,13 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns
 		float* d = (float*)in;
 		float* m = (float*)out;
 
-		unsigned int o2 = (y+padY)*pitch+x+padX;
+		unsigned int o2 = y*pitch+x;
 		int r = radius;
 		float res = -d[o2];
 
 		for (int row = -r; row < r; row++) 
 		{
-			unsigned int o1 = (y+padY+row)*pitch+x+padX; 
+			unsigned int o1 = (y+row)*pitch+x; 
 			for (int col = -r; col <= r; col++) 
 			{
 				res += d[o1+col];
@@ -295,7 +295,7 @@ __global__ void devDartSmoothingRadius(float* out, const float* in, float b, uns
 }
 
 
-__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	unsigned int y = threadIdx.y + 16*blockIdx.y;
@@ -305,7 +305,7 @@ __global__ void devDartSmoothing(float* out, const float* in, float b, unsigned
 		float* d = (float*)in;
 		float* m = (float*)out;
 
-		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+		unsigned int o2 = y*pitch+x; // On this row.
 		unsigned int o1 = o2 - pitch; // On previous row.
 		unsigned int o3 = o2 + pitch; // On next row.
 
@@ -329,9 +329,9 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un
 	dim3 blockSize(16,16);
 	dim3 gridSize((width+15)/16, (height+15)/16);
 	if (radius == 1)
-		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1);
+		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height);
 	else
-		devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1);
+		devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height);
 
 	copyVolumeFromDevice(out, width, width, height, D_outData, pitch);
 
diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu
index 68573b2..9dc193e 100644
--- a/cuda/2d/dataop.cu
+++ b/cuda/2d/dataop.cu
@@ -39,10 +39,10 @@ void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigne
 	float* D_data2;
 
 	unsigned int pitch;
-	allocateVolume(D_data1, width+2, height+2, pitch);
+	allocateVolume(D_data1, width, height, pitch);
 	copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
 
-	allocateVolume(D_data2, width+2, height+2, pitch);
+	allocateVolume(D_data2, width, height, pitch);
 	copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
 
 	processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height);
@@ -59,10 +59,10 @@ void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsig
 	float* D_mask;
 
 	unsigned int pitch;
-	allocateVolume(D_data, width+2, height+2, pitch);
+	allocateVolume(D_data, width, height, pitch);
 	copyVolumeToDevice(data, width, width, height, D_data, pitch);
 
-	allocateVolume(D_mask, width+2, height+2, pitch);
+	allocateVolume(D_mask, width, height, pitch);
 	copyVolumeToDevice(mask, width, width, height, D_mask, pitch);
 
 	processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height);
@@ -79,7 +79,7 @@ void operationVolumeMultScalar(float* data, float scalar, unsigned int width, un
 	float* D_data;
 
 	unsigned int pitch;
-	allocateVolume(D_data, width+2, height+2, pitch);
+	allocateVolume(D_data, width, height, pitch);
 	copyVolumeToDevice(data, width, width, height, D_data, pitch);
 
 	processVol<opMul, VOL>(D_data, scalar, pitch, width, height);
@@ -96,10 +96,10 @@ void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned
 	float* D_data2;
 
 	unsigned int pitch;
-	allocateVolume(D_data1, width+2, height+2, pitch);
+	allocateVolume(D_data1, width, height, pitch);
 	copyVolumeToDevice(data1, width, width, height, D_data1, pitch);
 
-	allocateVolume(D_data2, width+2, height+2, pitch);
+	allocateVolume(D_data2, width, height, pitch);
 	copyVolumeToDevice(data2, width, width, height, D_data2, pitch);
 
 	processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height);
@@ -116,7 +116,7 @@ void operationVolumeAddScalar(float* data, float scalar, unsigned int width, uns
 	float* D_data;
 
 	unsigned int pitch;
-	allocateVolume(D_data, width+2, height+2, pitch);
+	allocateVolume(D_data, width, height, pitch);
 	copyVolumeToDevice(data, width, width, height, D_data, pitch);
 
 	processVol<opAdd, VOL>(D_data, scalar, pitch, width, height);
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index 74d1bbf..cf3f10c 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -73,14 +73,14 @@ void EM::reset()
 
 bool EM::init()
 {
-	allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
-	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch);
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 
-	allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
-	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
-	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
+	zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	// We can't precompute pixelWeights when using a volume mask
 #if 0 
@@ -94,22 +94,22 @@ bool EM::init()
 
 bool EM::precomputeWeights()
 {
-	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 #if 0
 	if (useSinogramMask) {
 		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
 	} else
 #endif
 	{
-		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
 		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
 	}
-	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+	processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 
 #if 0
 	if (useVolumeMask) {
 		// scale pixel weights with mask to zero out masked pixels
-		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 	}
 #endif
 
@@ -129,18 +129,18 @@ bool EM::iterate(unsigned int iterations)
 	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
 
 		// Do FP of volumeData 
-		zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+		zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 		callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f);
 
 		// Divide sinogram by FP (into projData)
-		processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opDividedBy>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 		// Do BP of projData into tmpData
-		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
 
 		// Multiply volumeData with tmpData divided by pixel weights
-		processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opMul2>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 
 	}
 
@@ -150,12 +150,12 @@ bool EM::iterate(unsigned int iterations)
 float EM::computeDiffNorm()
 {
 	// copy sinogram to projection data
-	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 	// do FP, subtracting projection from sinogram
 	if (useVolumeMask) {
-			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
 	} else {
 			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -164,7 +164,7 @@ float EM::computeDiffNorm()
 
 	// compute norm of D_projData
 
-	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	return sqrt(s);
 }
@@ -218,12 +218,12 @@ int main()
 	dims.iRaysPerDet = 1;
 	unsigned int volumePitch, sinoPitch;
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 	printf("pitch: %u\n", sinoPitch);
 	
 	unsigned int y, x;
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index 12086c0..9bec25d 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -61,12 +61,12 @@ __constant__ float gC_DetUX[g_MaxAngles];
 __constant__ float gC_DetUY[g_MaxAngles];
 
 
-static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_FanProjTexture.addressMode[0] = mode;
+	gT_FanProjTexture.addressMode[1] = mode;
 	gT_FanProjTexture.filterMode = cudaFilterModeLinear;
 	gT_FanProjTexture.normalized = false;
 
@@ -116,12 +116,12 @@ __global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int s
 		const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
 		const float fDen = fDetUX * fYD - fDetUY * fXD;
 		
-		const float fT = fNum / fDen + 1.0f;
+		const float fT = fNum / fDen;
 		fVal += tex2D(gT_FanProjTexture, fT, fA);
 		fA += 1.0f;
 	}
 
-	volData[(Y+1)*volPitch+X+1] += fVal;
+	volData[Y*volPitch+X] += fVal;
 }
 
 // supersampling version
@@ -171,7 +171,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
 				const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
 				const float fDen = fDetUX * fYD - fDetUY * fXD;
 		
-				const float fT = fNum / fDen + 1.0f;
+				const float fT = fNum / fDen;
 				fVal += tex2D(gT_FanProjTexture, fT, fA);
 				fY -= fSubStep;
 			}
@@ -180,7 +180,7 @@ __global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned in
 		fA += 1.0f;
 	}
 
-	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
 }
 
 
@@ -222,7 +222,7 @@ __global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDi
 	const float fT = fNum / fDen;
 	const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
 
-	volData[(Y+1)*volPitch+X+1] += fVal;
+	volData[Y*volPitch+X] += fVal;
 }
 
 // Weighted BP for use in fan beam FBP
@@ -268,12 +268,12 @@ __global__ void devFanBP_FBPWeighted(float* D_volData, unsigned int volPitch, un
 
 		const float fWeight = fXD*fXD + fYD*fYD;
 		
-		const float fT = fNum / fDen + 1.0f;
+		const float fT = fNum / fDen;
 		fVal += tex2D(gT_FanProjTexture, fT, fA) / fWeight;
 		fA += 1.0f;
 	}
 
-	volData[(Y+1)*volPitch+X+1] += fVal;
+	volData[Y*volPitch+X] += fVal;
 }
 
 
@@ -331,7 +331,7 @@ bool FanBP_FBPWeighted(float* D_volumeData, unsigned int volumePitch,
 	// TODO: process angles block by block
 	assert(dims.iProjAngles <= g_MaxAngles);
 
-	bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	// transfer angles to constant memory
 	float* tmp = new float[dims.iProjAngles];
@@ -376,7 +376,7 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
                 const SDimensions& dims, const SFanProjection* angles)
 {
 	// only one angle
-	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
 
 	// transfer angle to constant memory
 #define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
@@ -442,10 +442,10 @@ int main()
 
 #undef ROTATE0
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
 	printf("pitch: %u\n", projPitch);
 
 	unsigned int y, x;
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index cf9f352..b24029c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -67,8 +67,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
 	cudaMallocArray(&dataArray, &channelDesc, width, height);
 	cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
 
-	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeBorder;
 	gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
 	gT_FanVolumeTexture.normalized = false;
 
@@ -129,8 +129,8 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig
 
 		// intersect ray with first slice
 
-		float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
-		float fX = startSlice + 1.5f;
+		float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+		float fX = startSlice + 0.5f;
 
 		int endSlice = startSlice + g_blockSlices;
 		if (endSlice > dims.iVolWidth)
@@ -148,7 +148,7 @@ __global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsig
 
 	}
 
-	projData[angle*projPitch+detector+1] += fVal;
+	projData[angle*projPitch+detector] += fVal;
 }
 
 
@@ -201,8 +201,8 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne
 
 		// intersect ray with first slice
 
-		float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
-		float fY = startSlice + 1.5f;
+		float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+		float fY = startSlice + 0.5f;
 
 		int endSlice = startSlice + g_blockSlices;
 		if (endSlice > dims.iVolHeight)
@@ -221,7 +221,7 @@ __global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigne
 
 	}
 
-	projData[angle*projPitch+detector+1] += fVal;
+	projData[angle*projPitch+detector] += fVal;
 }
 
 bool FanFP(float* D_volumeData, unsigned int volumePitch,
@@ -233,7 +233,7 @@ bool FanFP(float* D_volumeData, unsigned int volumePitch,
 	assert(dims.iProjAngles <= g_MaxAngles);
 
 	cudaArray* D_dataArray;
-	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 	// transfer angles to constant memory
 	float* tmp = new float[dims.iProjAngles];
@@ -325,10 +325,10 @@ int main()
 
 #undef ROTATE0
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
 	printf("pitch: %u\n", projPitch);
 
 	unsigned int y, x;
@@ -358,8 +358,8 @@ int main()
 			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
 	printf("cpu norm: %f\n", s);
 
-	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
-	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
+	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 	printf("gpu norm: %f\n", s);
 
 	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 79e4be7..75bd92c 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -209,7 +209,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
 }
 
 bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
-                int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+                int _iSourcePitch, int _iProjDets,
                 int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
                 cufftComplex * _pDevTargetComplex)
 {
@@ -221,7 +221,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
 
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
-		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX;
+		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch;
 		float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
 
 		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
@@ -241,7 +241,7 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
 
 bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
                  float * _pfRealTarget,
-                 int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+                 int _iTargetPitch, int _iProjDets,
                  int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
 {
 	float * pfDevRealFFTTarget = NULL;
@@ -264,7 +264,7 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
 		const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
-		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX;
+		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch;
 
 		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
 	}
diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h
index 55324e5..d4428a5 100644
--- a/cuda/2d/fft.h
+++ b/cuda/2d/fft.h
@@ -45,13 +45,13 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
                                 cufftComplex * _pDevComplexTarget);
 
 bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
-                int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+                int _iSourcePitch, int _iProjDets,
                 int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
                 cufftComplex * _pDevTargetComplex);
 
 bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
                  float * _pfRealTarget,
-                 int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+                 int _iTargetPitch, int _iProjDets,
                  int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
 
 void applyFilter(int _iProjectionCount, int _iFreqBinCount,
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index d202341..fc99102 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -57,12 +57,12 @@ __constant__ float gC_angle_sin[g_MaxAngles];
 __constant__ float gC_angle_cos[g_MaxAngles];
 __constant__ float gC_angle_offset[g_MaxAngles];
 
-static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_projTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_projTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_projTexture.addressMode[0] = mode;
+	gT_projTexture.addressMode[1] = mode;
 	gT_projTexture.filterMode = cudaFilterModeLinear;
 	gT_projTexture.normalized = false;
 
@@ -94,7 +94,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
 
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
-	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
 
 	if (offsets) {
 
@@ -123,7 +123,7 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
 
 	}
 
-	volData[(Y+1)*volPitch+X+1] += fVal;
+	volData[Y*volPitch+X] += fVal;
 }
 
 // supersampling version
@@ -150,7 +150,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
-	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
 
 	if (offsets) {
 
@@ -196,7 +196,7 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 
 	}
 
-	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+	volData[Y*volPitch+X] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
 }
 
 __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims)
@@ -218,7 +218,7 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
 	const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
 	const float fVal = tex2D(gT_projTexture, fT, 0.5f);
 
-	D_volData[(Y+1)*volPitch+X+1] += fVal;
+	D_volData[Y*volPitch+X] += fVal;
 }
 
 
@@ -232,7 +232,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 	float* angle_sin = new float[dims.iProjAngles];
 	float* angle_cos = new float[dims.iProjAngles];
 
-	bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
 		angle_sin[i] = sinf(angles[i]);
@@ -302,8 +302,10 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
              unsigned int angle, const SDimensions& dims,
              const float* angles, const float* TOffsets)
 {
-	// only one angle
-	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+	// Only one angle.
+	// We need to Clamp to the border pixels instead of to zero, because
+	// SART weights with ray length.
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
 
 	float angle_sin = sinf(angles[angle]);
 	float angle_cos = cosf(angles[angle]);
@@ -346,10 +348,10 @@ int main()
 
 	unsigned int volumePitch, projPitch;
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
 	printf("pitch: %u\n", projPitch);
 
 	unsigned int y, x;
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index f811f98..097122b 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -74,8 +74,8 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
 	cudaMallocArray(&dataArray, &channelDesc, width, height);
 	cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
 
-	gT_volumeTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_volumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_volumeTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_volumeTexture.addressMode[1] = cudaAddressModeBorder;
 	gT_volumeTexture.filterMode = cudaFilterModeLinear;
 	gT_volumeTexture.normalized = false;
 
@@ -153,8 +153,8 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
 
 	float fVal = 0.0f;
 	// project detector on slice
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
-	float fS = startSlice + 1.5f;
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+	float fS = startSlice + 0.5f;
 	int endSlice = startSlice + g_blockSlices;
 	if (endSlice > dims.iVolWidth)
 		endSlice = dims.iVolWidth;
@@ -189,7 +189,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
 
 	}
 
-	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
 // projection for angles that are roughly vertical
@@ -255,8 +255,8 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
 	fDistCorr *= outputScale;
 
 	float fVal = 0.0f;
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
-	float fS = startSlice+1.5f;
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+	float fS = startSlice+0.5f;
 	int endSlice = startSlice + g_blockSlices;
 	if (endSlice > dims.iVolHeight)
 		endSlice = dims.iVolHeight;
@@ -290,7 +290,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
 
 	}
 
-	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
 // projection for angles that are roughly horizontal
@@ -331,8 +331,8 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u
 
 	float fVal = 0.0f;
 	// project detector on slice
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
-	float fS = startSlice + 1.5f;
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
+	float fS = startSlice + 0.5f;
 	int endSlice = startSlice + g_blockSlices;
 	if (endSlice > dims.iVolWidth)
 		endSlice = dims.iVolWidth;
@@ -367,7 +367,7 @@ __global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, u
 
 	}
 
-	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
 
@@ -408,8 +408,8 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns
 	fDistCorr *= outputScale;
 
 	float fVal = 0.0f;
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
-	float fS = startSlice+1.5f;
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
+	float fS = startSlice+0.5f;
 	int endSlice = startSlice + g_blockSlices;
 	if (endSlice > dims.iVolHeight)
 		endSlice = dims.iVolHeight;
@@ -443,7 +443,7 @@ __global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, uns
 
 	}
 
-	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
 
@@ -457,7 +457,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 	assert(dims.iProjAngles <= g_MaxAngles);
 
 	cudaArray* D_dataArray;
-	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 
@@ -579,7 +579,7 @@ bool FP(float* D_volumeData, unsigned int volumePitch,
 	assert(dims.fDetScale > 0.9999f);
 
 	cudaArray* D_dataArray;
-	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 
@@ -682,10 +682,10 @@ int main()
 	dims.iRaysPerDet = 1;
 	unsigned int volumePitch, projPitch;
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
 	printf("pitch: %u\n", projPitch);
 
 	unsigned int y, x;
@@ -715,7 +715,7 @@ int main()
 			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
 	printf("cpu norm: %f\n", s);
 
-	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	//zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
 	printf("gpu norm: %f\n", s);
 
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index a40176d..7f499ce 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -39,14 +39,13 @@ $Id$
 
 namespace astraCUDA {
 
-
+// FIXME: Remove these functions. (Outdated)
 __global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
-	// Copy result down and left one pixel.
-	pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1];
+	pfOut[x] *= pfIn[x];
 }
 
 void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
@@ -106,18 +105,15 @@ void SART::reset()
 bool SART::init()
 {
 	if (useVolumeMask) {
-		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
-		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 	}
 
-	// HACK: D_projData consists of two lines. The first is used padded,
-	// the second unpadded. This is to satisfy the alignment requirements
-	// of resp. FP and BP_SART.
-	allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch);
-	zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1);
+	allocateVolume(D_projData, dims.iProjDets, 1, projPitch);
+	zeroVolume(D_projData, projPitch, dims.iProjDets, 1);
 	
-	allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
-	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch);
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 
 	// We can't precompute lineWeights when using a mask
 	if (!useVolumeMask)
@@ -142,23 +138,23 @@ bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount)
 
 bool SART::precomputeWeights()
 {
-	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 	if (useVolumeMask) {
 		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
 	} else {
 		// Allocate tmpData temporarily
-		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
-		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 
 
-		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
 
 
 		cudaFree(D_tmpData);
 		D_tmpData = 0;
 	}
-	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+	processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 
 	return true;
 }
@@ -181,12 +177,12 @@ bool SART::iterate(unsigned int iterations)
 		}
 
 		// copy one line of sinogram to projection data
-		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice);
+		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), 1, cudaMemcpyDeviceToDevice);
 
 		// do FP, subtracting projection from sinogram
 		if (useVolumeMask) {
-				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-				processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 				callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f);
 		} else {
 				callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f);
@@ -197,17 +193,17 @@ bool SART::iterate(unsigned int iterations)
 		if (useVolumeMask) {
 			// BP, mask, and add back
 			// TODO: Try putting the masking directly in the BP
-			zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+			zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 			callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle);
-			processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opAddMul>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 		} else {
 			callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);
 		}
 
 		if (useMinConstraint)
-			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
 		if (useMaxConstraint)
-			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 		iteration++;
 
@@ -220,16 +216,16 @@ float SART::computeDiffNorm()
 {
 	unsigned int pPitch;
 	float *D_p;
-	allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch);
-	zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_p, dims.iProjDets, dims.iProjAngles, pPitch);
+	zeroVolume(D_p, pPitch, dims.iProjDets, dims.iProjAngles);
 
 	// copy sinogram to D_p
-	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 	// do FP, subtracting projection from sinogram
 	if (useVolumeMask) {
-			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
 	} else {
 			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -237,7 +233,7 @@ float SART::computeDiffNorm()
 
 
 	// compute norm of D_p
-	float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles);
 
 	cudaFree(D_p);
 
@@ -267,11 +263,11 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
 {
 	if (angles) {
 		assert(!fanProjs);
-		return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+		return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,
 		               angle, dims, angles, TOffsets);
 	} else {
 		assert(fanProjs);
-		return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+		return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch,
 		                  angle, dims, fanProjs);
 	}
 
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index 31954e4..eb65962 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -88,17 +88,17 @@ void SIRT::reset()
 
 bool SIRT::init()
 {
-	allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
-	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_pixelWeight, dims.iVolWidth, dims.iVolHeight, pixelPitch);
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 
-	allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
-	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_tmpData, dims.iVolWidth, dims.iVolHeight, tmpPitch);
+	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 
-	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
-	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_projData, dims.iProjDets, dims.iProjAngles, projPitch);
+	zeroVolume(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 	
-	allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
-	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_lineWeight, dims.iProjDets, dims.iProjAngles, linePitch);
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 
 	// We can't precompute lineWeights and pixelWeights when using a mask
 	if (!useVolumeMask && !useSinogramMask)
@@ -110,33 +110,33 @@ bool SIRT::init()
 
 bool SIRT::precomputeWeights()
 {
-	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 	if (useVolumeMask) {
 		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
 	} else {
-		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opSet>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
 	}
-	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+	processVol<opInvert>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
 
 	if (useSinogramMask) {
 		// scale line weights with sinogram mask to zero out masked sinogram pixels
-		processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opMul>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
 	}
 
 
-	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 	if (useSinogramMask) {
 		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
 	} else {
-		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opSet>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
 		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
 	}
-	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+	processVol<opInvert>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 
 	if (useVolumeMask) {
 		// scale pixel weights with mask to zero out masked pixels
-		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opMul>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
 	}
 
 	return true;
@@ -160,7 +160,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD
 	freeMinMaxMasks = true;
 	bool ok = true;
 	if (pfMinMaskData) {
-		allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch);
+		allocateVolume(D_minMaskData, dims.iVolWidth, dims.iVolHeight, minMaskPitch);
 		ok = copyVolumeToDevice(pfMinMaskData, iPitch,
 		                        dims.iVolWidth, dims.iVolHeight,
 		                        D_minMaskData, minMaskPitch);
@@ -169,7 +169,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD
 		return false;
 
 	if (pfMaxMaskData) {
-		allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch);
+		allocateVolume(D_maxMaskData, dims.iVolWidth, dims.iVolHeight, maxMaskPitch);
 		ok = copyVolumeToDevice(pfMaxMaskData, iPitch,
 		                        dims.iVolWidth, dims.iVolHeight,
 		                        D_maxMaskData, maxMaskPitch);
@@ -191,33 +191,33 @@ bool SIRT::iterate(unsigned int iterations)
 	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
 
 		// copy sinogram to projection data
-		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 		// do FP, subtracting projection from sinogram
 		if (useVolumeMask) {
-				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-				processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 				callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
 		} else {
 				callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
 		}
 
-		processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
+		processVol<opMul>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
 
-		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 
 		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
 
-		processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		processVol<opAddMul>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
 		if (useMinConstraint)
-			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMin>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
 		if (useMaxConstraint)
-			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMax>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
 		if (D_minMaskData)
-			processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMinMask>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 		if (D_maxMaskData)
-			processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+			processVol<opClampMaxMask>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 	}
 
 	return true;
@@ -226,12 +226,12 @@ bool SIRT::iterate(unsigned int iterations)
 float SIRT::computeDiffNorm()
 {
 	// copy sinogram to projection data
-	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice);
 
 	// do FP, subtracting projection from sinogram
 	if (useVolumeMask) {
-			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
-			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice);
+			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
 			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
 	} else {
 			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
@@ -240,7 +240,7 @@ float SIRT::computeDiffNorm()
 
 	// compute norm of D_projData
 
-	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	return sqrt(s);
 }
@@ -300,12 +300,12 @@ int main()
 	dims.iRaysPerDet = 1;
 	unsigned int volumePitch, sinoPitch;
 
-	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
-	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	allocateVolume(D_volumeData, dims.iVolWidth, dims.iVolHeight, volumePitch);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth, dims.iVolHeight);
 	printf("pitch: %u\n", volumePitch);
 
-	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
-	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	allocateVolume(D_sinoData, dims.iProjDets, dims.iProjAngles, sinoPitch);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets, dims.iProjAngles);
 	printf("pitch: %u\n", sinoPitch);
 	
 	unsigned int y, x;
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index 06f6714..8bb2f2f 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -36,11 +36,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
 		unsigned int width, unsigned int height,
 		float* outD_data, unsigned int out_pitch)
 {
-	// TODO: a full memset isn't necessary. Only the edges.
 	cudaError_t err;
-	err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2);
-	ASTRA_CUDA_ASSERT(err);
-	err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
 	ASTRA_CUDA_ASSERT(err);
 	assert(err == cudaSuccess);
 	return true;
@@ -50,7 +47,7 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
 		unsigned int width, unsigned int height,
 		float* inD_data, unsigned int in_pitch)
 {
-	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
 	ASTRA_CUDA_ASSERT(err);
 	return true;
 }
@@ -60,7 +57,7 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
 		unsigned int width, unsigned int height,
 		float* inD_data, unsigned int in_pitch)
 {   
-	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
 	ASTRA_CUDA_ASSERT(err);
 	return true;
 }
@@ -69,11 +66,8 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
 		unsigned int width, unsigned int height,
 		float* outD_data, unsigned int out_pitch)
 {   
-	// TODO: a full memset isn't necessary. Only the edges.
 	cudaError_t err;
-	err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height);
-	ASTRA_CUDA_ASSERT(err);
-	err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
 	ASTRA_CUDA_ASSERT(err);
 	return true;
 }
@@ -132,8 +126,7 @@ __global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)
 
 __global__ void reduce2D(float *g_idata, float *g_odata,
                          unsigned int pitch,
-                         unsigned int nx, unsigned int ny,
-                         unsigned int padX, unsigned int padY)
+                         unsigned int nx, unsigned int ny)
 {
 	extern __shared__ float sdata[];
 	const unsigned int tidx = threadIdx.x;
@@ -145,11 +138,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,
 
 	sdata[tid] = 0;
 
-	if (x >= padX && x < padX + nx) {
+	if (x < nx) {
 
-		while (y < padY + ny) {
-			if (y >= padY)
-				sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
+		while (y < ny) {
+			sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
 			y += 16 * gridDim.y;
 		}
 
@@ -180,11 +172,10 @@ __global__ void reduce2D(float *g_idata, float *g_odata,
 }
 
 float dotProduct2D(float* D_data, unsigned int pitch,
-                   unsigned int width, unsigned int height,
-                   unsigned int padX, unsigned int padY)
+                   unsigned int width, unsigned int height)
 {
-	unsigned int bx = ((width+padX) + 15) / 16;
-	unsigned int by = ((height+padY) + 127) / 128;
+	unsigned int bx = (width + 15) / 16;
+	unsigned int by = (height + 127) / 128;
 	unsigned int shared_mem2 = sizeof(float) * 16 * 16;
 
 	dim3 dimBlock2(16, 16);
@@ -192,26 +183,27 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 
 	float* D_buf;
 	cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) );
+	float* D_res = D_buf + (bx*by);
 
 	// Step 1: reduce 2D from image to a single vector, taking sum of squares
 
-	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY);
+	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
 	cudaTextForceKernelsCompletion();
 
 	// Step 2: reduce 1D: add up elements in vector
 	if (bx * by > 512)
-		reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by);
+		reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_res, bx*by);
 	else if (bx * by > 128)
-		reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by);
+		reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_res, bx*by);
 	else if (bx * by > 32)
-		reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by);
+		reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_res, bx*by);
 	else if (bx * by > 8)
-		reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by);
+		reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_res, bx*by);
 	else
-		reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by);
+		reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_res, bx*by);
 
 	float x;
-	cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost);
+	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
 
 	cudaTextForceKernelsCompletion();
 
diff --git a/cuda/2d/util.h b/cuda/2d/util.h
index d31e2eb..ed4a45c 100644
--- a/cuda/2d/util.h
+++ b/cuda/2d/util.h
@@ -82,8 +82,8 @@ void reportCudaError(cudaError_t err);
 
 
 float dotProduct2D(float* D_data, unsigned int pitch,
-                   unsigned int width, unsigned int height,
-                   unsigned int padX, unsigned int padY);
+                   unsigned int width, unsigned int height);
+
 
 }