From c72bc7cd47ecb5665a287fb88e101f88118f5232 Mon Sep 17 00:00:00 2001 From: Willem Jan Palenstijn Date: Wed, 16 Apr 2014 11:13:40 +0000 Subject: Split up processVol in Vol/Sino cases --- cuda/2d/algo.cu | 17 ++- cuda/2d/arith.cu | 248 ++++++++++++++++++++++++++----------- cuda/2d/arith.h | 38 +++--- cuda/2d/astra.cu | 29 ++--- cuda/2d/cgls.cu | 14 +-- cuda/2d/darthelper.cu | 41 ++++-- cuda/2d/em.cu | 12 +- cuda/2d/sart.cu | 14 +-- cuda/2d/sirt.cu | 34 ++--- cuda/2d/util.cu | 29 +++-- cuda/2d/util.h | 8 +- src/CudaDataOperationAlgorithm.cpp | 33 ++--- 12 files changed, 317 insertions(+), 200 deletions(-) diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu index 333481a..33ca1a3 100644 --- a/cuda/2d/algo.cu +++ b/cuda/2d/algo.cu @@ -264,20 +264,18 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit return false; bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, - dims.iProjDets, - dims.iProjAngles, + dims, D_sinoData, sinoPitch); if (!ok) return false; // rescale sinogram to adjust for pixel size - processVol(D_sinoData, fSinogramScale, + processSino(D_sinoData, fSinogramScale, //1.0f/(fPixelSize*fPixelSize), - sinoPitch, - dims.iProjDets, dims.iProjAngles); + sinoPitch, dims); ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch, - dims.iVolWidth, dims.iVolHeight, + dims, D_volumeData, volumePitch); if (!ok) return false; @@ -289,7 +287,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit return false; ok = copyVolumeToDevice(pfVolMask, iVolMaskPitch, - dims.iVolWidth, dims.iVolHeight, + dims, D_maskData, maskPitch); if (!ok) return false; @@ -300,7 +298,7 @@ bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPit return false; ok = copySinogramToDevice(pfSinoMask, iSinoMaskPitch, - dims.iProjDets, dims.iProjAngles, + dims, D_smaskData, smaskPitch); if (!ok) return false; @@ -313,8 +311,7 @@ bool ReconAlgo::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const { bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, - dims.iVolWidth, - dims.iVolHeight, + dims, D_volumeData, volumePitch); if (!ok) return false; diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu index 42c2c98..9544026 100644 --- a/cuda/2d/arith.cu +++ b/cuda/2d/arith.cu @@ -279,55 +279,57 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, - template -void processVolCopy(float* out, unsigned int width, unsigned int height) +void processVolCopy(float* out, const SDimensions& dims) { float* D_out; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); - processVol(D_out, pitch, width, height); + processVol(D_out, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); } template -void processVolCopy(float* out, float param, unsigned int width, unsigned int height) +void processVolCopy(float* out, float param, const SDimensions& dims) { float* D_out; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); - processVol(D_out, param, pitch, width, height); + processVol(D_out, param, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); } template -void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) +void processVolCopy(float* out1, float* out2, float param1, float param2, const SDimensions& dims) { float* D_out1; float* D_out2; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out1, width, height, pitch); - copyVolumeToDevice(out1, width, width, height, D_out1, pitch); - allocateVolume(D_out2, width, height, pitch); - copyVolumeToDevice(out2, width, width, height, D_out2, pitch); + allocateVolumeData(D_out1, pitch, dims); + copyVolumeToDevice(out1, width, dims, D_out1, pitch); + allocateVolumeData(D_out2, pitch, dims); + copyVolumeToDevice(out2, width, dims, D_out2, pitch); - processVol(D_out1, D_out2, param1, param2, pitch, width, height); + processVol(D_out1, D_out2, param1, param2, pitch, dims); - copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); - copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); + copyVolumeFromDevice(out1, width, dims, D_out1, pitch); + copyVolumeFromDevice(out2, width, dims, D_out2, pitch); cudaFree(D_out1); cudaFree(D_out2); @@ -335,63 +337,66 @@ void processVolCopy(float* out1, float* out2, float param1, float param2, unsign template -void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) +void processVolCopy(float* out, const float* in, const SDimensions& dims) { float* D_out; float* D_in; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width, height, pitch); - copyVolumeToDevice(in, width, width, height, D_in, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); + allocateVolumeData(D_in, pitch, dims); + copyVolumeToDevice(in, width, dims, D_in, pitch); - processVol(D_out, D_in, pitch, width, height); + processVol(D_out, D_in, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); cudaFree(D_in); } template -void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) +void processVolCopy(float* out, const float* in, float param, const SDimensions& dims) { float* D_out; float* D_in; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in, width, height, pitch); - copyVolumeToDevice(in, width, width, height, D_in, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); + allocateVolumeData(D_in, pitch, dims); + copyVolumeToDevice(in, width, dims, D_in, pitch); - processVol(D_out, D_in, param, pitch, width, height); + processVol(D_out, D_in, param, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); cudaFree(D_in); } template -void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) +void processVolCopy(float* out, const float* in1, const float* in2, const SDimensions& dims) { float* D_out; float* D_in1; float* D_in2; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width, height, pitch); - copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width, height, pitch); - copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); + allocateVolumeData(D_in1, pitch, dims); + copyVolumeToDevice(in1, width, dims, D_in1, pitch); + allocateVolumeData(D_in2, pitch, dims); + copyVolumeToDevice(in2, width, dims, D_in2, pitch); - processVol(D_out, D_in1, D_in2, pitch, width, height); + processVol(D_out, D_in1, D_in2, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); cudaFree(D_in1); @@ -399,23 +404,24 @@ void processVolCopy(float* out, const float* in1, const float* in2, unsigned int } template -void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) +void processVolCopy(float* out, const float* in1, const float* in2, float param, const SDimensions& dims) { float* D_out; float* D_in1; float* D_in2; + size_t width = dims.iVolWidth; unsigned int pitch; - allocateVolume(D_out, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_out, pitch); - allocateVolume(D_in1, width, height, pitch); - copyVolumeToDevice(in1, width, width, height, D_in1, pitch); - allocateVolume(D_in2, width, height, pitch); - copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + allocateVolumeData(D_out, pitch, dims); + copyVolumeToDevice(out, width, dims, D_out, pitch); + allocateVolumeData(D_in1, pitch, dims); + copyVolumeToDevice(in1, width, dims, D_in1, pitch); + allocateVolumeData(D_in2, pitch, dims); + copyVolumeToDevice(in2, width, dims, D_in2, pitch); - processVol(D_out, D_in1, D_in2, param, pitch, width, height); + processVol(D_out, D_in1, D_in2, param, pitch, dims); - copyVolumeFromDevice(out, width, width, height, D_out, pitch); + copyVolumeFromDevice(out, width, dims, D_out, pitch); cudaFree(D_out); cudaFree(D_in1); @@ -429,9 +435,8 @@ void processVolCopy(float* out, const float* in1, const float* in2, float param, - template -void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+511)/512); @@ -442,7 +447,7 @@ void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned i } template -void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -453,7 +458,7 @@ void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int wid } template -void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -465,7 +470,7 @@ void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsi template -void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -476,7 +481,7 @@ void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned in } template -void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -487,7 +492,7 @@ void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitc } template -void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -498,7 +503,7 @@ void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fPar } template -void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -515,6 +520,96 @@ void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned i +template +void processVol(float* out, unsigned int pitch, const SDimensions& dims) +{ + processData(out, pitch, dims.iVolWidth, dims.iVolHeight); +} + +template +void processVol(float* out, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, param, pitch, dims.iVolWidth, dims.iVolHeight); +} + +template +void processVol(float* out1, float* out2, float param1, float param2, unsigned int pitch, const SDimensions& dims) +{ + processData(out1, out2, param1, param2, pitch, dims.iVolWidth, dims.iVolHeight); +} + + +template +void processVol(float* out, const float* in, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in, pitch, dims.iVolWidth, dims.iVolHeight); +} + +template +void processVol(float* out, const float* in, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in, param, pitch, dims.iVolWidth, dims.iVolHeight); +} + +template +void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in1, in2, pitch, dims.iVolWidth, dims.iVolHeight); +} + +template +void processVol(float* out, const float* in1, const float* in2, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in2, in2, param, pitch, dims.iVolWidth, dims.iVolHeight); +} + + + + +template +void processSino(float* out, unsigned int pitch, const SDimensions& dims) +{ + processData(out, pitch, dims.iProjDets, dims.iProjAngles); +} + +template +void processSino(float* out, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, param, pitch, dims.iProjDets, dims.iProjAngles); +} + +template +void processSino(float* out1, float* out2, float param1, float param2, unsigned int pitch, const SDimensions& dims) +{ + processData(out1, out2, param1, param2, pitch, dims.iProjDets, dims.iProjAngles); +} + + +template +void processSino(float* out, const float* in, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in, pitch, dims.iProjDets, dims.iProjAngles); +} + +template +void processSino(float* out, const float* in, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in, param, pitch, dims.iProjDets, dims.iProjAngles); +} + +template +void processSino(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in1, in2, pitch, dims.iProjDets, dims.iProjAngles); +} + +template +void processSino(float* out, const float* in1, const float* in2, float param, unsigned int pitch, const SDimensions& dims) +{ + processData(out, in2, in2, param, pitch, dims.iProjDets, dims.iProjAngles); +} + + @@ -808,45 +903,52 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit #define INST_DFtoD(name) \ - template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in, float param, const SDimensions& dims); \ + template void processVol(float* out, const float* in, float param, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, const float* in, float param, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); #define INST_DtoD(name) \ - template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in, const SDimensions& dims); \ + template void processVol(float* out, const float* in, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, const float* in, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); #define INST_DDtoD(name) \ - template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in1, const float* in2, const SDimensions& dims); \ + template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); #define INST_DDFtoD(name) \ - template void processVolCopy(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ - template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const float* in1, const float* in2, float fParam, const SDimensions& dims); \ + template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); #define INST_toD(name) \ - template void processVolCopy(float* out, unsigned int width, unsigned int height); \ - template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, const SDimensions& dims); \ + template void processVol(float* out, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); #define INST_FtoD(name) \ - template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); \ - template void processVol(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out, float param, const SDimensions& dims); \ + template void processVol(float* out, float param, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out, float param, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out, float param, const SDimensions3D& dims); #define INST_FFtoDD(name) \ - template void processVolCopy(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ - template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVolCopy(float* out1, float* out2, float fParam1, float fParam2, const SDimensions& dims); \ + template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, const SDimensions& dims); \ + template void processSino(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, const SDimensions& dims); \ template void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ template void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h index d745aef..c32a63a 100644 --- a/cuda/2d/arith.h +++ b/cuda/2d/arith.h @@ -55,21 +55,29 @@ struct opSetMaskedValues; struct opMulMask; -template void processVolCopy(float* out, unsigned int width, unsigned int height); -template void processVolCopy(float* out, float param, unsigned int width, unsigned int height); -template void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); -template void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); - -template void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); -template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); +template void processVolCopy(float* out, const SDimensions& dims); +template void processVolCopy(float* out, float param, const SDimensions& dims); +template void processVolCopy(float* out1, float* out2, float param1, float param2, const SDimensions& dims); +template void processVolCopy(float* out, const float* in, const SDimensions& dims); +template void processVolCopy(float* out, const float* in, float param, const SDimensions& dims); +template void processVolCopy(float* out, const float* in1, const float* in2, const SDimensions& dims); +template void processVolCopy(float* out, const float* in1, const float* in2, float param, const SDimensions& dims); + +template void processVol(float* out, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out, float fParam, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out, const float* in, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out, const float* in, float fParam, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, const SDimensions& dims); +template void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims); + +template void processSino(float* out, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out, float fParam, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out, const float* in, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out, const float* in, float fParam, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, const SDimensions& dims); +template void processSino(float* out, const float* in1, const float* in2, unsigned int pitch, const SDimensions& dims); template void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); template void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu index 4e69e8f..15e487c 100644 --- a/cuda/2d/astra.cu +++ b/cuda/2d/astra.cu @@ -268,17 +268,15 @@ bool AstraFBP::setSinogram(const float* pfSinogram, return false; bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, - pData->dims.iProjDets, - pData->dims.iProjAngles, + pData->dims, pData->D_sinoData, pData->sinoPitch); if (!ok) return false; // rescale sinogram to adjust for pixel size - processVol(pData->D_sinoData, + processSino(pData->D_sinoData, 1.0f/(pData->fPixelSize*pData->fPixelSize), - pData->sinoPitch, - pData->dims.iProjDets, pData->dims.iProjAngles); + pData->sinoPitch, pData->dims); pData->setStartReconstruction = false; @@ -390,8 +388,7 @@ bool AstraFBP::run() processVol(pData->D_volumeData, (M_PI / 2.0f) / (float)pData->dims.iProjAngles, - pData->volumePitch, - pData->dims.iVolWidth, pData->dims.iVolHeight); + pData->volumePitch, pData->dims); return true; } @@ -402,8 +399,7 @@ bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstr return false; bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, - pData->dims.iVolWidth, - pData->dims.iVolHeight, + pData->dims, pData->D_volumeData, pData->volumePitch); if (!ok) return false; @@ -682,7 +678,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram, } ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, - dims.iVolWidth, dims.iVolHeight, + dims, D_volumeData, volumePitch); if (!ok) { cudaFree(D_volumeData); @@ -699,8 +695,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram, } ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, - dims.iProjDets, - dims.iProjAngles, + dims, D_sinoData, sinoPitch); if (!ok) { cudaFree(D_volumeData); @@ -769,7 +764,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, } ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, - dims.iVolWidth, dims.iVolHeight, + dims, D_volumeData, volumePitch); if (!ok) { cudaFree(D_volumeData); @@ -808,8 +803,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, } ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, - dims.iProjDets, - dims.iProjAngles, + dims, D_sinoData, sinoPitch); if (!ok) { cudaFree(D_volumeData); @@ -880,7 +874,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, } ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, - dims.iVolWidth, dims.iVolHeight, + dims, D_volumeData, volumePitch); if (!ok) { cudaFree(D_volumeData); @@ -899,8 +893,7 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, } ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, - dims.iProjDets, - dims.iProjAngles, + dims, D_sinoData, sinoPitch); if (!ok) { cudaFree(D_volumeData); diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu index f4175e1..fce8beb 100644 --- a/cuda/2d/cgls.cu +++ b/cuda/2d/cgls.cu @@ -126,7 +126,7 @@ bool CGLS::iterate(unsigned int iterations) if (useVolumeMask) { // Use z as temporary storage here since it is unused cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_z, D_maskData, zPitch, dims); callFP(D_z, zPitch, D_r, rPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f); @@ -137,7 +137,7 @@ bool CGLS::iterate(unsigned int iterations) zeroVolumeData(D_p, pPitch, dims); callBP(D_p, pPitch, D_r, rPitch); if (useVolumeMask) - processVol(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_p, D_maskData, pPitch, dims); gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight); @@ -158,24 +158,24 @@ bool CGLS::iterate(unsigned int iterations) float alpha = gamma / ww; // x += alpha*p - processVol(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_p, alpha, volumePitch, dims); // r -= alpha*w - processVol(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); + processSino(D_r, D_w, -alpha, rPitch, dims); // z = A'*r zeroVolumeData(D_z, zPitch, dims); callBP(D_z, zPitch, D_r, rPitch); if (useVolumeMask) - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_z, D_maskData, zPitch, dims); float beta = 1.0f / gamma; gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight); beta *= gamma; // p = z + beta*p - processVol(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_p, D_z, beta, pPitch, dims); } @@ -194,7 +194,7 @@ float CGLS::computeDiffNorm() // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_z, D_maskData, zPitch, dims); callFP(D_z, zPitch, D_w, wPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f); diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu index 064913a..9b5141b 100644 --- a/cuda/2d/darthelper.cu +++ b/cuda/2d/darthelper.cu @@ -54,14 +54,19 @@ void roiSelect(float* out, float radius, unsigned int width, unsigned int height float* D_data; unsigned int pitch; - allocateVolume(D_data, width, height, pitch); - copyVolumeToDevice(out, width, width, height, D_data, pitch); + // We abuse dims here... + SDimensions dims; + dims.iVolWidth = width; + dims.iVolHeight = width; + + allocateVolumeData(D_data, pitch, dims); + copyVolumeToDevice(out, width, dims, D_data, pitch); dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); devRoiSelect<<>>(D_data, radius, pitch, width, height); - copyVolumeFromDevice(out, width, width, height, D_data, pitch); + copyVolumeFromDevice(out, width, dims, D_data, pitch); cudaFree(D_data); } @@ -237,11 +242,16 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne float* D_maskData; unsigned int pitch; - allocateVolume(D_segmentationData, width, height, pitch); - copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch); + // We abuse dims here... + SDimensions dims; + dims.iVolWidth = width; + dims.iVolHeight = width; + + allocateVolumeData(D_segmentationData, pitch, dims); + copyVolumeToDevice(segmentation, width, dims, D_segmentationData, pitch); - allocateVolume(D_maskData, width, height, pitch); - zeroVolume(D_maskData, pitch, width, height); + allocateVolumeData(D_maskData, pitch, dims); + zeroVolumeData(D_maskData, pitch, dims); dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -255,7 +265,7 @@ void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigne else devADartMaskRadius<<>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height); - copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch); + copyVolumeFromDevice(mask, width, dims, D_maskData, pitch); cudaFree(D_segmentationData); cudaFree(D_maskData); @@ -320,11 +330,16 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un float* D_outData; unsigned int pitch; - allocateVolume(D_inData, width, height, pitch); - copyVolumeToDevice(in, width, width, height, D_inData, pitch); + // We abuse dims here... + SDimensions dims; + dims.iVolWidth = width; + dims.iVolHeight = width; + + allocateVolumeData(D_inData, pitch, dims); + copyVolumeToDevice(in, width, dims, D_inData, pitch); - allocateVolume(D_outData, width, height, pitch); - zeroVolume(D_outData, pitch, width, height); + allocateVolumeData(D_outData, pitch, dims); + zeroVolumeData(D_outData, pitch, dims); dim3 blockSize(16,16); dim3 gridSize((width+15)/16, (height+15)/16); @@ -333,7 +348,7 @@ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, un else devDartSmoothingRadius<<>>(D_outData, D_inData, b, radius, pitch, width, height); - copyVolumeFromDevice(out, width, width, height, D_outData, pitch); + copyVolumeFromDevice(out, width, dims, D_outData, pitch); cudaFree(D_outData); cudaFree(D_inData); diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu index b281516..c75f250 100644 --- a/cuda/2d/em.cu +++ b/cuda/2d/em.cu @@ -101,15 +101,15 @@ bool EM::precomputeWeights() } else #endif { - processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + processSino(D_projData, 1.0f, projPitch, dims); callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); } - processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, pixelPitch, dims); #if 0 if (useVolumeMask) { // scale pixel weights with mask to zero out masked pixels - processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, D_maskData, pixelPitch, dims); } #endif @@ -133,14 +133,14 @@ bool EM::iterate(unsigned int iterations) callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f); // Divide sinogram by FP (into projData) - processVol(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); + processSino(D_projData, D_sinoData, projPitch, dims); // Do BP of projData into tmpData zeroVolumeData(D_tmpData, tmpPitch, dims); callBP(D_tmpData, tmpPitch, D_projData, projPitch); // Multiply volumeData with tmpData divided by pixel weights - processVol(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims); } @@ -155,7 +155,7 @@ float EM::computeDiffNorm() // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, D_maskData, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index 79c00ef..048661f 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -150,14 +150,14 @@ bool SART::precomputeWeights() zeroVolumeData(D_tmpData, tmpPitch, dims); - processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, 1.0f, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); cudaFree(D_tmpData); D_tmpData = 0; } - processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + processSino(D_lineWeight, linePitch, dims); return true; } @@ -185,7 +185,7 @@ bool SART::iterate(unsigned int iterations) // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, D_maskData, tmpPitch, dims); callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f); } else { callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f); @@ -198,15 +198,15 @@ bool SART::iterate(unsigned int iterations) // TODO: Try putting the masking directly in the BP zeroVolumeData(D_tmpData, tmpPitch, dims); callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle); - processVol(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_maskData, D_tmpData, volumePitch, dims); } else { callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle); } if (useMinConstraint) - processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMinConstraint, volumePitch, dims); if (useMaxConstraint) - processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMaxConstraint, volumePitch, dims); iteration++; @@ -228,7 +228,7 @@ float SART::computeDiffNorm() // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, D_maskData, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu index 1b0891a..c402864 100644 --- a/cuda/2d/sirt.cu +++ b/cuda/2d/sirt.cu @@ -114,14 +114,14 @@ bool SIRT::precomputeWeights() if (useVolumeMask) { callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); } else { - processVol(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, 1.0f, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); } - processVol(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + processSino(D_lineWeight, linePitch, dims); if (useSinogramMask) { // scale line weights with sinogram mask to zero out masked sinogram pixels - processVol(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); + processSino(D_lineWeight, D_smaskData, linePitch, dims); } @@ -129,14 +129,14 @@ bool SIRT::precomputeWeights() if (useSinogramMask) { callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); } else { - processVol(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + processSino(D_projData, 1.0f, projPitch, dims); callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); } - processVol(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, pixelPitch, dims); if (useVolumeMask) { // scale pixel weights with mask to zero out masked pixels - processVol(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_pixelWeight, D_maskData, pixelPitch, dims); } return true; @@ -162,7 +162,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD if (pfMinMaskData) { allocateVolumeData(D_minMaskData, minMaskPitch, dims); ok = copyVolumeToDevice(pfMinMaskData, iPitch, - dims.iVolWidth, dims.iVolHeight, + dims, D_minMaskData, minMaskPitch); } if (!ok) @@ -171,7 +171,7 @@ bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskD if (pfMaxMaskData) { allocateVolumeData(D_maxMaskData, maxMaskPitch, dims); ok = copyVolumeToDevice(pfMaxMaskData, iPitch, - dims.iVolWidth, dims.iVolHeight, + dims, D_maxMaskData, maxMaskPitch); } if (!ok) @@ -196,28 +196,28 @@ bool SIRT::iterate(unsigned int iterations) // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, D_maskData, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); } - processVol(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); + processSino(D_projData, D_lineWeight, projPitch, dims); zeroVolumeData(D_tmpData, tmpPitch, dims); callBP(D_tmpData, tmpPitch, D_projData, projPitch); - processVol(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims); if (useMinConstraint) - processVol(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMinConstraint, volumePitch, dims); if (useMaxConstraint) - processVol(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, fMaxConstraint, volumePitch, dims); if (D_minMaskData) - processVol(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_minMaskData, volumePitch, dims); if (D_maxMaskData) - processVol(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_volumeData, D_maxMaskData, volumePitch, dims); } return true; @@ -231,7 +231,7 @@ float SIRT::computeDiffNorm() // do FP, subtracting projection from sinogram if (useVolumeMask) { cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); - processVol(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + processVol(D_tmpData, D_maskData, tmpPitch, dims); callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); } else { callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); @@ -332,7 +332,7 @@ int main() delete[] angle; - copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copyVolumeFromDevice(img, dims.iVolWidth, dims, D_volumeData, volumePitch); saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index d5cbe44..8d3b625 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -33,9 +33,12 @@ $Id$ namespace astraCUDA { bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* outD_data, unsigned int out_pitch) { + size_t width = dims.iVolWidth; + size_t height = dims.iVolHeight; + // TODO: memory order cudaError_t err; err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); ASTRA_CUDA_ASSERT(err); @@ -44,9 +47,12 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, } bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* inD_data, unsigned int in_pitch) { + size_t width = dims.iVolWidth; + size_t height = dims.iVolHeight; + // TODO: memory order cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); ASTRA_CUDA_ASSERT(err); return true; @@ -54,18 +60,24 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* inD_data, unsigned int in_pitch) -{ +{ + size_t width = dims.iProjDets; + size_t height = dims.iProjAngles; + // TODO: memory order cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); ASTRA_CUDA_ASSERT(err); return true; } bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* outD_data, unsigned int out_pitch) -{ +{ + size_t width = dims.iProjDets; + size_t height = dims.iProjAngles; + // TODO: memory order cudaError_t err; err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); ASTRA_CUDA_ASSERT(err); @@ -99,25 +111,21 @@ void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned in bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims) { - // TODO: memory order return allocateVolume(D_ptr, dims.iVolWidth, dims.iVolHeight, pitch); } bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims) { - // TODO: memory order return allocateVolume(D_ptr, dims.iProjDets, dims.iProjAngles, pitch); } void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims) { - // TODO: memory order zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight); } void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims) { - // TODO: memory order zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles); } @@ -198,6 +206,7 @@ __global__ void reduce2D(float *g_idata, float *g_odata, float dotProduct2D(float* D_data, unsigned int pitch, unsigned int width, unsigned int height) { +#warning FIX MEMORY ORDER unsigned int bx = (width + 15) / 16; unsigned int by = (height + 127) / 128; unsigned int shared_mem2 = sizeof(float) * 16 * 16; diff --git a/cuda/2d/util.h b/cuda/2d/util.h index 3cffa08..83cb794 100644 --- a/cuda/2d/util.h +++ b/cuda/2d/util.h @@ -60,16 +60,16 @@ $Id$ namespace astraCUDA { bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* outD_data, unsigned int out_pitch); bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* inD_data, unsigned int in_pitch); bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* inD_data, unsigned int in_pitch); bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, - unsigned int width, unsigned int height, + const SDimensions& dims, float* outD_data, unsigned int out_pitch); bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch); diff --git a/src/CudaDataOperationAlgorithm.cpp b/src/CudaDataOperationAlgorithm.cpp index 50b2faa..dcb6c3c 100644 --- a/src/CudaDataOperationAlgorithm.cpp +++ b/src/CudaDataOperationAlgorithm.cpp @@ -130,47 +130,40 @@ void CCudaDataOperationAlgorithm::run(int _iNrIterations) astraCUDA::setGPUIndex(m_iGPUIndex); + astraCUDA::SDimensions dims; + // We slightly abuse dims here: width/height is not necessarily a volume + dims.iVolWidth = m_pData[0]->getWidth(); + dims.iVolHeight = m_pData[0]->getHeight(); + if (m_sOperation == "$1*s1" || m_sOperation == "$1.*s1") // data * scalar { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); if (m_pMask == NULL) - astraCUDA::processVolCopy(m_pData[0]->getData(), m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_fScalar[0], dims); else - astraCUDA::processVolCopy(m_pData[0]->getData(), m_pMask->getDataConst(), m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_pMask->getDataConst(), m_fScalar[0], dims); } else if (m_sOperation == "$1/s1" || m_sOperation == "$1./s1") // data / scalar { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); if (m_pMask == NULL) - astraCUDA::processVolCopy(m_pData[0]->getData(), 1.0f/m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), 1.0f/m_fScalar[0], dims); else - astraCUDA::processVolCopy(m_pData[0]->getData(), m_pMask->getDataConst(), 1.0f/m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_pMask->getDataConst(), 1.0f/m_fScalar[0], dims); } else if (m_sOperation == "$1+s1") // data + scalar { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); - astraCUDA::processVolCopy(m_pData[0]->getData(), m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_fScalar[0], dims); } else if (m_sOperation == "$1-s1") // data - scalar { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); - astraCUDA::processVolCopy(m_pData[0]->getData(), -m_fScalar[0], width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), -m_fScalar[0], dims); } else if (m_sOperation == "$1.*$2") // data .* data { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); - astraCUDA::processVolCopy(m_pData[0]->getData(), m_pData[1]->getDataConst(), width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_pData[1]->getDataConst(), dims); } else if (m_sOperation == "$1+$2") // data + data { - unsigned int width = m_pData[0]->getWidth(); - unsigned int height = m_pData[0]->getHeight(); - astraCUDA::processVolCopy(m_pData[0]->getData(), m_pData[1]->getDataConst(), width, height); + astraCUDA::processVolCopy(m_pData[0]->getData(), m_pData[1]->getDataConst(), dims); } } -- cgit v1.2.3