summaryrefslogtreecommitdiffstats
path: root/cuda/3d
diff options
context:
space:
mode:
Diffstat (limited to 'cuda/3d')
-rw-r--r--cuda/3d/algo3d.cu108
-rw-r--r--cuda/3d/algo3d.h68
-rw-r--r--cuda/3d/arith3d.cu610
-rw-r--r--cuda/3d/arith3d.h79
-rw-r--r--cuda/3d/astra3d.cu1620
-rw-r--r--cuda/3d/astra3d.h450
-rw-r--r--cuda/3d/cgls3d.cu428
-rw-r--r--cuda/3d/cgls3d.h114
-rw-r--r--cuda/3d/cone_bp.cu481
-rw-r--r--cuda/3d/cone_bp.h45
-rw-r--r--cuda/3d/cone_fp.cu513
-rw-r--r--cuda/3d/cone_fp.h46
-rw-r--r--cuda/3d/darthelper3d.cu229
-rw-r--r--cuda/3d/darthelper3d.h46
-rw-r--r--cuda/3d/dims3d.h84
-rw-r--r--cuda/3d/fdk.cu646
-rw-r--r--cuda/3d/fdk.h43
-rw-r--r--cuda/3d/par3d_bp.cu464
-rw-r--r--cuda/3d/par3d_bp.h45
-rw-r--r--cuda/3d/par3d_fp.cu814
-rw-r--r--cuda/3d/par3d_fp.h51
-rw-r--r--cuda/3d/sirt3d.cu533
-rw-r--r--cuda/3d/sirt3d.h118
-rw-r--r--cuda/3d/util3d.cu514
-rw-r--r--cuda/3d/util3d.h69
25 files changed, 8218 insertions, 0 deletions
diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu
new file mode 100644
index 0000000..20e7381
--- /dev/null
+++ b/cuda/3d/algo3d.cu
@@ -0,0 +1,108 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cassert>
+
+#include "algo3d.h"
+#include "cone_fp.h"
+#include "cone_bp.h"
+#include "par3d_fp.h"
+#include "par3d_bp.h"
+
+namespace astraCUDA3d {
+
+ReconAlgo3D::ReconAlgo3D()
+{
+ coneProjs = 0;
+ par3DProjs = 0;
+ shouldAbort = false;
+}
+
+ReconAlgo3D::~ReconAlgo3D()
+{
+ reset();
+}
+
+void ReconAlgo3D::reset()
+{
+ delete[] coneProjs;
+ coneProjs = 0;
+ delete[] par3DProjs;
+ par3DProjs = 0;
+ shouldAbort = false;
+}
+
+bool ReconAlgo3D::setConeGeometry(const SDimensions3D& _dims, const SConeProjection* _angles)
+{
+ dims = _dims;
+
+ coneProjs = new SConeProjection[dims.iProjAngles];
+ par3DProjs = 0;
+
+ memcpy(coneProjs, _angles, sizeof(coneProjs[0]) * dims.iProjAngles);
+
+ return true;
+}
+
+bool ReconAlgo3D::setPar3DGeometry(const SDimensions3D& _dims, const SPar3DProjection* _angles)
+{
+ dims = _dims;
+
+ par3DProjs = new SPar3DProjection[dims.iProjAngles];
+ coneProjs = 0;
+
+ memcpy(par3DProjs, _angles, sizeof(par3DProjs[0]) * dims.iProjAngles);
+
+ return true;
+}
+
+
+bool ReconAlgo3D::callFP(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData,
+ float outputScale)
+{
+ if (coneProjs) {
+ return ConeFP(D_volumeData, D_projData, dims, coneProjs, outputScale);
+ } else {
+ return Par3DFP(D_volumeData, D_projData, dims, par3DProjs, outputScale);
+ }
+}
+
+bool ReconAlgo3D::callBP(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData)
+{
+ if (coneProjs) {
+ return ConeBP(D_volumeData, D_projData, dims, coneProjs);
+ } else {
+ return Par3DBP(D_volumeData, D_projData, dims, par3DProjs);
+ }
+}
+
+
+
+}
diff --git a/cuda/3d/algo3d.h b/cuda/3d/algo3d.h
new file mode 100644
index 0000000..2b44f6f
--- /dev/null
+++ b/cuda/3d/algo3d.h
@@ -0,0 +1,68 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ALGO_H
+#define _CUDA_ALGO_H
+
+#include "dims3d.h"
+#include "util3d.h"
+
+namespace astraCUDA3d {
+
+class _AstraExport ReconAlgo3D {
+public:
+ ReconAlgo3D();
+ ~ReconAlgo3D();
+
+ bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs);
+ bool setPar3DGeometry(const SDimensions3D& dims, const SPar3DProjection* projs);
+
+ void signalAbort() { shouldAbort = true; }
+
+protected:
+ void reset();
+
+ bool callFP(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData,
+ float outputScale);
+ bool callBP(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData);
+
+ SDimensions3D dims;
+ SConeProjection* coneProjs;
+ SPar3DProjection* par3DProjs;
+
+ volatile bool shouldAbort;
+
+};
+
+
+}
+
+#endif
+
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
new file mode 100644
index 0000000..9a19be0
--- /dev/null
+++ b/cuda/3d/arith3d.cu
@@ -0,0 +1,610 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util3d.h"
+#include "arith3d.h"
+#include <cassert>
+
+namespace astraCUDA3d {
+
+struct opAddScaled {
+ __device__ void operator()(float& out, const float in, const float inp) {
+ out += in * inp;
+ }
+};
+struct opScaleAndAdd {
+ __device__ void operator()(float& out, const float in, const float inp) {
+ out = in + out * inp;
+ }
+};
+struct opAddMulScaled {
+ __device__ void operator()(float& out, const float in1, const float in2, const float inp) {
+ out += in1 * in2 * inp;
+ }
+};
+struct opAddMul {
+ __device__ void operator()(float& out, const float in1, const float in2) {
+ out += in1 * in2;
+ }
+};
+struct opMul {
+ __device__ void operator()(float& out, const float in) {
+ out *= in;
+ }
+};
+struct opMul2 {
+ __device__ void operator()(float& out, const float in1, const float in2) {
+ out *= in1 * in2;
+ }
+};
+struct opDividedBy {
+ __device__ void operator()(float& out, const float in) {
+ if (out > 0.000001f) // out is assumed to be positive
+ out = in / out;
+ else
+ out = 0.0f;
+ }
+};
+struct opInvert {
+ __device__ void operator()(float& out) {
+ if (out > 0.000001f) // out is assumed to be positive
+ out = 1 / out;
+ else
+ out = 0.0f;
+ }
+};
+struct opSet {
+ __device__ void operator()(float& out, const float inp) {
+ out = inp;
+ }
+};
+struct opClampMin {
+ __device__ void operator()(float& out, const float inp) {
+ if (out < inp)
+ out = inp;
+ }
+};
+struct opClampMax {
+ __device__ void operator()(float& out, const float inp) {
+ if (out > inp)
+ out = inp;
+ }
+};
+
+
+
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn1[off], pfIn2[off]);
+ off += pitch;
+ y++;
+ }
+}
+
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ if (x >= width) return;
+
+ unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
+ unsigned int off = (y+padY)*pitch+x+padX;
+ for (unsigned int i = 0; i < repeat; ++i) {
+ if (y >= height)
+ break;
+ op()(pfOut[off], pfIn1[off], pfIn2[off], fParam);
+ off += pitch;
+ y++;
+ }
+}
+
+
+
+
+
+
+
+
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+511)/512);
+
+ float *pfOut = (float*)out;
+
+ devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ float *pfOut = (float*)out;
+
+ devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ float *pfOut = (float*)out;
+ const float *pfIn = (const float*)in;
+
+ devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ float *pfOut = (float*)out;
+ const float *pfIn = (const float*)in;
+
+ devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ float *pfOut = (float*)out;
+ const float *pfIn1 = (const float*)in1;
+ const float *pfIn2 = (const float*)in2;
+
+ devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((width+15)/16, (height+15)/16);
+
+ float *pfOut = (float*)out;
+ const float *pfIn1 = (const float*)in1;
+ const float *pfIn2 = (const float*)in2;
+
+ devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
+
+ for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+ devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn = (float*)in.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+ float *pfOut = (float*)out.ptr;
+ float *pfIn1 = (float*)in1.ptr;
+ float *pfIn2 = (float*)in2.ptr;
+ unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
+
+ for (unsigned int i = 0; i < dims.iProjV; ++i) {
+ devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+ pfOut += step;
+ pfIn1 += step;
+ pfIn2 += step;
+ }
+
+ cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#define INST_DFtoD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+
+#define INST_DtoD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+
+#define INST_DDtoD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+#define INST_DDFtoD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+
+
+#define INST_toD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
+
+#define INST_FtoD(name) \
+ template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+ template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \
+ template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+
+
+
+INST_DFtoD(opAddScaled)
+INST_DFtoD(opScaleAndAdd)
+INST_DDFtoD(opAddMulScaled)
+INST_DDtoD(opAddMul)
+INST_DDtoD(opMul2)
+INST_DtoD(opMul)
+INST_DtoD(opDividedBy)
+INST_toD(opInvert)
+INST_FtoD(opSet)
+INST_FtoD(opClampMin)
+INST_FtoD(opClampMax)
+
+
+}
diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h
new file mode 100644
index 0000000..53c9b79
--- /dev/null
+++ b/cuda/3d/arith3d.h
@@ -0,0 +1,79 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH3D_H
+#define _CUDA_ARITH3D_H
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+struct opAddScaled;
+struct opScaleAndAdd;
+struct opAddMulScaled;
+struct opAddMul;
+struct opMul;
+struct opMul2;
+struct opDividedBy;
+struct opInvert;
+struct opSet;
+struct opClampMin;
+struct opClampMax;
+
+enum VolType {
+ SINO = 0,
+ VOL = 1
+};
+
+
+template<typename op, VolType t> void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height);
+
+template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+
+
+}
+
+#endif
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
new file mode 100644
index 0000000..fd4b370
--- /dev/null
+++ b/cuda/3d/astra3d.cu
@@ -0,0 +1,1620 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls3d.h"
+#include "sirt3d.h"
+#include "util3d.h"
+#include "cone_fp.h"
+#include "cone_bp.h"
+#include "par3d_fp.h"
+#include "par3d_bp.h"
+#include "fdk.h"
+#include "arith3d.h"
+#include "astra3d.h"
+
+#include <iostream>
+
+using namespace astraCUDA3d;
+
+namespace astra {
+
+enum CUDAProjectionType3d {
+ PROJ_PARALLEL,
+ PROJ_CONE
+};
+
+
+static SConeProjection* genConeProjections(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ double fOriginSourceDistance,
+ double fOriginDetectorDistance,
+ double fDetUSize,
+ double fDetVSize,
+ const float *pfAngles)
+{
+ SConeProjection base;
+ base.fSrcX = 0.0f;
+ base.fSrcY = -fOriginSourceDistance;
+ base.fSrcZ = 0.0f;
+
+ base.fDetSX = iProjU * fDetUSize * -0.5f;
+ base.fDetSY = fOriginDetectorDistance;
+ base.fDetSZ = iProjV * fDetVSize * -0.5f;
+
+ base.fDetUX = fDetUSize;
+ base.fDetUY = 0.0f;
+ base.fDetUZ = 0.0f;
+
+ base.fDetVX = 0.0f;
+ base.fDetVY = 0.0f;
+ base.fDetVZ = fDetVSize;
+
+ SConeProjection* p = new SConeProjection[iProjAngles];
+
+#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0)
+
+ for (unsigned int i = 0; i < iProjAngles; ++i) {
+ ROTATE0(Src, i, pfAngles[i]);
+ ROTATE0(DetS, i, pfAngles[i]);
+ ROTATE0(DetU, i, pfAngles[i]);
+ ROTATE0(DetV, i, pfAngles[i]);
+ }
+
+#undef ROTATE0
+
+ return p;
+}
+
+static SPar3DProjection* genPar3DProjections(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ double fDetUSize,
+ double fDetVSize,
+ const float *pfAngles)
+{
+ SPar3DProjection base;
+ base.fRayX = 0.0f;
+ base.fRayY = 1.0f;
+ base.fRayZ = 0.0f;
+
+ base.fDetSX = iProjU * fDetUSize * -0.5f;
+ base.fDetSY = 0.0f;
+ base.fDetSZ = iProjV * fDetVSize * -0.5f;
+
+ base.fDetUX = fDetUSize;
+ base.fDetUY = 0.0f;
+ base.fDetUZ = 0.0f;
+
+ base.fDetVX = 0.0f;
+ base.fDetVY = 0.0f;
+ base.fDetVZ = fDetVSize;
+
+ SPar3DProjection* p = new SPar3DProjection[iProjAngles];
+
+#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0)
+
+ for (unsigned int i = 0; i < iProjAngles; ++i) {
+ ROTATE0(Ray, i, pfAngles[i]);
+ ROTATE0(DetS, i, pfAngles[i]);
+ ROTATE0(DetU, i, pfAngles[i]);
+ ROTATE0(DetV, i, pfAngles[i]);
+ }
+
+#undef ROTATE0
+
+ return p;
+}
+
+
+
+
+class AstraSIRT3d_internal {
+public:
+ SDimensions3D dims;
+ CUDAProjectionType3d projType;
+
+ float* angles;
+ float fOriginSourceDistance;
+ float fOriginDetectorDistance;
+ float fSourceZ;
+ float fDetSize;
+
+ SConeProjection* projs;
+ SPar3DProjection* parprojs;
+
+ float fPixelSize;
+
+ bool initialized;
+ bool setStartReconstruction;
+
+ bool useVolumeMask;
+ bool useSinogramMask;
+
+ // Input/output
+ cudaPitchedPtr D_projData;
+ cudaPitchedPtr D_volumeData;
+ cudaPitchedPtr D_maskData;
+ cudaPitchedPtr D_smaskData;
+
+ SIRT sirt;
+};
+
+AstraSIRT3d::AstraSIRT3d()
+{
+ pData = new AstraSIRT3d_internal();
+
+ pData->angles = 0;
+ pData->D_projData.ptr = 0;
+ pData->D_volumeData.ptr = 0;
+ pData->D_maskData.ptr = 0;
+ pData->D_smaskData.ptr = 0;
+
+ pData->dims.iVolX = 0;
+ pData->dims.iVolY = 0;
+ pData->dims.iVolZ = 0;
+ pData->dims.iProjAngles = 0;
+ pData->dims.iProjU = 0;
+ pData->dims.iProjV = 0;
+ pData->dims.iRaysPerDetDim = 1;
+ pData->dims.iRaysPerVoxelDim = 1;
+
+ pData->projs = 0;
+
+ pData->initialized = false;
+ pData->setStartReconstruction = false;
+
+ pData->useVolumeMask = false;
+ pData->useSinogramMask = false;
+}
+
+AstraSIRT3d::~AstraSIRT3d()
+{
+ delete[] pData->angles;
+ pData->angles = 0;
+
+ delete[] pData->projs;
+ pData->projs = 0;
+
+ cudaFree(pData->D_projData.ptr);
+ pData->D_projData.ptr = 0;
+
+ cudaFree(pData->D_volumeData.ptr);
+ pData->D_volumeData.ptr = 0;
+
+ cudaFree(pData->D_maskData.ptr);
+ pData->D_maskData.ptr = 0;
+
+ cudaFree(pData->D_smaskData.ptr);
+ pData->D_smaskData.ptr = 0;
+
+ delete pData;
+ pData = 0;
+}
+
+bool AstraSIRT3d::setReconstructionGeometry(unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ/*,
+ float fPixelSize = 1.0f*/)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iVolX = iVolX;
+ pData->dims.iVolY = iVolY;
+ pData->dims.iVolZ = iVolZ;
+
+ return (iVolX > 0 && iVolY > 0 && iVolZ > 0);
+}
+
+
+bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection* projs)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+ return false;
+
+ pData->parprojs = new SPar3DProjection[iProjAngles];
+ memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0]));
+
+ pData->projType = PROJ_PARALLEL;
+
+ return true;
+}
+
+bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SPar3DProjection* p = genPar3DProjections(iProjAngles,
+ iProjU, iProjV,
+ fDetUSize, fDetVSize,
+ pfAngles);
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ pData->parprojs = p;
+ pData->projType = PROJ_PARALLEL;
+
+ return true;
+}
+
+
+
+bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection* projs)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+ return false;
+
+ pData->projs = new SConeProjection[iProjAngles];
+ memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0]));
+
+ pData->projType = PROJ_CONE;
+
+ return true;
+}
+
+bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SConeProjection* p = genConeProjections(iProjAngles,
+ iProjU, iProjV,
+ fOriginSourceDistance,
+ fOriginDetectorDistance,
+ fDetUSize, fDetVSize,
+ pfAngles);
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ pData->projs = p;
+ pData->projType = PROJ_CONE;
+
+ return true;
+}
+
+bool AstraSIRT3d::enableSuperSampling(unsigned int iVoxelSuperSampling,
+ unsigned int iDetectorSuperSampling)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0)
+ return false;
+
+ pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+ pData->dims.iRaysPerDetDim = iDetectorSuperSampling;
+
+ return true;
+}
+
+bool AstraSIRT3d::enableVolumeMask()
+{
+ if (pData->initialized)
+ return false;
+
+ bool ok = pData->sirt.enableVolumeMask();
+ pData->useVolumeMask = ok;
+
+ return ok;
+}
+
+bool AstraSIRT3d::enableSinogramMask()
+{
+ if (pData->initialized)
+ return false;
+
+ bool ok = pData->sirt.enableSinogramMask();
+ pData->useSinogramMask = ok;
+
+ return ok;
+}
+
+bool AstraSIRT3d::setGPUIndex(int index)
+{
+ cudaSetDevice(index);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+ return true;
+}
+
+bool AstraSIRT3d::init()
+{
+ fprintf(stderr, "001: %d\n", true);
+ if (pData->initialized)
+ return false;
+ fprintf(stderr, "002: %d\n", true);
+
+ if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0)
+ return false;
+ fprintf(stderr, "003: %d\n", true);
+
+ bool ok;
+
+ if (pData->projType == PROJ_PARALLEL) {
+ ok = pData->sirt.setPar3DGeometry(pData->dims, pData->parprojs);
+ } else {
+ ok = pData->sirt.setConeGeometry(pData->dims, pData->projs);
+ }
+ fprintf(stderr, "004: %d\n", ok);
+
+ if (!ok)
+ return false;
+
+ ok = pData->sirt.init();
+ if (!ok)
+ return false;
+ fprintf(stderr, "005: %d\n", ok);
+
+ pData->D_volumeData = allocateVolumeData(pData->dims);
+ ok = pData->D_volumeData.ptr;
+ if (!ok)
+ return false;
+ fprintf(stderr, "006: %d\n", ok);
+
+ fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV);
+ pData->D_projData = allocateProjectionData(pData->dims);
+ ok = pData->D_projData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ pData->D_volumeData.ptr = 0;
+ return false;
+ }
+ fprintf(stderr, "007: %d\n", ok);
+
+ if (pData->useVolumeMask) {
+ pData->D_maskData = allocateVolumeData(pData->dims);
+ ok = pData->D_maskData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ cudaFree(pData->D_projData.ptr);
+ pData->D_volumeData.ptr = 0;
+ pData->D_projData.ptr = 0;
+ return false;
+ }
+ }
+
+ if (pData->useSinogramMask) {
+ pData->D_smaskData = allocateProjectionData(pData->dims);
+ ok = pData->D_smaskData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ cudaFree(pData->D_projData.ptr);
+ cudaFree(pData->D_maskData.ptr);
+ pData->D_volumeData.ptr = 0;
+ pData->D_projData.ptr = 0;
+ pData->D_maskData.ptr = 0;
+ return false;
+ }
+ }
+ fprintf(stderr, "008: %d\n", ok);
+
+ pData->initialized = true;
+
+ return true;
+}
+
+bool AstraSIRT3d::setMinConstraint(float fMin)
+{
+ if (!pData->initialized)
+ return false;
+ return pData->sirt.setMinConstraint(fMin);
+}
+
+bool AstraSIRT3d::setMaxConstraint(float fMax)
+{
+ if (!pData->initialized)
+ return false;
+ return pData->sirt.setMaxConstraint(fMax);
+}
+
+bool AstraSIRT3d::setSinogram(const float* pfSinogram,
+ unsigned int iSinogramPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pfSinogram)
+ return false;
+
+ bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch);
+
+ if (!ok)
+ return false;
+
+ ok = pData->sirt.setBuffers(pData->D_volumeData, pData->D_projData);
+ if (!ok)
+ return false;
+
+ pData->setStartReconstruction = false;
+
+ return true;
+}
+
+bool AstraSIRT3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pData->useVolumeMask)
+ return false;
+ if (!pfMask)
+ return false;
+
+ bool ok = copyVolumeToDevice(pfMask, pData->D_maskData,
+ pData->dims, iMaskPitch);
+ if (!ok)
+ return false;
+
+ ok = pData->sirt.setVolumeMask(pData->D_maskData);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+bool AstraSIRT3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pData->useSinogramMask)
+ return false;
+ if (!pfMask)
+ return false;
+
+ bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch);
+
+ if (!ok)
+ return false;
+
+ ok = pData->sirt.setSinogramMask(pData->D_smaskData);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+bool AstraSIRT3d::setStartReconstruction(const float* pfReconstruction,
+ unsigned int iReconstructionPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pfReconstruction)
+ return false;
+
+ bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData,
+ pData->dims, iReconstructionPitch);
+ if (!ok)
+ return false;
+
+ pData->setStartReconstruction = true;
+
+ return true;
+}
+
+bool AstraSIRT3d::iterate(unsigned int iIterations)
+{
+ if (!pData->initialized)
+ return false;
+
+ if (!pData->setStartReconstruction)
+ zeroVolumeData(pData->D_volumeData, pData->dims);
+
+ bool ok = pData->sirt.iterate(iIterations);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+bool AstraSIRT3d::getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const
+{
+ if (!pData->initialized)
+ return false;
+
+ bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData,
+ pData->dims, iReconstructionPitch);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+void AstraSIRT3d::signalAbort()
+{
+ if (!pData->initialized)
+ return;
+
+ pData->sirt.signalAbort();
+}
+
+float AstraSIRT3d::computeDiffNorm()
+{
+ if (!pData->initialized)
+ return 0.0f; // FIXME: Error?
+
+ return pData->sirt.computeDiffNorm();
+}
+
+
+
+
+class AstraCGLS3d_internal {
+public:
+ SDimensions3D dims;
+ CUDAProjectionType3d projType;
+
+ float* angles;
+ float fOriginSourceDistance;
+ float fOriginDetectorDistance;
+ float fSourceZ;
+ float fDetSize;
+
+ SConeProjection* projs;
+ SPar3DProjection* parprojs;
+
+ float fPixelSize;
+
+ bool initialized;
+ bool setStartReconstruction;
+
+ bool useVolumeMask;
+ bool useSinogramMask;
+
+ // Input/output
+ cudaPitchedPtr D_projData;
+ cudaPitchedPtr D_volumeData;
+ cudaPitchedPtr D_maskData;
+ cudaPitchedPtr D_smaskData;
+
+ CGLS cgls;
+};
+
+AstraCGLS3d::AstraCGLS3d()
+{
+ pData = new AstraCGLS3d_internal();
+
+ pData->angles = 0;
+ pData->D_projData.ptr = 0;
+ pData->D_volumeData.ptr = 0;
+ pData->D_maskData.ptr = 0;
+ pData->D_smaskData.ptr = 0;
+
+ pData->dims.iVolX = 0;
+ pData->dims.iVolY = 0;
+ pData->dims.iVolZ = 0;
+ pData->dims.iProjAngles = 0;
+ pData->dims.iProjU = 0;
+ pData->dims.iProjV = 0;
+ pData->dims.iRaysPerDetDim = 1;
+ pData->dims.iRaysPerVoxelDim = 1;
+
+ pData->projs = 0;
+
+ pData->initialized = false;
+ pData->setStartReconstruction = false;
+
+ pData->useVolumeMask = false;
+ pData->useSinogramMask = false;
+}
+
+AstraCGLS3d::~AstraCGLS3d()
+{
+ delete[] pData->angles;
+ pData->angles = 0;
+
+ delete[] pData->projs;
+ pData->projs = 0;
+
+ cudaFree(pData->D_projData.ptr);
+ pData->D_projData.ptr = 0;
+
+ cudaFree(pData->D_volumeData.ptr);
+ pData->D_volumeData.ptr = 0;
+
+ cudaFree(pData->D_maskData.ptr);
+ pData->D_maskData.ptr = 0;
+
+ cudaFree(pData->D_smaskData.ptr);
+ pData->D_smaskData.ptr = 0;
+
+ delete pData;
+ pData = 0;
+}
+
+bool AstraCGLS3d::setReconstructionGeometry(unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ/*,
+ float fPixelSize = 1.0f*/)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iVolX = iVolX;
+ pData->dims.iVolY = iVolY;
+ pData->dims.iVolZ = iVolZ;
+
+ return (iVolX > 0 && iVolY > 0 && iVolZ > 0);
+}
+
+
+bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection* projs)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+ return false;
+
+ pData->parprojs = new SPar3DProjection[iProjAngles];
+ memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0]));
+
+ pData->projType = PROJ_PARALLEL;
+
+ return true;
+}
+
+bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SPar3DProjection* p = genPar3DProjections(iProjAngles,
+ iProjU, iProjV,
+ fDetUSize, fDetVSize,
+ pfAngles);
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ pData->parprojs = p;
+ pData->projType = PROJ_PARALLEL;
+
+ return true;
+}
+
+
+
+bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection* projs)
+{
+ if (pData->initialized)
+ return false;
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+ return false;
+
+ pData->projs = new SConeProjection[iProjAngles];
+ memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0]));
+
+ pData->projType = PROJ_CONE;
+
+ return true;
+}
+
+bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SConeProjection* p = genConeProjections(iProjAngles,
+ iProjU, iProjV,
+ fOriginSourceDistance,
+ fOriginDetectorDistance,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ pData->dims.iProjAngles = iProjAngles;
+ pData->dims.iProjU = iProjU;
+ pData->dims.iProjV = iProjV;
+
+ pData->projs = p;
+ pData->projType = PROJ_CONE;
+
+ return true;
+}
+
+bool AstraCGLS3d::enableSuperSampling(unsigned int iVoxelSuperSampling,
+ unsigned int iDetectorSuperSampling)
+{
+ if (pData->initialized)
+ return false;
+
+ if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0)
+ return false;
+
+ pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+ pData->dims.iRaysPerDetDim = iDetectorSuperSampling;
+
+ return true;
+}
+
+bool AstraCGLS3d::enableVolumeMask()
+{
+ if (pData->initialized)
+ return false;
+
+ bool ok = pData->cgls.enableVolumeMask();
+ pData->useVolumeMask = ok;
+
+ return ok;
+}
+
+#if 0
+bool AstraCGLS3d::enableSinogramMask()
+{
+ if (pData->initialized)
+ return false;
+
+ bool ok = pData->cgls.enableSinogramMask();
+ pData->useSinogramMask = ok;
+
+ return ok;
+}
+#endif
+
+bool AstraCGLS3d::setGPUIndex(int index)
+{
+ cudaSetDevice(index);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+ return true;
+}
+
+bool AstraCGLS3d::init()
+{
+ fprintf(stderr, "001: %d\n", true);
+ if (pData->initialized)
+ return false;
+ fprintf(stderr, "002: %d\n", true);
+
+ if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0)
+ return false;
+ fprintf(stderr, "003: %d\n", true);
+
+ bool ok;
+
+ if (pData->projType == PROJ_PARALLEL) {
+ ok = pData->cgls.setPar3DGeometry(pData->dims, pData->parprojs);
+ } else {
+ ok = pData->cgls.setConeGeometry(pData->dims, pData->projs);
+ }
+ fprintf(stderr, "004: %d\n", ok);
+
+ if (!ok)
+ return false;
+
+ ok = pData->cgls.init();
+ if (!ok)
+ return false;
+ fprintf(stderr, "005: %d\n", ok);
+
+ pData->D_volumeData = allocateVolumeData(pData->dims);
+ ok = pData->D_volumeData.ptr;
+ if (!ok)
+ return false;
+ fprintf(stderr, "006: %d\n", ok);
+
+ fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV);
+ pData->D_projData = allocateProjectionData(pData->dims);
+ ok = pData->D_projData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ pData->D_volumeData.ptr = 0;
+ return false;
+ }
+ fprintf(stderr, "007: %d\n", ok);
+
+ if (pData->useVolumeMask) {
+ pData->D_maskData = allocateVolumeData(pData->dims);
+ ok = pData->D_maskData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ cudaFree(pData->D_projData.ptr);
+ pData->D_volumeData.ptr = 0;
+ pData->D_projData.ptr = 0;
+ return false;
+ }
+ }
+
+ if (pData->useSinogramMask) {
+ pData->D_smaskData = allocateProjectionData(pData->dims);
+ ok = pData->D_smaskData.ptr;
+ if (!ok) {
+ cudaFree(pData->D_volumeData.ptr);
+ cudaFree(pData->D_projData.ptr);
+ cudaFree(pData->D_maskData.ptr);
+ pData->D_volumeData.ptr = 0;
+ pData->D_projData.ptr = 0;
+ pData->D_maskData.ptr = 0;
+ return false;
+ }
+ }
+ fprintf(stderr, "008: %d\n", ok);
+
+ pData->initialized = true;
+
+ return true;
+}
+
+#if 0
+bool AstraCGLS3d::setMinConstraint(float fMin)
+{
+ if (!pData->initialized)
+ return false;
+ return pData->cgls.setMinConstraint(fMin);
+}
+
+bool AstraCGLS3d::setMaxConstraint(float fMax)
+{
+ if (!pData->initialized)
+ return false;
+ return pData->cgls.setMaxConstraint(fMax);
+}
+#endif
+
+bool AstraCGLS3d::setSinogram(const float* pfSinogram,
+ unsigned int iSinogramPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pfSinogram)
+ return false;
+
+ bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch);
+
+ if (!ok)
+ return false;
+
+ ok = pData->cgls.setBuffers(pData->D_volumeData, pData->D_projData);
+ if (!ok)
+ return false;
+
+ pData->setStartReconstruction = false;
+
+ return true;
+}
+
+bool AstraCGLS3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pData->useVolumeMask)
+ return false;
+ if (!pfMask)
+ return false;
+
+ bool ok = copyVolumeToDevice(pfMask, pData->D_maskData,
+ pData->dims, iMaskPitch);
+ if (!ok)
+ return false;
+
+ ok = pData->cgls.setVolumeMask(pData->D_maskData);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+#if 0
+bool AstraCGLS3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pData->useSinogramMask)
+ return false;
+ if (!pfMask)
+ return false;
+
+ bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch);
+
+ if (!ok)
+ return false;
+
+ ok = pData->cgls.setSinogramMask(pData->D_smaskData);
+ if (!ok)
+ return false;
+
+ return true;
+}
+#endif
+
+bool AstraCGLS3d::setStartReconstruction(const float* pfReconstruction,
+ unsigned int iReconstructionPitch)
+{
+ if (!pData->initialized)
+ return false;
+ if (!pfReconstruction)
+ return false;
+
+ bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData,
+ pData->dims, iReconstructionPitch);
+ if (!ok)
+ return false;
+
+ pData->setStartReconstruction = true;
+
+ return true;
+}
+
+bool AstraCGLS3d::iterate(unsigned int iIterations)
+{
+ if (!pData->initialized)
+ return false;
+
+ if (!pData->setStartReconstruction)
+ zeroVolumeData(pData->D_volumeData, pData->dims);
+
+ bool ok = pData->cgls.iterate(iIterations);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+bool AstraCGLS3d::getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const
+{
+ if (!pData->initialized)
+ return false;
+
+ bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData,
+ pData->dims, iReconstructionPitch);
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+void AstraCGLS3d::signalAbort()
+{
+ if (!pData->initialized)
+ return;
+
+ pData->cgls.signalAbort();
+}
+
+float AstraCGLS3d::computeDiffNorm()
+{
+ if (!pData->initialized)
+ return 0.0f; // FIXME: Error?
+
+ return pData->cgls.computeDiffNorm();
+}
+
+
+
+bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling)
+{
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SConeProjection* p = genConeProjections(iProjAngles,
+ iProjU, iProjV,
+ fOriginSourceDistance,
+ fOriginDetectorDistance,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ bool ok;
+ ok = astraCudaConeFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ,
+ iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling);
+
+ delete[] p;
+
+ return ok;
+}
+
+bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerDetDim = iDetectorSuperSampling;
+
+ if (iDetectorSuperSampling == 0)
+ return false;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ bool ok = D_volumeData.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+ ok &= zeroProjectionData(D_projData, dims);
+
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ ok &= ConeFP(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+
+ ok &= copyProjectionsFromDevice(pfProjections, D_projData,
+ dims, dims.iProjU);
+
+
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling,
+ Cuda3DProjectionKernel projKernel)
+{
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SPar3DProjection* p = genPar3DProjections(iProjAngles,
+ iProjU, iProjV,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ bool ok;
+ ok = astraCudaPar3DFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ,
+ iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling,
+ projKernel);
+
+ delete[] p;
+
+ return ok;
+}
+
+
+bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling,
+ Cuda3DProjectionKernel projKernel)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerDetDim = iDetectorSuperSampling;
+
+ if (iDetectorSuperSampling == 0)
+ return false;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ bool ok = D_volumeData.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+ ok &= zeroProjectionData(D_projData, dims);
+
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ switch (projKernel) {
+ case ker3d_default:
+ ok &= Par3DFP(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+ break;
+ case ker3d_sum_square_weights:
+ ok &= Par3DFP_SumSqW(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+ break;
+ default:
+ assert(false);
+ }
+
+ ok &= copyProjectionsFromDevice(pfProjections, D_projData,
+ dims, dims.iProjU);
+
+
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SConeProjection* p = genConeProjections(iProjAngles,
+ iProjU, iProjV,
+ fOriginSourceDistance,
+ fOriginDetectorDistance,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ bool ok;
+ ok = astraCudaConeBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ,
+ iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling);
+
+ delete[] p;
+
+ return ok;
+}
+
+bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ bool ok = D_volumeData.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ ok &= copyProjectionsToDevice(pfProjections, D_projData,
+ dims, dims.iProjU);
+
+ ok &= zeroVolumeData(D_volumeData, dims);
+
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ ok &= ConeBP(D_volumeData, D_projData, dims, pfAngles);
+
+ ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SPar3DProjection* p = genPar3DProjections(iProjAngles,
+ iProjU, iProjV,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ bool ok;
+ ok = astraCudaPar3DBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ,
+ iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling);
+
+ delete[] p;
+
+ return ok;
+}
+
+
+bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ bool ok = D_volumeData.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ ok &= copyProjectionsToDevice(pfProjections, D_projData,
+ dims, dims.iProjU);
+
+ ok &= zeroVolumeData(D_volumeData, dims);
+
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles);
+
+ ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+
+
+bool astraCudaFDK(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ bool bShortScan,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+ if (iVoxelSuperSampling == 0)
+ return false;
+
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ bool ok = D_volumeData.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ ok &= copyProjectionsToDevice(pfProjections, D_projData, dims, dims.iProjU);
+
+ ok &= zeroVolumeData(D_volumeData, dims);
+
+ if (!ok) {
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ // TODO: Offer interface for SrcZ, DetZ
+ ok &= FDK(D_volumeData, D_projData, fOriginSourceDistance,
+ fOriginDetectorDistance, 0, 0, fDetUSize, fDetVSize,
+ dims, pfAngles, bShortScan);
+
+ ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+
+
+
+}
diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h
new file mode 100644
index 0000000..5712f89
--- /dev/null
+++ b/cuda/3d/astra3d.h
@@ -0,0 +1,450 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ASTRA3D_H
+#define _CUDA_ASTRA3D_H
+
+#include "dims3d.h"
+
+namespace astra {
+
+
+// TODO: Switch to a class hierarchy as with the 2D algorithms
+
+
+enum Cuda3DProjectionKernel {
+ ker3d_default = 0,
+ ker3d_sum_square_weights
+};
+
+
+class AstraSIRT3d_internal;
+
+
+class _AstraExport AstraSIRT3d {
+public:
+
+ AstraSIRT3d();
+ ~AstraSIRT3d();
+
+ // Set the number of pixels in the reconstruction rectangle,
+ // and the length of the edge of a pixel.
+ // Volume pixels are assumed to be square.
+ // This must be called before setting the projection geometry.
+ bool setReconstructionGeometry(unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ/*,
+ float fPixelSize = 1.0f*/);
+
+ bool setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection* projs);
+ bool setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fSourceZ,
+ float fDetSize,
+ const float *pfAngles);
+ bool setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection* projs);
+ bool setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fSourceZ,
+ float fDetSize,
+ const float *pfAngles);
+
+ // Enable supersampling.
+ //
+ // The number of rays used in FP is the square of iDetectorSuperSampling.
+ // The number of rays used in BP is the cube of iVoxelSuperSampling.
+ bool enableSuperSampling(unsigned int iVoxelSuperSampling,
+ unsigned int iDetectorSuperSampling);
+
+ // Enable volume/sinogram masks
+ //
+ // This may optionally be called before init().
+ // If it is called, setVolumeMask()/setSinogramMask() must be called between
+ // setSinogram() and iterate().
+ bool enableVolumeMask();
+ bool enableSinogramMask();
+
+ // Set GPU index
+ //
+ // This should be called before init(). Note that setting the GPU index
+ // in a thread which has already used the GPU may not work.
+ bool setGPUIndex(int index);
+
+ // Allocate GPU buffers and
+ // precompute geometry-specific data.
+ //
+ // This must be called after calling setReconstructionGeometry() and
+ // setProjectionGeometry() or setFanProjectionGeometry().
+ bool init();
+
+ // Setup input sinogram for a slice.
+ // pfSinogram must be a float array of size XXX
+ // NB: iSinogramPitch is measured in floats, not in bytes.
+ //
+ // This must be called after init(), and before iterate(). It may be
+ // called again after iterate()/getReconstruction() to start a new slice.
+ //
+ // pfSinogram will only be read from during this call.
+ bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+ // Setup volume mask for a slice.
+ // pfMask must be a float array of size XXX
+ // NB: iMaskPitch is measured in floats, not in bytes.
+ //
+ // It may only contain the exact values 0.0f and 1.0f. Only volume pixels
+ // for which pfMask[z] is 1.0f are processed.
+ bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch);
+
+ // Setup sinogram mask for a slice.
+ // pfMask must be a float array of size XXX
+ // NB: iMaskPitch is measured in floats, not in bytes.
+ //
+ // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels
+ // for which pfMask[z] is 1.0f are processed.
+ bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch);
+
+ // Set the starting reconstruction for SIRT.
+ // pfReconstruction must be a float array of size XXX
+ // NB: iReconstructionPitch is measured in floats, not in bytes.
+ //
+ // This may be called between setSinogram() and iterate().
+ // If this function is not called before iterate(), SIRT will start
+ // from a zero reconstruction.
+ //
+ // pfReconstruction will only be read from during this call.
+ bool setStartReconstruction(const float* pfReconstruction,
+ unsigned int iReconstructionPitch);
+
+ // Enable min/max constraint.
+ //
+ // These may optionally be called between init() and iterate()
+ bool setMinConstraint(float fMin);
+ bool setMaxConstraint(float fMax);
+
+ // Perform a number of (additive) SIRT iterations.
+ // This must be called after setSinogram().
+ //
+ // If called multiple times, without calls to setSinogram() or
+ // setStartReconstruction() in between, iterate() will continue from
+ // the result of the previous call.
+ // Calls to getReconstruction() are allowed between calls to iterate() and
+ // do not change the state.
+ bool iterate(unsigned int iIterations);
+
+ // Get the reconstructed slice.
+ // pfReconstruction must be a float array of size XXX
+ // NB: iReconstructionPitch is measured in floats, not in bytes.
+ //
+ // This may be called after iterate().
+ bool getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const;
+
+ // Compute the norm of the difference of the FP of the current
+ // reconstruction and the sinogram. (This performs one FP.)
+ // It can be called after iterate().
+ float computeDiffNorm();
+
+ // Signal the algorithm that it should abort after the current iteration.
+ // This is intended to be called from another thread.
+ void signalAbort();
+
+protected:
+ AstraSIRT3d_internal *pData;
+};
+
+
+class AstraCGLS3d_internal;
+
+
+class _AstraExport AstraCGLS3d {
+public:
+
+ AstraCGLS3d();
+ ~AstraCGLS3d();
+
+ // Set the number of pixels in the reconstruction rectangle,
+ // and the length of the edge of a pixel.
+ // Volume pixels are assumed to be square.
+ // This must be called before setting the projection geometry.
+ bool setReconstructionGeometry(unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ/*,
+ float fPixelSize = 1.0f*/);
+
+ bool setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection* projs);
+ bool setConeGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fSourceZ,
+ float fDetSize,
+ const float *pfAngles);
+ bool setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection* projs);
+ bool setPar3DGeometry(unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fSourceZ,
+ float fDetSize,
+ const float *pfAngles);
+
+ // Enable supersampling.
+ //
+ // The number of rays used in FP is the square of iDetectorSuperSampling.
+ // The number of rays used in BP is the cube of iVoxelSuperSampling.
+ bool enableSuperSampling(unsigned int iVoxelSuperSampling,
+ unsigned int iDetectorSuperSampling);
+
+ // Enable volume/sinogram masks
+ //
+ // This may optionally be called before init().
+ // If it is called, setVolumeMask()/setSinogramMask() must be called between
+ // setSinogram() and iterate().
+ bool enableVolumeMask();
+ //bool enableSinogramMask();
+
+ // Set GPU index
+ //
+ // This should be called before init(). Note that setting the GPU index
+ // in a thread which has already used the GPU may not work.
+ bool setGPUIndex(int index);
+
+ // Allocate GPU buffers and
+ // precompute geometry-specific data.
+ //
+ // This must be called after calling setReconstructionGeometry() and
+ // setProjectionGeometry() or setFanProjectionGeometry().
+ bool init();
+
+ // Setup input sinogram for a slice.
+ // pfSinogram must be a float array of size XXX
+ // NB: iSinogramPitch is measured in floats, not in bytes.
+ //
+ // This must be called after init(), and before iterate(). It may be
+ // called again after iterate()/getReconstruction() to start a new slice.
+ //
+ // pfSinogram will only be read from during this call.
+ bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+ // Setup volume mask for a slice.
+ // pfMask must be a float array of size XXX
+ // NB: iMaskPitch is measured in floats, not in bytes.
+ //
+ // It may only contain the exact values 0.0f and 1.0f. Only volume pixels
+ // for which pfMask[z] is 1.0f are processed.
+ bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch);
+
+ // Setup sinogram mask for a slice.
+ // pfMask must be a float array of size XXX
+ // NB: iMaskPitch is measured in floats, not in bytes.
+ //
+ // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels
+ // for which pfMask[z] is 1.0f are processed.
+ //bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch);
+
+ // Set the starting reconstruction for SIRT.
+ // pfReconstruction must be a float array of size XXX
+ // NB: iReconstructionPitch is measured in floats, not in bytes.
+ //
+ // This may be called between setSinogram() and iterate().
+ // If this function is not called before iterate(), SIRT will start
+ // from a zero reconstruction.
+ //
+ // pfReconstruction will only be read from during this call.
+ bool setStartReconstruction(const float* pfReconstruction,
+ unsigned int iReconstructionPitch);
+
+ // Enable min/max constraint.
+ //
+ // These may optionally be called between init() and iterate()
+ //bool setMinConstraint(float fMin);
+ //bool setMaxConstraint(float fMax);
+
+ // Perform a number of (additive) SIRT iterations.
+ // This must be called after setSinogram().
+ //
+ // If called multiple times, without calls to setSinogram() or
+ // setStartReconstruction() in between, iterate() will continue from
+ // the result of the previous call.
+ // Calls to getReconstruction() are allowed between calls to iterate() and
+ // do not change the state.
+ bool iterate(unsigned int iIterations);
+
+ // Get the reconstructed slice.
+ // pfReconstruction must be a float array of size XXX
+ // NB: iReconstructionPitch is measured in floats, not in bytes.
+ //
+ // This may be called after iterate().
+ bool getReconstruction(float* pfReconstruction,
+ unsigned int iReconstructionPitch) const;
+
+ // Compute the norm of the difference of the FP of the current
+ // reconstruction and the sinogram. (This performs one FP.)
+ // It can be called after iterate().
+ float computeDiffNorm();
+
+ // Signal the algorithm that it should abort after the current iteration.
+ // This is intended to be called from another thread.
+ void signalAbort();
+
+protected:
+ AstraCGLS3d_internal *pData;
+};
+
+
+
+_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling);
+
+_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling);
+
+_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling,
+ Cuda3DProjectionKernel projKernel);
+
+_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iDetectorSuperSampling,
+ Cuda3DProjectionKernel projKernel);
+
+
+_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SConeProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+_AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fOriginSourceDistance,
+ float fOriginDetectorDistance,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ bool bShortScan,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+}
+
+
+#endif
diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu
new file mode 100644
index 0000000..72bb9cd
--- /dev/null
+++ b/cuda/3d/cgls3d.cu
@@ -0,0 +1,428 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls3d.h"
+#include "util3d.h"
+#include "arith3d.h"
+#include "cone_fp.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA3d {
+
+CGLS::CGLS() : ReconAlgo3D()
+{
+ D_maskData.ptr = 0;
+ D_smaskData.ptr = 0;
+
+ D_sinoData.ptr = 0;
+ D_volumeData.ptr = 0;
+
+ D_r.ptr = 0;
+ D_w.ptr = 0;
+ D_z.ptr = 0;
+ D_p.ptr = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+}
+
+
+CGLS::~CGLS()
+{
+ reset();
+}
+
+void CGLS::reset()
+{
+ cudaFree(D_r.ptr);
+ cudaFree(D_w.ptr);
+ cudaFree(D_z.ptr);
+ cudaFree(D_p.ptr);
+
+ D_maskData.ptr = 0;
+ D_smaskData.ptr = 0;
+
+ D_sinoData.ptr = 0;
+ D_volumeData.ptr = 0;
+
+ D_r.ptr = 0;
+ D_w.ptr = 0;
+ D_z.ptr = 0;
+ D_p.ptr = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ sliceInitialized = false;
+
+ ReconAlgo3D::reset();
+}
+
+bool CGLS::enableVolumeMask()
+{
+ useVolumeMask = true;
+ return true;
+}
+
+bool CGLS::enableSinogramMask()
+{
+ useSinogramMask = true;
+ return true;
+}
+
+
+bool CGLS::init()
+{
+ D_z = allocateVolumeData(dims);
+ D_p = allocateVolumeData(dims);
+ D_r = allocateProjectionData(dims);
+ D_w = allocateProjectionData(dims);
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+bool CGLS::setVolumeMask(cudaPitchedPtr& _D_maskData)
+{
+ assert(useVolumeMask);
+
+ D_maskData = _D_maskData;
+
+ return true;
+}
+
+bool CGLS::setSinogramMask(cudaPitchedPtr& _D_smaskData)
+{
+ return false;
+#if 0
+ // TODO: Implement this
+ assert(useSinogramMask);
+
+ D_smaskData = _D_smaskData;
+ return true;
+#endif
+}
+
+bool CGLS::setBuffers(cudaPitchedPtr& _D_volumeData,
+ cudaPitchedPtr& _D_projData)
+{
+ D_volumeData = _D_volumeData;
+ D_sinoData = _D_projData;
+
+ fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr);
+
+ sliceInitialized = false;
+
+ return true;
+}
+
+bool CGLS::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+ if (!sliceInitialized) {
+
+ // copy sinogram
+ duplicateProjectionData(D_r, D_sinoData, dims);
+
+ // r = sino - A*x
+ if (useVolumeMask) {
+ duplicateVolumeData(D_z, D_volumeData, dims);
+ processVol3D<opMul>(D_z, D_maskData, dims);
+ callFP(D_z, D_r, -1.0f);
+ } else {
+ callFP(D_volumeData, D_r, -1.0f);
+ }
+
+ // p = A'*r
+ zeroVolumeData(D_p, dims);
+ callBP(D_p, D_r);
+ if (useVolumeMask)
+ processVol3D<opMul>(D_p, D_maskData, dims);
+
+ gamma = dotProduct3D(D_p, dims.iVolX, dims.iVolY, dims.iVolZ);
+
+ sliceInitialized = true;
+
+ }
+
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+ // w = A*p
+ zeroProjectionData(D_w, dims);
+ callFP(D_p, D_w, 1.0f);
+
+ // alpha = gamma / <w,w>
+ float ww = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV);
+ float alpha = gamma / ww;
+
+ // x += alpha*p
+ processVol3D<opAddScaled>(D_volumeData, D_p, alpha, dims);
+
+ // r -= alpha*w
+ processSino3D<opAddScaled>(D_r, D_w, -alpha, dims);
+
+ // z = A'*r
+ zeroVolumeData(D_z, dims);
+ callBP(D_z, D_r);
+ if (useVolumeMask)
+ processVol3D<opMul>(D_z, D_maskData, dims);
+
+ float beta = 1.0f / gamma;
+ gamma = dotProduct3D(D_z, dims.iVolX, dims.iVolY, dims.iVolZ);
+
+ beta *= gamma;
+
+ // p = z + beta*p
+ processVol3D<opScaleAndAdd>(D_p, D_z, beta, dims);
+ }
+
+ return true;
+}
+
+float CGLS::computeDiffNorm()
+{
+ // We can use w and z as temporary storage here since they're not
+ // used outside of iterations.
+
+ // copy sinogram to w
+ duplicateProjectionData(D_w, D_sinoData, dims);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ duplicateVolumeData(D_z, D_volumeData, dims);
+ processVol3D<opMul>(D_z, D_maskData, dims);
+ callFP(D_z, D_w, -1.0f);
+ } else {
+ callFP(D_volumeData, D_w, -1.0f);
+ }
+
+ float s = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV);
+ return sqrt(s);
+}
+
+
+bool doCGLS(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_sinoData,
+ cudaPitchedPtr& D_maskData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ unsigned int iterations)
+{
+ CGLS cgls;
+ bool ok = true;
+
+ ok &= cgls.setConeGeometry(dims, angles);
+ if (D_maskData.ptr)
+ ok &= cgls.enableVolumeMask();
+
+ if (!ok)
+ return false;
+
+ ok = cgls.init();
+ if (!ok)
+ return false;
+
+ if (D_maskData.ptr)
+ ok &= cgls.setVolumeMask(D_maskData);
+
+ ok &= cgls.setBuffers(D_volumeData, D_sinoData);
+ if (!ok)
+ return false;
+
+ ok = cgls.iterate(iterations);
+
+ return ok;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+
+int main()
+{
+ SDimensions3D dims;
+ dims.iVolX = 256;
+ dims.iVolY = 256;
+ dims.iVolZ = 256;
+ dims.iProjAngles = 100;
+ dims.iProjU = 512;
+ dims.iProjV = 512;
+ dims.iRaysPerDet = 1;
+
+ SConeProjection angle[100];
+ angle[0].fSrcX = -2905.6;
+ angle[0].fSrcY = 0;
+ angle[0].fSrcZ = 0;
+
+ angle[0].fDetSX = 694.4;
+ angle[0].fDetSY = -122.4704;
+ angle[0].fDetSZ = -122.4704;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = .4784;
+ //angle[0].fDetUY = .5;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = .4784;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 100; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Src, i, i*2*M_PI/100);
+ ROTATE0(DetS, i, i*2*M_PI/100);
+ ROTATE0(DetU, i, i*2*M_PI/100);
+ ROTATE0(DetV, i, i*2*M_PI/100);
+ }
+#undef ROTATE0
+
+
+ cudaPitchedPtr volData = allocateVolumeData(dims);
+ cudaPitchedPtr projData = allocateProjectionData(dims);
+ zeroProjectionData(projData, dims);
+
+ float* pbuf = new float[100*512*512];
+ copyProjectionsFromDevice(pbuf, projData, dims);
+ copyProjectionsToDevice(pbuf, projData, dims);
+ delete[] pbuf;
+
+#if 0
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256; ++i) {
+ for (unsigned int y = 0; y < 256; ++y)
+ for (unsigned int x = 0; x < 256; ++x)
+ slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f;
+
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+ }
+ astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+#else
+
+ for (int i = 0; i < 100; ++i) {
+ char fname[32];
+ sprintf(fname, "Tiffs/%04d.png", 4*i);
+ unsigned int w,h;
+ float* bufp = loadImage(fname, w,h);
+
+ for (int j = 0; j < 512*512; ++j) {
+ float v = bufp[j];
+ if (v > 236.0f) v = 236.0f;
+ v = logf(236.0f / v);
+ bufp[j] = 256*v;
+ }
+
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice);
+ }
+
+ delete[] bufp;
+
+ }
+#endif
+
+#if 0
+ float* bufs = new float[100*512];
+
+ for (int i = 0; i < 512; ++i) {
+ cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+ char fname[20];
+ sprintf(fname, "sino%03d.png", i);
+ saveImage(fname, 100, 512, bufs);
+ }
+
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "proj%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+
+ zeroVolumeData(volData, dims);
+
+ cudaPitchedPtr maskData;
+ maskData.ptr = 0;
+
+ astraCUDA3d::doCGLS(volData, projData, maskData, dims, angle, 50);
+#if 1
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[20];
+ sprintf(fname, "vol%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+
+ return 0;
+}
+#endif
+
diff --git a/cuda/3d/cgls3d.h b/cuda/3d/cgls3d.h
new file mode 100644
index 0000000..d16b571
--- /dev/null
+++ b/cuda/3d/cgls3d.h
@@ -0,0 +1,114 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CGLS3D_H
+#define _CUDA_CGLS3D_H
+
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+
+class _AstraExport CGLS : public ReconAlgo3D {
+public:
+ CGLS();
+ ~CGLS();
+
+// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs);
+
+
+ bool enableVolumeMask();
+ bool enableSinogramMask();
+
+ // init should be called after setting all geometry
+ bool init();
+
+ // setVolumeMask should be called after init and before iterate,
+ // but only if enableVolumeMask was called before init.
+ // It may be called again after iterate.
+ bool setVolumeMask(cudaPitchedPtr& D_maskData);
+
+ // setSinogramMask should be called after init and before iterate,
+ // but only if enableSinogramMask was called before init.
+ // It may be called again after iterate.
+ bool setSinogramMask(cudaPitchedPtr& D_smaskData);
+
+
+ // setBuffers should be called after init and before iterate.
+ // It may be called again after iterate.
+ bool setBuffers(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData);
+
+
+ // set Min/Max constraints. They may be called at any time, and will affect
+ // any iterate() calls afterwards.
+ bool setMinConstraint(float fMin) { return false; }
+ bool setMaxConstraint(float fMax) { return false; }
+
+ // iterate should be called after init and setBuffers.
+ // It may be called multiple times.
+ bool iterate(unsigned int iterations);
+
+ // Compute the norm of the difference of the FP of the current reconstruction
+ // and the sinogram. (This performs one FP.)
+ // It can be called after iterate.
+ float computeDiffNorm();
+
+protected:
+ void reset();
+
+ bool useVolumeMask;
+ bool useSinogramMask;
+
+ cudaPitchedPtr D_maskData;
+ cudaPitchedPtr D_smaskData;
+
+ // Input/output
+ cudaPitchedPtr D_sinoData;
+ cudaPitchedPtr D_volumeData;
+
+ // Temporary buffers
+ cudaPitchedPtr D_r;
+ cudaPitchedPtr D_w;
+ cudaPitchedPtr D_z;
+ cudaPitchedPtr D_p;
+
+ float gamma;
+
+ bool sliceInitialized;
+};
+
+_AstraExport bool doCGLS(cudaPitchedPtr D_volumeData, unsigned int volumePitch,
+ cudaPitchedPtr D_projData, unsigned int projPitch,
+ cudaPitchedPtr D_maskData, unsigned int maskPitch,
+ const SDimensions3D& dims, const SConeProjection* projs,
+ unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
new file mode 100644
index 0000000..7f8e320
--- /dev/null
+++ b/cuda/3d/cone_bp.cu
@@ -0,0 +1,481 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "cone_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_coneProjTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 16;
+
+static const unsigned int g_anglesPerBlock = 64;
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+
+static const unsigned g_MaxAngles = 1024;
+
+__constant__ float gC_Cux[g_MaxAngles];
+__constant__ float gC_Cuy[g_MaxAngles];
+__constant__ float gC_Cuz[g_MaxAngles];
+__constant__ float gC_Cuc[g_MaxAngles];
+__constant__ float gC_Cvx[g_MaxAngles];
+__constant__ float gC_Cvy[g_MaxAngles];
+__constant__ float gC_Cvz[g_MaxAngles];
+__constant__ float gC_Cvc[g_MaxAngles];
+__constant__ float gC_Cdx[g_MaxAngles];
+__constant__ float gC_Cdy[g_MaxAngles];
+__constant__ float gC_Cdz[g_MaxAngles];
+__constant__ float gC_Cdc[g_MaxAngles];
+
+
+bool bindProjDataTexture(const cudaArray* array)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneProjTexture.filterMode = cudaFilterModeLinear;
+ gT_coneProjTexture.normalized = false;
+
+ cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+
+__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+
+ const float fCux = gC_Cux[angle];
+ const float fCuy = gC_Cuy[angle];
+ const float fCuz = gC_Cuz[angle];
+ const float fCuc = gC_Cuc[angle];
+ const float fCvx = gC_Cvx[angle];
+ const float fCvy = gC_Cvy[angle];
+ const float fCvz = gC_Cvz[angle];
+ const float fCvc = gC_Cvc[angle];
+ const float fCdx = gC_Cdx[angle];
+ const float fCdy = gC_Cdy[angle];
+ const float fCdz = gC_Cdz[angle];
+ const float fCdc = gC_Cdc[angle];
+
+ const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
+ const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
+ const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz;
+
+ const float fU = fUNum / fDen + 1.0f;
+ const float fV = fVNum / fDen + 1.0f;
+
+ fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal;
+ }
+
+}
+
+// supersampling version
+__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+ const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+
+ const float fCux = gC_Cux[angle];
+ const float fCuy = gC_Cuy[angle];
+ const float fCuz = gC_Cuz[angle];
+ const float fCuc = gC_Cuc[angle];
+ const float fCvx = gC_Cvx[angle];
+ const float fCvy = gC_Cvy[angle];
+ const float fCvz = gC_Cvz[angle];
+ const float fCvc = gC_Cvc[angle];
+ const float fCdx = gC_Cdx[angle];
+ const float fCdy = gC_Cdy[angle];
+ const float fCdz = gC_Cdz[angle];
+ const float fCdc = gC_Cdc[angle];
+
+ float fXs = fX;
+ for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) {
+ float fYs = fY;
+ for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) {
+ float fZs = fZ;
+ for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) {
+
+ const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
+ const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
+ const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz;
+
+ const float fU = fUNum / fDen + 1.0f;
+ const float fV = fVNum / fDen + 1.0f;
+
+ fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+ fZs += fSubStep;
+ }
+ fYs += fSubStep;
+ }
+ fXs += fSubStep;
+ }
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+ }
+
+}
+
+
+bool ConeBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SConeProjection* angles)
+{
+ bindProjDataTexture(D_projArray);
+
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVY - (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVZ , Cux );
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVZ -(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVX , Cuy );
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVY , Cuz );
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fSrcX - (angles[i].fDetSX*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVX)*angles[i].fSrcY + (angles[i].fDetSX*angles[i].fDetVY - angles[i].fDetSY*angles[i].fDetVX)*angles[i].fSrcZ , Cuc );
+
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUZ-(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUY, Cvx );
+ TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUZ , Cvy );
+ TRANSFER_TO_CONSTANT((angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUY-(angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUX , Cvz );
+ TRANSFER_TO_CONSTANT( -(angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fSrcX + (angles[i].fDetSX*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUX)*angles[i].fSrcY - (angles[i].fDetSX*angles[i].fDetUY - angles[i].fDetSY*angles[i].fDetUX)*angles[i].fSrcZ , Cvc );
+
+ TRANSFER_TO_CONSTANT( angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY , Cdx );
+ TRANSFER_TO_CONSTANT( angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ , Cdy );
+ TRANSFER_TO_CONSTANT( angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX , Cdz );
+ TRANSFER_TO_CONSTANT( -angles[i].fSrcX * (angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY) - angles[i].fSrcY * (angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ) - angles[i].fSrcZ * (angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX) , Cdc );
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+ dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+ // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
+ if (dims.iRaysPerVoxelDim == 1)
+ dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+ else
+ dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+ }
+
+ cudaTextForceKernelsCompletion();
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+bool ConeBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles)
+{
+ // transfer projections to array
+
+ cudaArray* cuArray = allocateProjectionArray(dims);
+ transferProjectionsToArray(D_projData, cuArray, dims);
+
+ bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+int main()
+{
+ SDimensions3D dims;
+ dims.iVolX = 256;
+ dims.iVolY = 256;
+ dims.iVolZ = 256;
+ dims.iProjAngles = 180;
+ dims.iProjU = 512;
+ dims.iProjV = 512;
+ dims.iRaysPerDet = 1;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&volData, extentV);
+
+ cudaExtent extentP;
+ extentP.width = dims.iProjU*sizeof(float);
+ extentP.height = dims.iProjAngles;
+ extentP.depth = dims.iProjV;
+
+ cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&projData, extentP);
+ cudaMemset3D(projData, 0, extentP);
+
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 1.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+#if 0
+ if (i == 128) {
+ for (unsigned int j = 0; j < 256*256; ++j)
+ slice[j] = 0.0f;
+ }
+#endif
+ }
+
+
+ SConeProjection angle[180];
+ angle[0].fSrcX = -1536;
+ angle[0].fSrcY = 0;
+ angle[0].fSrcZ = 0;
+
+ angle[0].fDetSX = 512;
+ angle[0].fDetSY = -256;
+ angle[0].fDetSZ = -256;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = 1;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = 1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 180; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Src, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ ROTATE0(DetV, i, i*2*M_PI/180);
+ }
+#undef ROTATE0
+
+ astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+#if 0
+ float* bufs = new float[180*512];
+
+ for (int i = 0; i < 512; ++i) {
+ cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+ char fname[20];
+ sprintf(fname, "sino%03d.png", i);
+ saveImage(fname, 180, 512, bufs);
+ }
+
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 180; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "proj%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 0.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+ }
+
+ astraCUDA3d::ConeBP(volData, projData, dims, angle);
+#if 0
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize);
+
+ char fname[20];
+ sprintf(fname, "vol%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+
+}
+#endif
diff --git a/cuda/3d/cone_bp.h b/cuda/3d/cone_bp.h
new file mode 100644
index 0000000..c77714e
--- /dev/null
+++ b/cuda/3d/cone_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_BP_H
+#define _CUDA_CONE_BP_H
+
+namespace astraCUDA3d {
+
+_AstraExport bool ConeBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SConeProjection* angles);
+
+_AstraExport bool ConeBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles);
+
+
+}
+
+#endif
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
new file mode 100644
index 0000000..40dca4f
--- /dev/null
+++ b/cuda/3d/cone_fp.cu
@@ -0,0 +1,513 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_coneVolumeTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 64;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_SrcZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+bool bindVolumeDataTexture(const cudaArray* array)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneVolumeTexture.filterMode = cudaFilterModeLinear;
+ gT_coneVolumeTexture.normalized = false;
+
+ cudaBindTextureToArray(gT_coneVolumeTexture, array, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+ // threadIdx: x = ??? detector (u?)
+ // y = relative angle
+
+ // blockIdx: x = ??? detector (u+v?)
+ // y = angle block
+
+
+#define CONE_FP_BODY(c0,c1,c2) \
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+ if (angle >= endAngle) \
+ return; \
+ \
+ const float fSrcX = gC_SrcX[angle]; \
+ const float fSrcY = gC_SrcY[angle]; \
+ const float fSrcZ = gC_SrcZ[angle]; \
+ const float fDetUX = gC_DetUX[angle]; \
+ const float fDetUY = gC_DetUY[angle]; \
+ const float fDetUZ = gC_DetUZ[angle]; \
+ const float fDetVX = gC_DetVX[angle]; \
+ const float fDetVY = gC_DetVY[angle]; \
+ const float fDetVZ = gC_DetVZ[angle]; \
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+ \
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+ int endDetectorV = startDetectorV + g_detBlockV; \
+ if (endDetectorV > dims.iProjV) \
+ endDetectorV = dims.iProjV; \
+ \
+ int endSlice = startSlice + g_blockSlices; \
+ if (endSlice > dims.iVol##c0) \
+ endSlice = dims.iVol##c0; \
+ \
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+ { \
+ /* Trace ray from Src to (detectorU,detectorV) from */ \
+ /* X = startSlice to X = endSlice */ \
+ \
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+ \
+ /* (x) ( 1) ( 0) */ \
+ /* ray: (y) = (ay) * x + (by) */ \
+ /* (z) (az) (bz) */ \
+ \
+ const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \
+ const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \
+ const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \
+ const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \
+ \
+ const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+ \
+ float fVal = 0.0f; \
+ \
+ float f##c0 = startSlice + 1.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+ \
+ for (int s = startSlice; s < endSlice; ++s) \
+ { \
+ fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \
+ f##c0 += 1.0f; \
+ f##c1 += a##c1; \
+ f##c2 += a##c2; \
+ } \
+ \
+ fVal *= fDistCorr; \
+ \
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+ }
+
+#define CONE_FP_SS_BODY(c0,c1,c2) \
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+ if (angle >= endAngle) \
+ return; \
+ \
+ const float fSrcX = gC_SrcX[angle]; \
+ const float fSrcY = gC_SrcY[angle]; \
+ const float fSrcZ = gC_SrcZ[angle]; \
+ const float fDetUX = gC_DetUX[angle]; \
+ const float fDetUY = gC_DetUY[angle]; \
+ const float fDetUZ = gC_DetUZ[angle]; \
+ const float fDetVX = gC_DetVX[angle]; \
+ const float fDetVY = gC_DetVY[angle]; \
+ const float fDetVZ = gC_DetVZ[angle]; \
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+ \
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+ int endDetectorV = startDetectorV + g_detBlockV; \
+ if (endDetectorV > dims.iProjV) \
+ endDetectorV = dims.iProjV; \
+ \
+ int endSlice = startSlice + g_blockSlices; \
+ if (endSlice > dims.iVolX) \
+ endSlice = dims.iVolX; \
+ \
+ const float fSubStep = 1.0f/dims.iRaysPerDetDim; \
+ \
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+ { \
+ /* Trace ray from Src to (detectorU,detectorV) from */ \
+ /* X = startSlice to X = endSlice */ \
+ \
+ float fV = 0.0f; \
+ \
+ float fdU = detectorU - 0.5f + 0.5f*fSubStep; \
+ for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \
+ float fdV = detectorV - 0.5f + 0.5f*fSubStep; \
+ for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \
+ \
+ const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \
+ const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \
+ const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \
+ \
+ /* (x) ( 1) ( 0) */ \
+ /* ray: (y) = (ay) * x + (by) */ \
+ /* (z) (az) (bz) */ \
+ \
+ const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \
+ const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \
+ const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \
+ const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \
+ \
+ const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+ \
+ float fVal = 0.0f; \
+ \
+ float f##c0 = startSlice + 1.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+ \
+ for (int s = startSlice; s < endSlice; ++s) \
+ { \
+ fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \
+ f##c0 += 1.0f; \
+ f##c1 += a##c1; \
+ f##c2 += a##c2; \
+ } \
+ \
+ fVal *= fDistCorr; \
+ fV += fVal; \
+ \
+ } \
+ } \
+ \
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\
+ }
+
+
+
+
+
+__global__ void FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_BODY(X,Y,Z)
+}
+
+__global__ void FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_BODY(Y,X,Z)
+}
+
+__global__ void FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_BODY(Z,X,Y)
+}
+
+
+__global__ void FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_SS_BODY(X,Y,Z)
+}
+
+__global__ void FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_SS_BODY(Y,X,Z)
+}
+
+__global__ void FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+CONE_FP_SS_BODY(Z,X,Y)
+}
+
+
+
+bool ConeFP_Array(cudaArray *D_volArray,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ float fOutputScale)
+{
+ bindVolumeDataTexture(D_volArray);
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(SrcX);
+ TRANSFER_TO_CONSTANT(SrcY);
+ TRANSFER_TO_CONSTANT(SrcZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ int dir;
+ if (a != dims.iProjAngles) {
+ float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f));
+ float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f));
+ float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f));
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == dims.iProjAngles || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaTextForceKernelsCompletion();
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+bool ConeFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ float fOutputScale)
+{
+ // transfer volume to array
+
+ cudaArray* cuArray = allocateVolumeArray(dims);
+ transferVolumeToArray(D_volumeData, cuArray, dims);
+
+ bool ret = ConeFP_Array(cuArray, D_projData, dims, angles, fOutputScale);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+int main()
+{
+ SDimensions3D dims;
+ dims.iVolX = 256;
+ dims.iVolY = 256;
+ dims.iVolZ = 256;
+ dims.iProjAngles = 32;
+ dims.iProjU = 512;
+ dims.iProjV = 512;
+ dims.iRaysPerDet = 1;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&volData, extentV);
+
+ cudaExtent extentP;
+ extentP.width = dims.iProjU*sizeof(float);
+ extentP.height = dims.iProjV;
+ extentP.depth = dims.iProjAngles;
+
+ cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&projData, extentP);
+ cudaMemset3D(projData, 0, extentP);
+
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 1.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaError err = cudaMemcpy3D(&p);
+ assert(!err);
+ }
+
+
+ SConeProjection angle[32];
+ angle[0].fSrcX = -1536;
+ angle[0].fSrcY = 0;
+ angle[0].fSrcZ = 200;
+
+ angle[0].fDetSX = 512;
+ angle[0].fDetSY = -256;
+ angle[0].fDetSZ = -256;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = 1;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = 1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 32; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Src, i, i*1*M_PI/180);
+ ROTATE0(DetS, i, i*1*M_PI/180);
+ ROTATE0(DetU, i, i*1*M_PI/180);
+ ROTATE0(DetV, i, i*1*M_PI/180);
+ }
+#undef ROTATE0
+
+ astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+ float* buf = new float[512*512];
+
+ cudaMemcpy(buf, ((float*)projData.ptr)+512*512*8, 512*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+ saveImage("proj.png", 512, 512, buf);
+
+
+}
+#endif
diff --git a/cuda/3d/cone_fp.h b/cuda/3d/cone_fp.h
new file mode 100644
index 0000000..2a0463b
--- /dev/null
+++ b/cuda/3d/cone_fp.h
@@ -0,0 +1,46 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_FP_H
+#define _CUDA_CONE_FP_H
+
+namespace astraCUDA3d {
+
+_AstraExport bool ConeFP_Array(cudaArray *D_volArray,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ float fOutputScale);
+
+_AstraExport bool ConeFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu
new file mode 100644
index 0000000..68330a1
--- /dev/null
+++ b/cuda/3d/darthelper3d.cu
@@ -0,0 +1,229 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util3d.h"
+#include "dims3d.h"
+#include "darthelper3d.h"
+#include <cassert>
+
+namespace astraCUDA3d {
+
+
+ // -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ __global__ void devDartSmoothing(cudaPitchedPtr out, cudaPitchedPtr in, float b, SDimensions3D dims)
+ {
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) {
+
+ float* d = (float*)in.ptr;
+ float* m = (float*)out.ptr;
+
+ unsigned int index;
+ unsigned int p = (out.pitch >> 2);
+
+ for (unsigned int z = 0; z <= dims.iVolZ-1; z++) {
+
+ float res = 0.0f;
+
+ // bottom slice
+ if (z > 0) {
+ index = ((z-1)*dims.iVolY + y) * p + x;
+ res += d[index-p-1] + d[index-p] + d[index-p+1] +
+ d[index -1] + d[index ] + d[index +1] +
+ d[index+p-1] + d[index+p] + d[index+p+1];
+ }
+
+ // top slice
+ if (z < dims.iVolZ-1) {
+ index = ((z+1)*dims.iVolY + y) * p + x;
+ res += d[index-p-1] + d[index-p] + d[index-p+1] +
+ d[index -1] + d[index ] + d[index +1] +
+ d[index+p-1] + d[index+p] + d[index+p+1];
+ }
+
+ // same slice
+ index = (z*dims.iVolY + y) * p + x;
+ res += d[index-p-1] + d[index-p] + d[index-p+1] +
+ d[index -1] + d[index +1] +
+ d[index+p-1] + d[index+p] + d[index+p+1];
+
+ // result
+ m[index] = (1.0f-b) * d[index] + b * 0.038461538f * res;
+
+ }
+
+ }
+ }
+
+ // -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims)
+ {
+ cudaPitchedPtr D_inData;
+ D_inData = allocateVolumeData(dims);
+ copyVolumeToDevice(in, D_inData, dims);
+
+ cudaPitchedPtr D_outData;
+ D_outData = allocateVolumeData(dims);
+ copyVolumeToDevice(out, D_outData, dims);
+
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16);
+
+ devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, dims);
+
+ copyVolumeFromDevice(out, D_outData, dims);
+
+ cudaFree(D_outData.ptr);
+ cudaFree(D_inData.ptr);
+
+ }
+
+
+ // -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ // CUDA function for the masking of DART with a radius == 1
+ __global__ void devDartMasking(cudaPitchedPtr mask, cudaPitchedPtr in, unsigned int conn, SDimensions3D dims)
+ {
+ unsigned int x = threadIdx.x + 16*blockIdx.x;
+ unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+ // Sacrifice the border pixels to simplify the implementation.
+ if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) {
+
+ float* d = (float*)in.ptr;
+ float* m = (float*)mask.ptr;
+
+ unsigned int index;
+ unsigned int p = (in.pitch >> 2);
+
+ for (unsigned int z = 0; z <= dims.iVolZ-1; z++) {
+
+ unsigned int o2 = (z*dims.iVolY + y) * p + x;
+
+ m[o2] = 0.0f;
+
+ // bottom slice
+ if (z > 0) {
+ index = ((z-1)*dims.iVolY + y) * p + x;
+ if ((conn == 26 &&
+ (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] ||
+ d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] ||
+ d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+ ||
+ (conn == 6 && d[index] != d[o2]))
+ {
+ m[o2] = 1.0f;
+ continue;
+ }
+ }
+
+ // top slice
+ if (z < dims.iVolZ-1) {
+ index = ((z+1)*dims.iVolY + y) * p + x;
+ if ((conn == 26 &&
+ (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] ||
+ d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] ||
+ d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+ ||
+ (conn == 6 && d[index] != d[o2]))
+ {
+ m[o2] = 1.0f;
+ continue;
+ }
+ }
+
+ // other slices
+ index = (z*dims.iVolY + y) * p + x;
+ if ((conn == 26 &&
+ (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] ||
+ d[index -1] != d[o2] || d[index +1] != d[o2] ||
+ d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+ ||
+ (conn == 6 &&
+ ( d[index-p] != d[o2] ||
+ d[index -1] != d[o2] || d[index +1] != d[o2] ||
+ d[index+p] != d[o2] )))
+ {
+ m[o2] = 1.0f;
+ continue;
+ }
+
+ }
+
+ }
+ }
+
+
+
+ // -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ void dartMasking(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims)
+ {
+ cudaPitchedPtr D_maskData;
+ D_maskData = allocateVolumeData(dims);
+ copyVolumeToDevice(mask, D_maskData, dims);
+
+ cudaPitchedPtr D_segmentationData;
+ D_segmentationData = allocateVolumeData(dims);
+ copyVolumeToDevice(segmentation, D_segmentationData, dims);
+
+ dim3 blockSize(16,16);
+ dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16);
+
+ if (threshold == 1 && radius == 1)
+ devDartMasking<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, dims);
+ //else if (threshold > 1 && radius == 1)
+ // devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+ //else if (threshold == 1 && radius > 1)
+ // devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+ //else
+ // devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+
+ copyVolumeFromDevice(mask, D_maskData, dims);
+
+ cudaFree(D_maskData.ptr);
+ cudaFree(D_segmentationData.ptr);
+
+ }
+ // -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ bool setGPUIndex(int iGPUIndex)
+ {
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+
+ return true;
+ }
+
+
+}
diff --git a/cuda/3d/darthelper3d.h b/cuda/3d/darthelper3d.h
new file mode 100644
index 0000000..7899629
--- /dev/null
+++ b/cuda/3d/darthelper3d.h
@@ -0,0 +1,46 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DARTHELPER3_H
+#define _CUDA_DARTHELPER3_H
+
+#include <cuda.h>
+#include <driver_types.h>
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+
+ void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims);
+ void dartMasking(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims);
+
+ bool setGPUIndex(int index);
+
+}
+
+#endif
diff --git a/cuda/3d/dims3d.h b/cuda/3d/dims3d.h
new file mode 100644
index 0000000..ec3c4a3
--- /dev/null
+++ b/cuda/3d/dims3d.h
@@ -0,0 +1,84 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_DIMS_H
+#define _CUDA_CONE_DIMS_H
+
+namespace astra {
+
+struct SConeProjection {
+ // the source
+ double fSrcX, fSrcY, fSrcZ;
+
+ // the origin ("bottom left") of the (flat-panel) detector
+ double fDetSX, fDetSY, fDetSZ;
+
+ // the U-edge of a detector pixel
+ double fDetUX, fDetUY, fDetUZ;
+
+ // the V-edge of a detector pixel
+ double fDetVX, fDetVY, fDetVZ;
+};
+
+struct SPar3DProjection {
+ // the ray direction
+ double fRayX, fRayY, fRayZ;
+
+ // the origin ("bottom left") of the (flat-panel) detector
+ double fDetSX, fDetSY, fDetSZ;
+
+ // the U-edge of a detector pixel
+ double fDetUX, fDetUY, fDetUZ;
+
+ // the V-edge of a detector pixel
+ double fDetVX, fDetVY, fDetVZ;
+};
+
+}
+
+
+namespace astraCUDA3d {
+
+using astra::SConeProjection;
+using astra::SPar3DProjection;
+
+struct SDimensions3D {
+ unsigned int iVolX;
+ unsigned int iVolY;
+ unsigned int iVolZ;
+ unsigned int iProjAngles;
+ unsigned int iProjU; // number of detectors in the U direction
+ unsigned int iProjV; // number of detectors in the V direction
+ unsigned int iRaysPerDetDim;
+ unsigned int iRaysPerVoxelDim;
+};
+
+}
+
+#endif
+
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
new file mode 100644
index 0000000..ad0604c
--- /dev/null
+++ b/cuda/3d/fdk.cu
@@ -0,0 +1,646 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "cone_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+#include "../2d/fft.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_coneProjTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 16;
+
+static const unsigned int g_anglesPerBlock = 64;
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+
+static const unsigned int g_anglesPerWeightBlock = 16;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 2048;
+
+__constant__ float gC_angle_sin[g_MaxAngles];
+__constant__ float gC_angle_cos[g_MaxAngles];
+__constant__ float gC_angle[g_MaxAngles];
+
+
+// per-detector u/v shifts?
+
+static bool bindProjDataTexture(const cudaArray* array)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_coneProjTexture.filterMode = cudaFilterModeLinear;
+ gT_coneProjTexture.normalized = false;
+
+ cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+
+__global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fInvDetUSize, float fInvDetVSize, const SDimensions3D dims)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X > dims.iVolX)
+ return;
+ if (Y > dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ;
+
+ const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f;
+ const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ);
+
+ // Note re. fZ/rV_base: the computations below are all relative to the
+ // optical axis, so we do the Z-adjustments beforehand.
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+
+ const float cos_theta = gC_angle_cos[angle];
+ const float sin_theta = gC_angle_sin[angle];
+
+ const float fR = fSrcOrigin;
+ const float fD = fR - fX * sin_theta + fY * cos_theta;
+ float fWeight = fR / fD;
+ fWeight *= fWeight;
+
+ const float fScaleFactor = (fR + fDetOrigin) / fD;
+ const float fU = fU_base + (fX*cos_theta+fY*sin_theta) * fScaleFactor * fInvDetUSize;
+ const float fV = fV_base + fZ * fScaleFactor * fInvDetVSize;
+
+ fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal;
+// projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU] = 10.0f;
+// if (threadIdx.x == 0 && threadIdx.y == 0) { printf("%d,%d,%d [%d / %d] -> %f\n", angle, detectorU, detectorV, (angle*dims.iProjV+detectorV)*projPitch+detectorU, projPitch, projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU]); }
+ }
+
+}
+
+
+bool FDK_BP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ float fSrcOrigin, float fDetOrigin,
+ float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+ const SDimensions3D& dims, const float* angles)
+{
+ // transfer projections to array
+
+ cudaArray* cuArray = allocateProjectionArray(dims);
+ transferProjectionsToArray(D_projData, cuArray, dims);
+
+ bindProjDataTexture(cuArray);
+
+ float* angle_sin = new float[dims.iProjAngles];
+ float* angle_cos = new float[dims.iProjAngles];
+
+ for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+ angle_sin[i] = sinf(angles[i]);
+ angle_cos[i] = cosf(angles[i]);
+ }
+ cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+ assert(e1 == cudaSuccess);
+ assert(e2 == cudaSuccess);
+
+ delete[] angle_sin;
+ delete[] angle_cos;
+
+ dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+ dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+ devBP_FDK<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, 1.0f / fDetUSize, 1.0f / fDetVSize, dims);
+ }
+
+ cudaTextForceKernelsCompletion();
+
+ cudaFreeArray(cuArray);
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, const SDimensions3D dims)
+{
+ float* projData = (float*)D_projData;
+ int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ // We need the length of the central ray and the length of the ray(s) to
+ // our detector pixel(s).
+
+ const float fCentralRayLength = fSrcOrigin + fDetOrigin;
+
+ const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize;
+
+ const float fT = fCentralRayLength * fCentralRayLength + fU * fU;
+
+ float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fDetZ - fSrcZ;
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+ const float fRayLength = sqrtf(fT + fV * fV);
+
+ const float fWeight = fCentralRayLength / fRayLength;
+
+ projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight;
+
+ fV += 1.0f;
+ }
+}
+
+__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims)
+{
+ float* projData = (float*)D_projData;
+ int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y;
+ if (angle >= endAngle)
+ return;
+
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+ int endDetectorV = startDetectorV + g_detBlockV;
+ if (endDetectorV > dims.iProjV)
+ endDetectorV = dims.iProjV;
+
+ // We need the length of the central ray and the length of the projection
+ // of our ray onto the central slice
+
+ const float fCentralRayLength = fSrcOrigin + fDetOrigin;
+
+ // TODO: Detector pixel size
+ const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize;
+
+ //const float fGamma = atanf(fU / fCentralRayLength);
+ //const float fBeta = gC_angle[angle];
+ const float fGamma = atanf(fU / fCentralRayLength);
+ const float fBeta = -gC_angle[angle];
+
+ // compute the weight depending on the location in the central fan's radon
+ // space
+ float fWeight;
+
+ if (fBeta <= 0.0f) {
+ fWeight = 0.0f;
+ } else if (fBeta <= 2.0f*(fCentralFanAngle + fGamma)) {
+ fWeight = sinf((M_PI / 4.0f) * fBeta / (fCentralFanAngle + fGamma));
+ fWeight *= fWeight;
+ } else if (fBeta <= M_PI + 2*fGamma) {
+ fWeight = 1.0f;
+ } else if (fBeta <= M_PI + 2*fCentralFanAngle) {
+ fWeight = sinf((M_PI / 4.0f) * (M_PI + 2.0f*fCentralFanAngle - fBeta) / (fCentralFanAngle - fGamma));
+ fWeight *= fWeight;
+ } else {
+ fWeight = 0.0f;
+ }
+
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+ {
+
+ projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight;
+
+ }
+}
+
+
+
+// Perform the FDK pre-weighting and filtering
+bool FDK_Filter(cudaPitchedPtr D_projData,
+ cufftComplex * D_filter,
+ float fSrcOrigin, float fDetOrigin,
+ float fSrcZ, float fDetZ,
+ float fDetUSize, float fDetVSize, bool bShortScan,
+ const SDimensions3D& dims, const float* angles)
+{
+ // The pre-weighting factor for a ray is the cosine of the angle between
+ // the central line and the ray.
+
+ dim3 dimBlock(g_detBlockU, g_anglesPerWeightBlock);
+ dim3 dimGrid( ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+ (dims.iProjAngles+g_anglesPerWeightBlock-1)/g_anglesPerWeightBlock);
+
+ int projPitch = D_projData.pitch/sizeof(float);
+
+ devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, dims);
+
+ cudaTextForceKernelsCompletion();
+
+ if (bShortScan) {
+ // We do short-scan Parker weighting
+
+ cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, angles,
+ dims.iProjAngles*sizeof(float), 0,
+ cudaMemcpyHostToDevice);
+ assert(!e1);
+
+ // TODO: detector pixel size!
+ float fCentralFanAngle = atanf((dims.iProjU*0.5f) /
+ (fSrcOrigin + fDetOrigin));
+
+ devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fCentralFanAngle, dims);
+
+ }
+
+ cudaTextForceKernelsCompletion();
+
+
+ // The filtering is a regular ramp filter per detector line.
+
+ int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU);
+ int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount);
+
+
+
+ // We process one sinogram at a time.
+ float* D_sinoData = (float*)D_projData.ptr;
+
+ cufftComplex * D_sinoFFT = NULL;
+ allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT);
+
+ bool ok = true;
+
+ for (int v = 0; v < dims.iProjV; ++v) {
+
+ ok = runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, 0,
+ dims.iProjU, iPaddedDetCount, iHalfFFTSize,
+ D_sinoFFT);
+
+ if (!ok) break;
+
+ applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter);
+
+
+ ok = runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch,
+ 0, dims.iProjU, iPaddedDetCount, iHalfFFTSize);
+
+ if (!ok) break;
+
+ D_sinoData += (dims.iProjAngles * projPitch);
+ }
+
+ freeComplexOnDevice(D_sinoFFT);
+
+ return ok;
+}
+
+
+bool FDK(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ float fSrcOrigin, float fDetOrigin,
+ float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+ const SDimensions3D& dims, const float* angles, bool bShortScan)
+{
+ bool ok;
+ // Generate filter
+ // TODO: Check errors
+ cufftComplex * D_filter;
+ int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU);
+ int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount);
+
+ cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize];
+ memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize);
+
+ genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize);
+
+ allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter);
+ uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter);
+
+ delete [] pHostFilter;
+
+
+ // Perform filtering
+
+ ok = FDK_Filter(D_projData, D_filter, fSrcOrigin, fDetOrigin,
+ fSrcZ, fDetZ, fDetUSize, fDetVSize,
+ bShortScan, dims, angles);
+
+ // Clean up filter
+ freeComplexOnDevice(D_filter);
+
+
+ if (!ok)
+ return false;
+
+ // Perform BP
+
+ ok = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ,
+ fDetUSize, fDetVSize, dims, angles);
+
+ if (!ok)
+ return false;
+
+ return true;
+}
+
+
+}
+
+#ifdef STANDALONE
+void dumpVolume(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+ float* buf = new float[dims.iVolX*dims.iVolY];
+ unsigned int pitch = data.pitch / sizeof(float);
+
+ for (int i = 0; i < dims.iVolZ; ++i) {
+ cudaMemcpy2D(buf, dims.iVolX*sizeof(float), ((float*)data.ptr)+pitch*dims.iVolY*i, data.pitch, dims.iVolX*sizeof(float), dims.iVolY, cudaMemcpyDeviceToHost);
+
+ char fname[512];
+ sprintf(fname, filespec, dims.iVolZ-i-1);
+ saveImage(fname, dims.iVolY, dims.iVolX, buf, fMin, fMax);
+ }
+}
+
+void dumpSinograms(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+ float* bufs = new float[dims.iProjAngles*dims.iProjU];
+ unsigned int pitch = data.pitch / sizeof(float);
+
+ for (int i = 0; i < dims.iProjV; ++i) {
+ cudaMemcpy2D(bufs, dims.iProjU*sizeof(float), ((float*)data.ptr)+pitch*dims.iProjAngles*i, data.pitch, dims.iProjU*sizeof(float), dims.iProjAngles, cudaMemcpyDeviceToHost);
+
+ char fname[512];
+ sprintf(fname, filespec, i);
+ saveImage(fname, dims.iProjAngles, dims.iProjU, bufs, fMin, fMax);
+ }
+}
+
+void dumpProjections(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+ float* bufp = new float[dims.iProjV*dims.iProjU];
+ unsigned int pitch = data.pitch / sizeof(float);
+
+ for (int i = 0; i < dims.iProjAngles; ++i) {
+ for (int j = 0; j < dims.iProjV; ++j) {
+ cudaMemcpy(bufp+dims.iProjU*j, ((float*)data.ptr)+pitch*dims.iProjAngles*j+pitch*i, dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[512];
+ sprintf(fname, filespec, i);
+ saveImage(fname, dims.iProjV, dims.iProjU, bufp, fMin, fMax);
+ }
+}
+
+
+
+
+int main()
+{
+#if 0
+ SDimensions3D dims;
+ dims.iVolX = 512;
+ dims.iVolY = 512;
+ dims.iVolZ = 512;
+ dims.iProjAngles = 180;
+ dims.iProjU = 1024;
+ dims.iProjV = 1024;
+ dims.iRaysPerDet = 1;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&volData, extentV);
+
+ cudaExtent extentP;
+ extentP.width = dims.iProjU*sizeof(float);
+ extentP.height = dims.iProjAngles;
+ extentP.depth = dims.iProjV;
+
+ cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&projData, extentP);
+ cudaMemset3D(projData, 0, extentP);
+
+#if 0
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 1.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+#if 0
+ if (i == 128) {
+ for (unsigned int j = 0; j < 256*256; ++j)
+ slice[j] = 0.0f;
+ }
+#endif
+ }
+#endif
+
+ SConeProjection angle[180];
+ angle[0].fSrcX = -1536;
+ angle[0].fSrcY = 0;
+ angle[0].fSrcZ = 0;
+
+ angle[0].fDetSX = 1024;
+ angle[0].fDetSY = -512;
+ angle[0].fDetSZ = 512;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = 1;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = -1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 180; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Src, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ ROTATE0(DetV, i, i*2*M_PI/180);
+ }
+#undef ROTATE0
+
+ astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+ //dumpSinograms("sino%03d.png", projData, dims, 0, 512);
+ //dumpProjections("proj%03d.png", projData, dims, 0, 512);
+
+ astraCUDA3d::zeroVolumeData(volData, dims);
+
+ float* angles = new float[dims.iProjAngles];
+ for (int i = 0; i < 180; ++i)
+ angles[i] = i*2*M_PI/180;
+
+ astraCUDA3d::FDK(volData, projData, 1536, 512, 0, 0, dims, angles);
+
+ dumpVolume("vol%03d.png", volData, dims, -20, 100);
+
+
+#else
+
+ SDimensions3D dims;
+ dims.iVolX = 1000;
+ dims.iVolY = 999;
+ dims.iVolZ = 500;
+ dims.iProjAngles = 376;
+ dims.iProjU = 1024;
+ dims.iProjV = 524;
+ dims.iRaysPerDet = 1;
+
+ float* angles = new float[dims.iProjAngles];
+ for (int i = 0; i < dims.iProjAngles; ++i)
+ angles[i] = -i*(M_PI)/360;
+
+ cudaPitchedPtr volData = astraCUDA3d::allocateVolumeData(dims);
+ cudaPitchedPtr projData = astraCUDA3d::allocateProjectionData(dims);
+ astraCUDA3d::zeroProjectionData(projData, dims);
+ astraCUDA3d::zeroVolumeData(volData, dims);
+
+ timeval t;
+ tic(t);
+
+ for (int i = 0; i < dims.iProjAngles; ++i) {
+ char fname[256];
+ sprintf(fname, "/home/wpalenst/tmp/Elke/proj%04d.png", i);
+ unsigned int w,h;
+ float* bufp = loadImage(fname, w,h);
+
+ int pitch = projData.pitch / sizeof(float);
+ for (int j = 0; j < dims.iProjV; ++j) {
+ cudaMemcpy(((float*)projData.ptr)+dims.iProjAngles*pitch*j+pitch*i, bufp+dims.iProjU*j, dims.iProjU*sizeof(float), cudaMemcpyHostToDevice);
+ }
+
+ delete[] bufp;
+ }
+ printf("Load time: %f\n", toc(t));
+
+ //dumpSinograms("sino%03d.png", projData, dims, -8.0f, 256.0f);
+ //astraCUDA3d::FDK(volData, projData, 7350, 62355, 0, 10, dims, angles);
+ //astraCUDA3d::FDK(volData, projData, 7350, -380, 0, 10, dims, angles);
+
+ tic(t);
+
+ astraCUDA3d::FDK(volData, projData, 7383.29867, 0, 0, 10, dims, angles);
+
+ printf("FDK time: %f\n", toc(t));
+ tic(t);
+
+ dumpVolume("vol%03d.png", volData, dims, -65.9f, 200.0f);
+ //dumpVolume("vol%03d.png", volData, dims, 0.0f, 256.0f);
+ printf("Save time: %f\n", toc(t));
+
+#endif
+
+
+}
+#endif
diff --git a/cuda/3d/fdk.h b/cuda/3d/fdk.h
new file mode 100644
index 0000000..5443b19
--- /dev/null
+++ b/cuda/3d/fdk.h
@@ -0,0 +1,43 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FDK_H
+#define _CUDA_FDK_H
+
+namespace astraCUDA3d {
+
+bool FDK(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ float fSrcOrigin, float fDetOrigin,
+ float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+ const SDimensions3D& dims, const float* angles, bool bShortScan);
+
+
+}
+
+#endif
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
new file mode 100644
index 0000000..872b1eb
--- /dev/null
+++ b/cuda/3d/par3d_bp.cu
@@ -0,0 +1,464 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "par3d_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_par3DProjTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 16;
+
+static const unsigned int g_anglesPerBlock = 64;
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+
+static const unsigned g_MaxAngles = 1024;
+
+__constant__ float gC_Cux[g_MaxAngles];
+__constant__ float gC_Cuy[g_MaxAngles];
+__constant__ float gC_Cuz[g_MaxAngles];
+__constant__ float gC_Cuc[g_MaxAngles];
+__constant__ float gC_Cvx[g_MaxAngles];
+__constant__ float gC_Cvy[g_MaxAngles];
+__constant__ float gC_Cvz[g_MaxAngles];
+__constant__ float gC_Cvc[g_MaxAngles];
+
+
+static bool bindProjDataTexture(const cudaArray* array)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_par3DProjTexture.filterMode = cudaFilterModeLinear;
+ gT_par3DProjTexture.normalized = false;
+
+ cudaBindTextureToArray(gT_par3DProjTexture, array, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+
+__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+
+ const float fCux = gC_Cux[angle];
+ const float fCuy = gC_Cuy[angle];
+ const float fCuz = gC_Cuz[angle];
+ const float fCuc = gC_Cuc[angle];
+ const float fCvx = gC_Cvx[angle];
+ const float fCvy = gC_Cvy[angle];
+ const float fCvz = gC_Cvz[angle];
+ const float fCvc = gC_Cvc[angle];
+
+ const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
+ const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
+
+ const float fU = fUNum + 1.0f;
+ const float fV = fVNum + 1.0f;
+
+ fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal;
+ }
+
+}
+
+// supersampling version
+__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+ float* volData = (float*)D_volData;
+
+ int endAngle = startAngle + g_anglesPerBlock;
+ if (endAngle > dims.iProjAngles)
+ endAngle = dims.iProjAngles;
+
+ // threadIdx: x = rel x
+ // y = rel y
+
+ // blockIdx: x = x + y
+ // y = z
+
+
+ // TO TRY: precompute part of detector intersection formulas in shared mem?
+ // TO TRY: inner loop over z, gather ray values in shared mem
+
+ const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+ const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+ if (X >= dims.iVolX)
+ return;
+ if (Y >= dims.iVolY)
+ return;
+
+ const int startZ = blockIdx.y * g_volBlockZ;
+ int endZ = startZ + g_volBlockZ;
+ if (endZ > dims.iVolZ)
+ endZ = dims.iVolZ;
+
+ float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+ float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+ float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+
+ const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
+
+ for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+ {
+
+ float fVal = 0.0f;
+ float fAngle = startAngle + 0.5f;
+
+ for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+ {
+ const float fCux = gC_Cux[angle];
+ const float fCuy = gC_Cuy[angle];
+ const float fCuz = gC_Cuz[angle];
+ const float fCuc = gC_Cuc[angle];
+ const float fCvx = gC_Cvx[angle];
+ const float fCvy = gC_Cvy[angle];
+ const float fCvz = gC_Cvz[angle];
+ const float fCvc = gC_Cvc[angle];
+
+ float fXs = fX;
+ for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) {
+ float fYs = fY;
+ for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) {
+ float fZs = fZ;
+ for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) {
+
+ const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
+ const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
+
+ const float fU = fUNum + 1.0f;
+ const float fV = fVNum + 1.0f;
+
+ fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+ fZs += fSubStep;
+ }
+ fYs += fSubStep;
+ }
+ fXs += fSubStep;
+ }
+
+ }
+
+ volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+ }
+
+}
+
+bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SPar3DProjection* angles)
+{
+ bindProjDataTexture(D_projArray);
+
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+#define DENOM (angles[i].fRayX*angles[i].fDetUY*angles[i].fDetVZ - angles[i].fRayX*angles[i].fDetUZ*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetUX*angles[i].fDetVZ + angles[i].fRayY*angles[i].fDetUZ*angles[i].fDetVX + angles[i].fRayZ*angles[i].fDetUX*angles[i].fDetVY - angles[i].fRayZ*angles[i].fDetUY*angles[i].fDetVX)
+
+
+ TRANSFER_TO_CONSTANT( ( - (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)) / DENOM , Cux );
+ TRANSFER_TO_CONSTANT( ( (angles[i].fRayX*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVX)) / DENOM , Cuy );
+ TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetVX) ) / DENOM , Cuz );
+ TRANSFER_TO_CONSTANT( (-(angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fRayX + (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)*angles[i].fDetSX - (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetVX) / DENOM , Cuc );
+
+ TRANSFER_TO_CONSTANT( ((angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY) ) / DENOM , Cvx );
+ TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUX) ) / DENOM , Cvy );
+ TRANSFER_TO_CONSTANT( ((angles[i].fRayX*angles[i].fDetUY - angles[i].fRayY*angles[i].fDetUX) ) / DENOM , Cvz );
+ TRANSFER_TO_CONSTANT( ((angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fRayX - (angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY)*angles[i].fDetSX + (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetUX ) / DENOM , Cvc );
+
+#undef TRANSFER_TO_CONSTANT
+#undef DENOM
+
+ delete[] tmp;
+
+ dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+ dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+ // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
+ if (dims.iRaysPerVoxelDim == 1)
+ dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+ else
+ dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+ }
+
+ cudaTextForceKernelsCompletion();
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+bool Par3DBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles)
+{
+ // transfer projections to array
+
+ cudaArray* cuArray = allocateProjectionArray(dims);
+ transferProjectionsToArray(D_projData, cuArray, dims);
+
+ bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+int main()
+{
+ SDimensions3D dims;
+ dims.iVolX = 256;
+ dims.iVolY = 256;
+ dims.iVolZ = 256;
+ dims.iProjAngles = 180;
+ dims.iProjU = 512;
+ dims.iProjV = 512;
+ dims.iRaysPerDet = 1;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&volData, extentV);
+
+ cudaExtent extentP;
+ extentP.width = dims.iProjU*sizeof(float);
+ extentP.height = dims.iProjAngles;
+ extentP.depth = dims.iProjV;
+
+ cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+ cudaMalloc3D(&projData, extentP);
+ cudaMemset3D(projData, 0, extentP);
+
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 1.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+#if 0
+ if (i == 128) {
+ for (unsigned int j = 0; j < 256*256; ++j)
+ slice[j] = 0.0f;
+ }
+#endif
+ }
+
+
+ SPar3DProjection angle[180];
+ angle[0].fRayX = 1;
+ angle[0].fRayY = 0;
+ angle[0].fRayZ = 0;
+
+ angle[0].fDetSX = 512;
+ angle[0].fDetSY = -256;
+ angle[0].fDetSZ = -256;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = 1;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = 1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 180; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Ray, i, i*2*M_PI/180);
+ ROTATE0(DetS, i, i*2*M_PI/180);
+ ROTATE0(DetU, i, i*2*M_PI/180);
+ ROTATE0(DetV, i, i*2*M_PI/180);
+ }
+#undef ROTATE0
+
+ astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f);
+#if 1
+ float* bufs = new float[180*512];
+
+ for (int i = 0; i < 512; ++i) {
+ cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+ char fname[20];
+ sprintf(fname, "sino%03d.png", i);
+ saveImage(fname, 180, 512, bufs, 0, 512);
+ }
+
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 180; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "proj%03d.png", i);
+ saveImage(fname, 512, 512, bufp, 0, 512);
+ }
+#endif
+ for (unsigned int i = 0; i < 256*256; ++i)
+ slice[i] = 0.0f;
+ for (unsigned int i = 0; i < 256; ++i) {
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+ }
+
+ astraCUDA3d::Par3DBP(volData, projData, dims, angle);
+#if 1
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize);
+
+ char fname[20];
+ sprintf(fname, "vol%03d.png", i);
+ saveImage(fname, 256, 256, buf, 0, 60000);
+ }
+#endif
+
+}
+#endif
diff --git a/cuda/3d/par3d_bp.h b/cuda/3d/par3d_bp.h
new file mode 100644
index 0000000..399a3cb
--- /dev/null
+++ b/cuda/3d/par3d_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR3D_BP_H
+#define _CUDA_PAR3D_BP_H
+
+namespace astraCUDA3d {
+
+_AstraExport bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+ cudaArray *D_projArray,
+ const SDimensions3D& dims, const SPar3DProjection* angles);
+
+_AstraExport bool Par3DBP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles);
+
+
+}
+
+#endif
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
new file mode 100644
index 0000000..6bf9037
--- /dev/null
+++ b/cuda/3d/par3d_fp.cu
@@ -0,0 +1,814 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_par3DVolumeTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 64;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_RayX[g_MaxAngles];
+__constant__ float gC_RayY[g_MaxAngles];
+__constant__ float gC_RayZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+static bool bindVolumeDataTexture(const cudaArray* array)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+ gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+ gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+ gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+ gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear;
+ gT_par3DVolumeTexture.normalized = false;
+
+ cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc);
+
+ // TODO: error value?
+
+ return true;
+}
+
+
+
+// threadIdx: x = u detector
+// y = relative angle
+// blockIdx: x = u/v detector
+// y = angle block
+
+#define PAR3D_FP_BODY(c0,c1,c2) \
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+ if (angle >= endAngle) \
+ return; \
+ \
+ const float fRayX = gC_RayX[angle]; \
+ const float fRayY = gC_RayY[angle]; \
+ const float fRayZ = gC_RayZ[angle]; \
+ const float fDetUX = gC_DetUX[angle]; \
+ const float fDetUY = gC_DetUY[angle]; \
+ const float fDetUZ = gC_DetUZ[angle]; \
+ const float fDetVX = gC_DetVX[angle]; \
+ const float fDetVY = gC_DetVY[angle]; \
+ const float fDetVZ = gC_DetVZ[angle]; \
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+ \
+ \
+ \
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+ int endDetectorV = startDetectorV + g_detBlockV; \
+ if (endDetectorV > dims.iProjV) \
+ endDetectorV = dims.iProjV; \
+ \
+ int endSlice = startSlice + g_blockSlices; \
+ if (endSlice > dims.iVol##c0) \
+ endSlice = dims.iVol##c0; \
+ \
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+ { \
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */ \
+ /* X = startSlice to X = endSlice */ \
+ \
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+ \
+ /* (x) ( 1) ( 0) */ \
+ /* ray: (y) = (ay) * x + (by) */ \
+ /* (z) (az) (bz) */ \
+ \
+ const float a##c1 = fRay##c1 / fRay##c0; \
+ const float a##c2 = fRay##c2 / fRay##c0; \
+ const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+ const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+ \
+ const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+ \
+ float fVal = 0.0f; \
+ \
+ float f##c0 = startSlice + 1.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ \
+ for (int s = startSlice; s < endSlice; ++s) \
+ { \
+ fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \
+ f##c0 += 1.0f; \
+ f##c1 += a##c1; \
+ f##c2 += a##c2; \
+ } \
+ \
+ fVal *= fDistCorr; \
+ \
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+ }
+
+
+
+// Supersampling version
+#define PAR3D_FP_SS_BODY(c0,c1,c2) \
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+ if (angle >= endAngle) \
+ return; \
+ \
+ const float fRayX = gC_RayX[angle]; \
+ const float fRayY = gC_RayY[angle]; \
+ const float fRayZ = gC_RayZ[angle]; \
+ const float fDetUX = gC_DetUX[angle]; \
+ const float fDetUY = gC_DetUY[angle]; \
+ const float fDetUZ = gC_DetUZ[angle]; \
+ const float fDetVX = gC_DetVX[angle]; \
+ const float fDetVY = gC_DetVY[angle]; \
+ const float fDetVZ = gC_DetVZ[angle]; \
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+ \
+ \
+ \
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+ int endDetectorV = startDetectorV + g_detBlockV; \
+ if (endDetectorV > dims.iProjV) \
+ endDetectorV = dims.iProjV; \
+ \
+ int endSlice = startSlice + g_blockSlices; \
+ if (endSlice > dims.iVol##c0) \
+ endSlice = dims.iVol##c0; \
+ \
+ const float fSubStep = 1.0f/dims.iRaysPerDetDim; \
+ \
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+ { \
+ \
+ float fV = 0.0f; \
+ \
+ float fdU = detectorU - 0.5f + 0.5f*fSubStep; \
+ for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \
+ float fdV = detectorV - 0.5f + 0.5f*fSubStep; \
+ for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \
+ \
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */ \
+ /* X = startSlice to X = endSlice */ \
+ \
+ const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \
+ const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \
+ const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \
+ \
+ /* (x) ( 1) ( 0) */ \
+ /* ray: (y) = (ay) * x + (by) */ \
+ /* (z) (az) (bz) */ \
+ \
+ const float a##c1 = fRay##c1 / fRay##c0; \
+ const float a##c2 = fRay##c2 / fRay##c0; \
+ const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+ const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+ \
+ const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+ \
+ float fVal = 0.0f; \
+ \
+ float f##c0 = startSlice + 1.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ \
+ for (int s = startSlice; s < endSlice; ++s) \
+ { \
+ fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \
+ f##c0 += 1.0f; \
+ f##c1 += a##c1; \
+ f##c2 += a##c2; \
+ } \
+ \
+ fVal *= fDistCorr; \
+ fV += fVal; \
+ \
+ } \
+ } \
+ \
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\
+ }
+
+
+
+__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Z,X,Y)
+}
+
+__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Z,X,Y)
+}
+
+
+__device__ float dirWeights(float fX, float fN) {
+ if (fX <= 0.5f) // outside image on left
+ return 0.0f;
+ if (fX <= 1.5f) // half outside image on left
+ return (fX - 0.5f) * (fX - 0.5f);
+ if (fX <= fN + 0.5f) { // inside image
+ float t = fX - 0.5f - floorf(fX - 0.5f);
+ return t*t + (1-t)*(1-t);
+ }
+ if (fX <= fN + 1.5f) // half outside image on right
+ return (fN + 1.5f - fX) * (fN + 1.5f - fX);
+ return 0.0f; // outside image on right
+}
+
+#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \
+ int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+ if (angle >= endAngle) \
+ return; \
+ \
+ const float fRayX = gC_RayX[angle]; \
+ const float fRayY = gC_RayY[angle]; \
+ const float fRayZ = gC_RayZ[angle]; \
+ const float fDetUX = gC_DetUX[angle]; \
+ const float fDetUY = gC_DetUY[angle]; \
+ const float fDetUZ = gC_DetUZ[angle]; \
+ const float fDetVX = gC_DetVX[angle]; \
+ const float fDetVY = gC_DetVY[angle]; \
+ const float fDetVZ = gC_DetVZ[angle]; \
+ const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+ const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+ const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+ \
+ \
+ \
+ const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+ const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+ int endDetectorV = startDetectorV + g_detBlockV; \
+ if (endDetectorV > dims.iProjV) \
+ endDetectorV = dims.iProjV; \
+ \
+ int endSlice = startSlice + g_blockSlices; \
+ if (endSlice > dims.iVol##c0) \
+ endSlice = dims.iVol##c0; \
+ \
+ for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+ { \
+ /* Trace ray in direction Ray to (detectorU,detectorV) from */ \
+ /* X = startSlice to X = endSlice */ \
+ \
+ const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+ const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+ const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+ \
+ /* (x) ( 1) ( 0) */ \
+ /* ray: (y) = (ay) * x + (by) */ \
+ /* (z) (az) (bz) */ \
+ \
+ const float a##c1 = fRay##c1 / fRay##c0; \
+ const float a##c2 = fRay##c2 / fRay##c0; \
+ const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+ const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+ \
+ const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+ \
+ float fVal = 0.0f; \
+ \
+ float f##c0 = startSlice + 1.5f; \
+ float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
+ float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+ \
+ for (int s = startSlice; s < endSlice; ++s) \
+ { \
+ fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \
+ f##c0 += 1.0f; \
+ f##c1 += a##c1; \
+ f##c2 += a##c2; \
+ } \
+ \
+ D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+ }
+
+// Supersampling version
+// TODO
+
+
+__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(X,Y,Z)
+}
+
+__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Y,X,Z)
+}
+
+__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Z,X,Y)
+}
+
+
+
+bool Par3DFP_Array(cudaArray *D_volArray,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale)
+{
+
+ bindVolumeDataTexture(D_volArray);
+
+
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(RayX);
+ TRANSFER_TO_CONSTANT(RayY);
+ TRANSFER_TO_CONSTANT(RayZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ int dir;
+ if (a != dims.iProjAngles) {
+ float dX = fabsf(angles[a].fRayX);
+ float dY = fabsf(angles[a].fRayY);
+ float dZ = fabsf(angles[a].fRayZ);
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == dims.iProjAngles || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+ par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaTextForceKernelsCompletion();
+
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+bool Par3DFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale)
+{
+ // transfer volume to array
+ cudaArray* cuArray = allocateVolumeArray(dims);
+ transferVolumeToArray(D_volumeData, cuArray, dims);
+
+ bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale);
+
+ cudaFreeArray(cuArray);
+
+ return ret;
+}
+
+
+
+bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale)
+{
+ // transfer angles to constant memory
+ float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+ TRANSFER_TO_CONSTANT(RayX);
+ TRANSFER_TO_CONSTANT(RayY);
+ TRANSFER_TO_CONSTANT(RayZ);
+ TRANSFER_TO_CONSTANT(DetSX);
+ TRANSFER_TO_CONSTANT(DetSY);
+ TRANSFER_TO_CONSTANT(DetSZ);
+ TRANSFER_TO_CONSTANT(DetUX);
+ TRANSFER_TO_CONSTANT(DetUY);
+ TRANSFER_TO_CONSTANT(DetUZ);
+ TRANSFER_TO_CONSTANT(DetVX);
+ TRANSFER_TO_CONSTANT(DetVY);
+ TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+ delete[] tmp;
+
+ std::list<cudaStream_t> streams;
+ dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+ // Run over all angles, grouping them into groups of the same
+ // orientation (roughly horizontal vs. roughly vertical).
+ // Start a stream of grids for each such group.
+
+ unsigned int blockStart = 0;
+ unsigned int blockEnd = 0;
+ int blockDirection = 0;
+
+ // timeval t;
+ // tic(t);
+
+ for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+ int dir;
+ if (a != dims.iProjAngles) {
+ float dX = fabsf(angles[a].fRayX);
+ float dY = fabsf(angles[a].fRayY);
+ float dZ = fabsf(angles[a].fRayZ);
+
+ if (dX >= dY && dX >= dZ)
+ dir = 0;
+ else if (dY >= dX && dY >= dZ)
+ dir = 1;
+ else
+ dir = 2;
+ }
+
+ if (a == dims.iProjAngles || dir != blockDirection) {
+ // block done
+
+ blockEnd = a;
+ if (blockStart != blockEnd) {
+
+ dim3 dimGrid(
+ ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+ // TODO: check if we can't immediately
+ // destroy the stream after use
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ streams.push_back(stream);
+
+ // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+ if (blockDirection == 0) {
+ for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ } else if (blockDirection == 1) {
+ for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ } else if (blockDirection == 2) {
+ for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+ if (dims.iRaysPerDetDim == 1)
+ par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+ else
+#if 0
+ par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+ assert(false);
+#endif
+ }
+
+ }
+
+ blockDirection = dir;
+ blockStart = a;
+ }
+ }
+
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+ cudaStreamDestroy(*iter);
+
+ streams.clear();
+
+ cudaTextForceKernelsCompletion();
+
+
+ // printf("%f\n", toc(t));
+
+ return true;
+}
+
+
+
+
+
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+
+int main()
+{
+ cudaSetDevice(1);
+
+
+ SDimensions3D dims;
+ dims.iVolX = 500;
+ dims.iVolY = 500;
+ dims.iVolZ = 81;
+ dims.iProjAngles = 241;
+ dims.iProjU = 600;
+ dims.iProjV = 100;
+ dims.iRaysPerDet = 1;
+
+ SPar3DProjection base;
+ base.fRayX = 1.0f;
+ base.fRayY = 0.0f;
+ base.fRayZ = 0.1f;
+
+ base.fDetSX = 0.0f;
+ base.fDetSY = -300.0f;
+ base.fDetSZ = -50.0f;
+
+ base.fDetUX = 0.0f;
+ base.fDetUY = 1.0f;
+ base.fDetUZ = 0.0f;
+
+ base.fDetVX = 0.0f;
+ base.fDetVY = 0.0f;
+ base.fDetVZ = 1.0f;
+
+ SPar3DProjection angle[dims.iProjAngles];
+
+ cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+ volData = allocateVolumeData(dims);
+
+ cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+ projData = allocateProjectionData(dims);
+
+ unsigned int ix = 500,iy = 500;
+
+ float* buf = new float[dims.iProjU*dims.iProjV];
+
+ float* slice = new float[dims.iVolX*dims.iVolY];
+ for (int i = 0; i < dims.iVolX*dims.iVolY; ++i)
+ slice[i] = 1.0f;
+
+ for (unsigned int a = 0; a < 241; a += dims.iProjAngles) {
+
+ zeroProjectionData(projData, dims);
+
+ for (int y = 0; y < iy; y += dims.iVolY) {
+ for (int x = 0; x < ix; x += dims.iVolX) {
+
+ timeval st;
+ tic(st);
+
+ for (int z = 0; z < dims.iVolZ; ++z) {
+// char sfn[256];
+// sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z);
+// float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY);
+
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = dims.iVolX*sizeof(float);
+ ptr.xsize = dims.iVolX*sizeof(float);
+ ptr.ysize = dims.iVolY;
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, z };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaError err = cudaMemcpy3D(&p);
+ assert(!err);
+// delete[] slice;
+ }
+
+ printf("Load: %f\n", toc(st));
+
+#if 0
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaPitchedPtr t;
+ t.ptr = new float[1024*1024];
+ t.pitch = 1024*4;
+ t.xsize = 1024*4;
+ t.ysize = 1024;
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = volData;
+ p.extent = extentS;
+ p.dstArray = 0;
+ p.dstPtr = t;
+ p.dstPos = zp;
+ p.kind = cudaMemcpyDeviceToHost;
+ cudaError err = cudaMemcpy3D(&p);
+ assert(!err);
+
+ char fn[32];
+ sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY);
+ saveImage(fn, 1024, 1024, (float*)t.ptr);
+ saveImage("s.png", 4096, 4096, slice);
+ delete[] (float*)t.ptr;
+#endif
+
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0)
+#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0)
+ for (int i = 0; i < dims.iProjAngles; ++i) {
+ ROTATE0(Ray, i, (a+i)*.8*M_PI/180);
+ ROTATE0(DetS, i, (a+i)*.8*M_PI/180);
+ ROTATE0(DetU, i, (a+i)*.8*M_PI/180);
+ ROTATE0(DetV, i, (a+i)*.8*M_PI/180);
+
+
+// SHIFT(Src, i, (-x+1536), (-y+1536));
+// SHIFT(DetS, i, (-x+1536), (-y+1536));
+ }
+#undef ROTATE0
+#undef SHIFT
+ tic(st);
+
+ astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f);
+
+ printf("FP: %f\n", toc(st));
+
+ }
+ }
+ for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) {
+ for (unsigned int v = 0; v < dims.iProjV; ++v)
+ cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[32];
+ sprintf(fname, "proj%03d.png", a+aa);
+ saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f);
+ }
+ }
+
+ delete[] buf;
+
+}
+#endif
diff --git a/cuda/3d/par3d_fp.h b/cuda/3d/par3d_fp.h
new file mode 100644
index 0000000..7208361
--- /dev/null
+++ b/cuda/3d/par3d_fp.h
@@ -0,0 +1,51 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR3D_FP_H
+#define _CUDA_PAR3D_FP_H
+
+namespace astraCUDA3d {
+
+_AstraExport bool Par3DFP_Array(cudaArray *D_volArray,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale);
+
+_AstraExport bool Par3DFP(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale);
+
+_AstraExport bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+ cudaPitchedPtr D_projData,
+ const SDimensions3D& dims, const SPar3DProjection* angles,
+ float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu
new file mode 100644
index 0000000..f615204
--- /dev/null
+++ b/cuda/3d/sirt3d.cu
@@ -0,0 +1,533 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sirt3d.h"
+#include "util3d.h"
+#include "arith3d.h"
+#include "cone_fp.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA3d {
+
+SIRT::SIRT() : ReconAlgo3D()
+{
+ D_maskData.ptr = 0;
+ D_smaskData.ptr = 0;
+
+ D_sinoData.ptr = 0;
+ D_volumeData.ptr = 0;
+
+ D_projData.ptr = 0;
+ D_tmpData.ptr = 0;
+
+ D_lineWeight.ptr = 0;
+ D_pixelWeight.ptr = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ useMinConstraint = false;
+ useMaxConstraint = false;
+}
+
+
+SIRT::~SIRT()
+{
+ reset();
+}
+
+void SIRT::reset()
+{
+ cudaFree(D_projData.ptr);
+ cudaFree(D_tmpData.ptr);
+ cudaFree(D_lineWeight.ptr);
+ cudaFree(D_pixelWeight.ptr);
+
+ D_maskData.ptr = 0;
+ D_smaskData.ptr = 0;
+
+ D_sinoData.ptr = 0;
+ D_volumeData.ptr = 0;
+
+ D_projData.ptr = 0;
+ D_tmpData.ptr = 0;
+
+ D_lineWeight.ptr = 0;
+ D_pixelWeight.ptr = 0;
+
+ useVolumeMask = false;
+ useSinogramMask = false;
+
+ ReconAlgo3D::reset();
+}
+
+bool SIRT::enableVolumeMask()
+{
+ useVolumeMask = true;
+ return true;
+}
+
+bool SIRT::enableSinogramMask()
+{
+ useSinogramMask = true;
+ return true;
+}
+
+
+bool SIRT::init()
+{
+ D_pixelWeight = allocateVolumeData(dims);
+ zeroVolumeData(D_pixelWeight, dims);
+
+ D_tmpData = allocateVolumeData(dims);
+ zeroVolumeData(D_tmpData, dims);
+
+ D_projData = allocateProjectionData(dims);
+ zeroProjectionData(D_projData, dims);
+
+ D_lineWeight = allocateProjectionData(dims);
+ zeroProjectionData(D_lineWeight, dims);
+
+ // We can't precompute lineWeights and pixelWeights when using a mask
+ if (!useVolumeMask && !useSinogramMask)
+ precomputeWeights();
+
+ // TODO: check if allocations succeeded
+ return true;
+}
+
+bool SIRT::setMinConstraint(float fMin)
+{
+ fMinConstraint = fMin;
+ useMinConstraint = true;
+ return true;
+}
+
+bool SIRT::setMaxConstraint(float fMax)
+{
+ fMaxConstraint = fMax;
+ useMaxConstraint = true;
+ return true;
+}
+
+bool SIRT::precomputeWeights()
+{
+ zeroProjectionData(D_lineWeight, dims);
+ if (useVolumeMask) {
+ callFP(D_maskData, D_lineWeight, 1.0f);
+ } else {
+ processVol3D<opSet>(D_tmpData, 1.0f, dims);
+ callFP(D_tmpData, D_lineWeight, 1.0f);
+ }
+ processSino3D<opInvert>(D_lineWeight, dims);
+
+ if (useSinogramMask) {
+ // scale line weights with sinogram mask to zero out masked sinogram pixels
+ processSino3D<opMul>(D_lineWeight, D_smaskData, dims);
+ }
+
+ zeroVolumeData(D_pixelWeight, dims);
+
+ if (useSinogramMask) {
+ callBP(D_pixelWeight, D_smaskData);
+ } else {
+ processSino3D<opSet>(D_projData, 1.0f, dims);
+ callBP(D_pixelWeight, D_projData);
+ }
+#if 0
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 180; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "ray%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+
+#if 0
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[20];
+ sprintf(fname, "pix%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+ processVol3D<opInvert>(D_pixelWeight, dims);
+
+ if (useVolumeMask) {
+ // scale pixel weights with mask to zero out masked pixels
+ processVol3D<opMul>(D_pixelWeight, D_maskData, dims);
+ }
+
+ return true;
+}
+
+
+bool SIRT::setVolumeMask(cudaPitchedPtr& _D_maskData)
+{
+ assert(useVolumeMask);
+
+ D_maskData = _D_maskData;
+
+ return true;
+}
+
+bool SIRT::setSinogramMask(cudaPitchedPtr& _D_smaskData)
+{
+ assert(useSinogramMask);
+
+ D_smaskData = _D_smaskData;
+
+ return true;
+}
+
+bool SIRT::setBuffers(cudaPitchedPtr& _D_volumeData,
+ cudaPitchedPtr& _D_projData)
+{
+ D_volumeData = _D_volumeData;
+ D_sinoData = _D_projData;
+
+ fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr);
+
+ return true;
+}
+
+bool SIRT::iterate(unsigned int iterations)
+{
+ shouldAbort = false;
+
+ if (useVolumeMask || useSinogramMask)
+ precomputeWeights();
+
+#if 0
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[20];
+ sprintf(fname, "pix%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+#if 0
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)D_lineWeight.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "ray%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+
+
+ // iteration
+ for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+ // copy sinogram to projection data
+ duplicateProjectionData(D_projData, D_sinoData, dims);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ duplicateVolumeData(D_tmpData, D_volumeData, dims);
+ processVol3D<opMul>(D_tmpData, D_maskData, dims);
+ callFP(D_tmpData, D_projData, -1.0f);
+ } else {
+ callFP(D_volumeData, D_projData, -1.0f);
+ }
+
+ processSino3D<opMul>(D_projData, D_lineWeight, dims);
+
+ zeroVolumeData(D_tmpData, dims);
+#if 0
+ float* bufp = new float[512*512];
+ printf("Dumping projData: %p\n", (void*)D_projData.ptr);
+ for (int i = 0; i < 180; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "diff%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+
+
+ callBP(D_tmpData, D_projData);
+#if 0
+ printf("Dumping tmpData: %p\n", (void*)D_tmpData.ptr);
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)D_tmpData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[20];
+ sprintf(fname, "add%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+
+
+ processVol3D<opAddMul>(D_volumeData, D_tmpData, D_pixelWeight, dims);
+
+ if (useMinConstraint)
+ processVol3D<opClampMin>(D_volumeData, fMinConstraint, dims);
+ if (useMaxConstraint)
+ processVol3D<opClampMax>(D_volumeData, fMaxConstraint, dims);
+ }
+
+ return true;
+}
+
+float SIRT::computeDiffNorm()
+{
+ // copy sinogram to projection data
+ duplicateProjectionData(D_projData, D_sinoData, dims);
+
+ // do FP, subtracting projection from sinogram
+ if (useVolumeMask) {
+ duplicateVolumeData(D_tmpData, D_volumeData, dims);
+ processVol3D<opMul>(D_tmpData, D_maskData, dims);
+ callFP(D_tmpData, D_projData, -1.0f);
+ } else {
+ callFP(D_volumeData, D_projData, -1.0f);
+ }
+
+ float s = dotProduct3D(D_projData, dims.iProjU, dims.iProjAngles, dims.iProjV);
+ return sqrt(s);
+}
+
+
+bool doSIRT(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_sinoData,
+ cudaPitchedPtr& D_maskData,
+ const SDimensions3D& dims, const SConeProjection* angles,
+ unsigned int iterations)
+{
+ SIRT sirt;
+ bool ok = true;
+
+ ok &= sirt.setConeGeometry(dims, angles);
+ if (D_maskData.ptr)
+ ok &= sirt.enableVolumeMask();
+
+ if (!ok)
+ return false;
+
+ ok = sirt.init();
+ if (!ok)
+ return false;
+
+ if (D_maskData.ptr)
+ ok &= sirt.setVolumeMask(D_maskData);
+
+ ok &= sirt.setBuffers(D_volumeData, D_sinoData);
+ if (!ok)
+ return false;
+
+ ok = sirt.iterate(iterations);
+
+ return ok;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+
+int main()
+{
+ SDimensions3D dims;
+ dims.iVolX = 256;
+ dims.iVolY = 256;
+ dims.iVolZ = 256;
+ dims.iProjAngles = 100;
+ dims.iProjU = 512;
+ dims.iProjV = 512;
+ dims.iRaysPerDet = 1;
+
+ SConeProjection angle[100];
+ angle[0].fSrcX = -2905.6;
+ angle[0].fSrcY = 0;
+ angle[0].fSrcZ = 0;
+
+ angle[0].fDetSX = 694.4;
+ angle[0].fDetSY = -122.4704;
+ angle[0].fDetSZ = -122.4704;
+
+ angle[0].fDetUX = 0;
+ angle[0].fDetUY = .4784;
+ //angle[0].fDetUY = .5;
+ angle[0].fDetUZ = 0;
+
+ angle[0].fDetVX = 0;
+ angle[0].fDetVY = 0;
+ angle[0].fDetVZ = .4784;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+ for (int i = 1; i < 100; ++i) {
+ angle[i] = angle[0];
+ ROTATE0(Src, i, i*2*M_PI/100);
+ ROTATE0(DetS, i, i*2*M_PI/100);
+ ROTATE0(DetU, i, i*2*M_PI/100);
+ ROTATE0(DetV, i, i*2*M_PI/100);
+ }
+#undef ROTATE0
+
+
+ cudaPitchedPtr volData = allocateVolumeData(dims);
+ cudaPitchedPtr projData = allocateProjectionData(dims);
+ zeroProjectionData(projData, dims);
+
+ float* pbuf = new float[100*512*512];
+ copyProjectionsFromDevice(pbuf, projData, dims);
+ copyProjectionsToDevice(pbuf, projData, dims);
+ delete[] pbuf;
+
+#if 0
+ float* slice = new float[256*256];
+ cudaPitchedPtr ptr;
+ ptr.ptr = slice;
+ ptr.pitch = 256*sizeof(float);
+ ptr.xsize = 256*sizeof(float);
+ ptr.ysize = 256;
+
+ for (unsigned int i = 0; i < 256; ++i) {
+ for (unsigned int y = 0; y < 256; ++y)
+ for (unsigned int x = 0; x < 256; ++x)
+ slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f;
+
+ cudaExtent extentS;
+ extentS.width = dims.iVolX*sizeof(float);
+ extentS.height = dims.iVolY;
+ extentS.depth = 1;
+ cudaPos sp = { 0, 0, 0 };
+ cudaPos dp = { 0, 0, i };
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = sp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = dp;
+ p.dstPtr = volData;
+ p.extent = extentS;
+ p.kind = cudaMemcpyHostToDevice;
+ cudaMemcpy3D(&p);
+ }
+ astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+#else
+
+ for (int i = 0; i < 100; ++i) {
+ char fname[32];
+ sprintf(fname, "Tiffs/%04d.png", 4*i);
+ unsigned int w,h;
+ float* bufp = loadImage(fname, w,h);
+
+ for (int j = 0; j < 512*512; ++j) {
+ float v = bufp[j];
+ if (v > 236.0f) v = 236.0f;
+ v = logf(236.0f / v);
+ bufp[j] = 256*v;
+ }
+
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice);
+ }
+
+ delete[] bufp;
+
+ }
+#endif
+
+#if 0
+ float* bufs = new float[100*512];
+
+ for (int i = 0; i < 512; ++i) {
+ cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+ printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+ char fname[20];
+ sprintf(fname, "sino%03d.png", i);
+ saveImage(fname, 100, 512, bufs);
+ }
+
+ float* bufp = new float[512*512];
+
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 512; ++j) {
+ cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+ }
+
+ char fname[20];
+ sprintf(fname, "proj%03d.png", i);
+ saveImage(fname, 512, 512, bufp);
+ }
+#endif
+
+ zeroVolumeData(volData, dims);
+
+ cudaPitchedPtr maskData;
+ maskData.ptr = 0;
+
+ astraCUDA3d::doSIRT(volData, projData, maskData, dims, angle, 50);
+#if 1
+ float* buf = new float[256*256];
+
+ for (int i = 0; i < 256; ++i) {
+ cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+ char fname[20];
+ sprintf(fname, "vol%03d.png", i);
+ saveImage(fname, 256, 256, buf);
+ }
+#endif
+
+ return 0;
+}
+#endif
+
diff --git a/cuda/3d/sirt3d.h b/cuda/3d/sirt3d.h
new file mode 100644
index 0000000..c3752c2
--- /dev/null
+++ b/cuda/3d/sirt3d.h
@@ -0,0 +1,118 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SIRT3D_H
+#define _CUDA_SIRT3D_H
+
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+
+class _AstraExport SIRT : public ReconAlgo3D {
+public:
+ SIRT();
+ ~SIRT();
+
+// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs);
+
+
+ bool enableVolumeMask();
+ bool enableSinogramMask();
+
+ // init should be called after setting all geometry
+ bool init();
+
+ // setVolumeMask should be called after init and before iterate,
+ // but only if enableVolumeMask was called before init.
+ // It may be called again after iterate.
+ bool setVolumeMask(cudaPitchedPtr& D_maskData);
+
+ // setSinogramMask should be called after init and before iterate,
+ // but only if enableSinogramMask was called before init.
+ // It may be called again after iterate.
+ bool setSinogramMask(cudaPitchedPtr& D_smaskData);
+
+
+ // setBuffers should be called after init and before iterate.
+ // It may be called again after iterate.
+ bool setBuffers(cudaPitchedPtr& D_volumeData,
+ cudaPitchedPtr& D_projData);
+
+
+ // set Min/Max constraints. They may be called at any time, and will affect
+ // any iterate() calls afterwards.
+ bool setMinConstraint(float fMin);
+ bool setMaxConstraint(float fMax);
+
+ // iterate should be called after init and setBuffers.
+ // It may be called multiple times.
+ bool iterate(unsigned int iterations);
+
+ // Compute the norm of the difference of the FP of the current reconstruction
+ // and the sinogram. (This performs one FP.)
+ // It can be called after iterate.
+ float computeDiffNorm();
+
+protected:
+ void reset();
+ bool precomputeWeights();
+
+ bool useVolumeMask;
+ bool useSinogramMask;
+
+ bool useMinConstraint;
+ bool useMaxConstraint;
+ float fMinConstraint;
+ float fMaxConstraint;
+
+ cudaPitchedPtr D_maskData;
+ cudaPitchedPtr D_smaskData;
+
+ // Input/output
+ cudaPitchedPtr D_sinoData;
+ cudaPitchedPtr D_volumeData;
+
+ // Temporary buffers
+ cudaPitchedPtr D_projData;
+ cudaPitchedPtr D_tmpData;
+
+ // Geometry-specific precomputed data
+ cudaPitchedPtr D_lineWeight;
+ cudaPitchedPtr D_pixelWeight;
+};
+
+bool doSIRT(cudaPitchedPtr D_volumeData, unsigned int volumePitch,
+ cudaPitchedPtr D_projData, unsigned int projPitch,
+ cudaPitchedPtr D_maskData, unsigned int maskPitch,
+ const SDimensions3D& dims, const SConeProjection* projs,
+ unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
new file mode 100644
index 0000000..81ea823
--- /dev/null
+++ b/cuda/3d/util3d.cu
@@ -0,0 +1,514 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include "util3d.h"
+#include "../2d/util.h"
+
+namespace astraCUDA3d {
+
+
+cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
+{
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPitchedPtr volData;
+
+ cudaError err = cudaMalloc3D(&volData, extentV);
+ if (err != cudaSuccess) {
+ astraCUDA::reportCudaError(err);
+ fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ);
+ volData.ptr = 0;
+ // TODO: return 0 somehow?
+ }
+
+ return volData;
+}
+cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
+{
+ cudaExtent extentP;
+ extentP.width = dims.iProjU*sizeof(float);
+ extentP.height = dims.iProjAngles;
+ extentP.depth = dims.iProjV;
+
+ cudaPitchedPtr projData;
+
+ cudaError err = cudaMalloc3D(&projData, extentP);
+ if (err != cudaSuccess) {
+ astraCUDA::reportCudaError(err);
+ fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
+ projData.ptr = 0;
+ // TODO: return 0 somehow?
+ }
+
+ return projData;
+}
+bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
+{
+ char* t = (char*)D_data.ptr;
+ cudaError err;
+
+ for (unsigned int z = 0; z < dims.iVolZ; ++z) {
+ err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
+ ASTRA_CUDA_ASSERT(err);
+ t += D_data.pitch * dims.iVolY;
+ }
+ return true;
+}
+bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
+{
+ char* t = (char*)D_data.ptr;
+ cudaError err;
+
+ for (unsigned int z = 0; z < dims.iProjV; ++z) {
+ err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
+ ASTRA_CUDA_ASSERT(err);
+ t += D_data.pitch * dims.iProjAngles;
+ }
+
+ return true;
+}
+bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+ if (!pitch)
+ pitch = dims.iVolX;
+
+ cudaPitchedPtr ptr;
+ ptr.ptr = (void*)data; // const cast away
+ ptr.pitch = pitch*sizeof(float);
+ ptr.xsize = dims.iVolX*sizeof(float);
+ ptr.ysize = dims.iVolY;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = D_data;
+ p.extent = extentV;
+ p.kind = cudaMemcpyHostToDevice;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+
+bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+ if (!pitch)
+ pitch = dims.iProjU;
+
+ cudaPitchedPtr ptr;
+ ptr.ptr = (void*)data; // const cast away
+ ptr.pitch = pitch*sizeof(float);
+ ptr.xsize = dims.iProjU*sizeof(float);
+ ptr.ysize = dims.iProjAngles;
+
+ cudaExtent extentV;
+ extentV.width = dims.iProjU*sizeof(float);
+ extentV.height = dims.iProjAngles;
+ extentV.depth = dims.iProjV;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = ptr;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = D_data;
+ p.extent = extentV;
+ p.kind = cudaMemcpyHostToDevice;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+
+bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+ if (!pitch)
+ pitch = dims.iVolX;
+
+ cudaPitchedPtr ptr;
+ ptr.ptr = data;
+ ptr.pitch = pitch*sizeof(float);
+ ptr.xsize = dims.iVolX*sizeof(float);
+ ptr.ysize = dims.iVolY;
+
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_data;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = ptr;
+ p.extent = extentV;
+ p.kind = cudaMemcpyDeviceToHost;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+ if (!pitch)
+ pitch = dims.iProjU;
+
+ cudaPitchedPtr ptr;
+ ptr.ptr = data;
+ ptr.pitch = pitch*sizeof(float);
+ ptr.xsize = dims.iProjU*sizeof(float);
+ ptr.ysize = dims.iProjAngles;
+
+ cudaExtent extentV;
+ extentV.width = dims.iProjU*sizeof(float);
+ extentV.height = dims.iProjAngles;
+ extentV.depth = dims.iProjV;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_data;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = ptr;
+ p.extent = extentV;
+ p.kind = cudaMemcpyDeviceToHost;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+
+bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
+{
+ cudaExtent extentV;
+ extentV.width = dims.iVolX*sizeof(float);
+ extentV.height = dims.iVolY;
+ extentV.depth = dims.iVolZ;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_src;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = D_dst;
+ p.extent = extentV;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
+{
+ cudaExtent extentV;
+ extentV.width = dims.iProjU*sizeof(float);
+ extentV.height = dims.iProjAngles;
+ extentV.depth = dims.iProjV;
+
+ cudaPos zp = { 0, 0, 0 };
+
+ cudaMemcpy3DParms p;
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_src;
+ p.dstArray = 0;
+ p.dstPos = zp;
+ p.dstPtr = D_dst;
+ p.extent = extentV;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ cudaError err;
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ return err == cudaSuccess;
+}
+
+
+
+// TODO: Consider using a single array of size max(proj,volume) (per dim)
+// instead of allocating a new one each time
+
+// TODO: Figure out a faster way of zeroing the padding?
+
+cudaArray* allocateVolumeArray(const SDimensions3D& dims)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaArray* cuArray;
+ cudaExtent extentA;
+ extentA.width = dims.iVolX+2;
+ extentA.height = dims.iVolY+2;
+ extentA.depth = dims.iVolZ+2;
+ cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
+ if (err != cudaSuccess) {
+ astraCUDA::reportCudaError(err);
+ fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ);
+ return 0;
+ }
+
+ zeroVolumeArray(cuArray, dims);
+
+ return cuArray;
+}
+cudaArray* allocateProjectionArray(const SDimensions3D& dims)
+{
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaArray* cuArray;
+ cudaExtent extentA;
+ extentA.width = dims.iProjU+2;
+ extentA.height = dims.iProjAngles;
+ extentA.depth = dims.iProjV+2;
+ cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
+
+ if (err != cudaSuccess) {
+ astraCUDA::reportCudaError(err);
+ fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
+ return 0;
+ }
+
+ zeroProjectionArray(cuArray, dims);
+
+ return cuArray;
+}
+bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims)
+{
+ cudaPitchedPtr zeroBuf;
+ cudaExtent extentS;
+ extentS.width = sizeof(float)*(dims.iVolX+2);
+ extentS.height = dims.iVolY+2;
+ extentS.depth = 1;
+
+ cudaExtent extentA;
+ extentA.width = dims.iVolX+2;
+ extentA.height = dims.iVolY+2;
+ extentA.depth = 1;
+
+
+
+ cudaError err;
+ err = cudaMalloc3D(&zeroBuf, extentS);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2);
+ ASTRA_CUDA_ASSERT(err);
+
+ // zero array
+ for (unsigned int i = 0; i < dims.iVolZ+2; ++i) {
+ cudaMemcpy3DParms p;
+ cudaPos zp = {0, 0, 0};
+ cudaPos dp = {0, 0, i};
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = zeroBuf;
+ p.dstArray = array;
+ p.dstPtr.ptr = 0;
+ p.dstPtr.pitch = 0;
+ p.dstPtr.xsize = 0;
+ p.dstPtr.ysize = 0;
+ p.dstPos = dp;
+ p.extent = extentA;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+ }
+ cudaFree(zeroBuf.ptr);
+
+ // TODO: check errors
+
+ return true;
+}
+bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims)
+{
+ cudaPitchedPtr zeroBuf;
+ cudaExtent extentS;
+ extentS.width = sizeof(float)*(dims.iProjU+2);
+ extentS.height = dims.iProjAngles;
+ extentS.depth = 1;
+ cudaExtent extentA;
+ extentA.width = dims.iProjU+2;
+ extentA.height = dims.iProjAngles;
+ extentA.depth = 1;
+
+
+ cudaError err;
+ err = cudaMalloc3D(&zeroBuf, extentS);
+ ASTRA_CUDA_ASSERT(err);
+ err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles);
+ ASTRA_CUDA_ASSERT(err);
+
+ for (unsigned int i = 0; i < dims.iProjV+2; ++i) {
+ cudaMemcpy3DParms p;
+ cudaPos zp = {0, 0, 0};
+ cudaPos dp = {0, 0, i};
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = zeroBuf;
+ p.dstArray = array;
+ p.dstPtr.ptr = 0;
+ p.dstPtr.pitch = 0;
+ p.dstPtr.xsize = 0;
+ p.dstPtr.ysize = 0;
+ p.dstPos = dp;
+ p.extent = extentA;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+ }
+ cudaFree(zeroBuf.ptr);
+
+ // TODO: check errors
+ return true;
+}
+
+
+bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims)
+{
+ cudaExtent extentA;
+ extentA.width = dims.iVolX;
+ extentA.height = dims.iVolY;
+ extentA.depth = dims.iVolZ;
+
+ cudaMemcpy3DParms p;
+ cudaPos zp = {0, 0, 0};
+ cudaPos dp = {1, 1, 1};
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_volumeData;
+ p.dstArray = array;
+ p.dstPtr.ptr = 0;
+ p.dstPtr.pitch = 0;
+ p.dstPtr.xsize = 0;
+ p.dstPtr.ysize = 0;
+ p.dstPos = dp;
+ p.extent = extentA;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ cudaError err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+ // TODO: check errors
+
+ return true;
+}
+bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
+{
+ cudaExtent extentA;
+ extentA.width = dims.iProjU;
+ extentA.height = dims.iProjAngles;
+ extentA.depth = dims.iProjV;
+
+ cudaMemcpy3DParms p;
+ cudaPos zp = {0, 0, 0};
+ cudaPos dp = {1, 0, 1};
+ p.srcArray = 0;
+ p.srcPos = zp;
+ p.srcPtr = D_projData;
+ p.dstArray = array;
+ p.dstPtr.ptr = 0;
+ p.dstPtr.pitch = 0;
+ p.dstPtr.xsize = 0;
+ p.dstPtr.ysize = 0;
+ p.dstPos = dp;
+ p.extent = extentA;
+ p.kind = cudaMemcpyDeviceToDevice;
+
+ cudaError err = cudaMemcpy3D(&p);
+ ASTRA_CUDA_ASSERT(err);
+
+ // TODO: check errors
+
+ return true;
+}
+
+
+float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
+ unsigned int z)
+{
+ return astraCUDA::dotProduct2D((float*)data.ptr, data.pitch/sizeof(float), x, y*z, 0, 0);
+}
+
+
+bool cudaTextForceKernelsCompletion()
+{
+ cudaError_t returnedCudaError = cudaThreadSynchronize();
+
+ if(returnedCudaError != cudaSuccess) {
+ fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError));
+ return false;
+ }
+
+ return true;
+}
+
+int calcNextPowerOfTwo(int _iValue)
+{
+ int iOutput = 1;
+ while(iOutput < _iValue)
+ iOutput *= 2;
+ return iOutput;
+}
+
+}
diff --git a/cuda/3d/util3d.h b/cuda/3d/util3d.h
new file mode 100644
index 0000000..cf04a18
--- /dev/null
+++ b/cuda/3d/util3d.h
@@ -0,0 +1,69 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_UTIL3D_H
+#define _CUDA_UTIL3D_H
+
+#include <cuda.h>
+#include "dims3d.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#include "../2d/util.h"
+
+namespace astraCUDA3d {
+
+cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims);
+cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims);
+bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims);
+bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims);
+bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool duplicateVolumeData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims);
+bool duplicateProjectionData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims);
+
+
+bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims);
+bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims);
+bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims);
+bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims);
+cudaArray* allocateProjectionArray(const SDimensions3D& dims);
+cudaArray* allocateVolumeArray(const SDimensions3D& dims);
+
+bool cudaTextForceKernelsCompletion();
+
+float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z);
+
+int calcNextPowerOfTwo(int _iValue);
+
+}
+
+#endif