summaryrefslogtreecommitdiffstats
path: root/cuda/3d
diff options
context:
space:
mode:
Diffstat (limited to 'cuda/3d')
-rw-r--r--cuda/3d/algo3d.cu4
-rw-r--r--cuda/3d/arith3d.cu40
-rw-r--r--cuda/3d/astra3d.cu4
-rw-r--r--cuda/3d/cgls3d.cu4
-rw-r--r--cuda/3d/cone_bp.cu31
-rw-r--r--cuda/3d/cone_fp.cu25
-rw-r--r--cuda/3d/darthelper3d.cu4
-rw-r--r--cuda/3d/fdk.cu10
-rw-r--r--cuda/3d/mem3d.cu42
-rw-r--r--cuda/3d/par3d_bp.cu22
-rw-r--r--cuda/3d/par3d_fp.cu41
-rw-r--r--cuda/3d/sirt3d.cu4
-rw-r--r--cuda/3d/util3d.cu107
13 files changed, 148 insertions, 190 deletions
diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu
index 3a83194..4ef2052 100644
--- a/cuda/3d/algo3d.cu
+++ b/cuda/3d/algo3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index 2f4054e..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
index 51e76cd..3df52c8 100644
--- a/cuda/3d/astra3d.cu
+++ b/cuda/3d/astra3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu
index 4829574..cbfb422 100644
--- a/cuda/3d/cgls3d.cu
+++ b/cuda/3d/cgls3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 3525eb4..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -41,8 +41,7 @@ static texture3D gT_coneProjTexture;
namespace astraCUDA3d {
-#define ZSIZE 6
-static const unsigned int g_volBlockZ = ZSIZE;
+static const unsigned int g_volBlockZ = 6;
static const unsigned int g_anglesPerBlock = 32;
static const unsigned int g_volBlockX = 16;
@@ -77,7 +76,7 @@ bool bindProjDataTexture(const cudaArray* array)
//__launch_bounds__(32*16, 4)
-template<bool FDKWEIGHT>
+template<bool FDKWEIGHT, unsigned int ZSIZE>
__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle,
int angleOffset, const astraCUDA3d::SDimensions3D dims,
float fOutputScale)
@@ -342,15 +341,25 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
- if (params.bFDKWeighting)
- dev_cone_BP<true><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
- else if (params.iRaysPerVoxelDim == 1)
- dev_cone_BP<false><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
- else
+ if (params.bFDKWeighting) {
+ if (dims.iVolZ == 1) {
+ dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ } else {
+ dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ }
+ } else if (params.iRaysPerVoxelDim == 1) {
+ if (dims.iVolZ == 1) {
+ dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ } else {
+ dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ }
+ } else
dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index bd607fa..2c3d1f6 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -169,6 +169,8 @@ __global__ void cone_FP_t(float* D_projData, unsigned int projPitch,
const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
int endDetectorV = startDetectorV + g_detBlockV;
if (endDetectorV > dims.iProjV)
@@ -245,6 +247,8 @@ __global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch,
const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
int endDetectorV = startDetectorV + g_detBlockV;
if (endDetectorV > dims.iProjV)
@@ -402,8 +406,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -446,16 +451,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
+ bool ok = true;
- cudaTextForceKernelsCompletion();
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu
index d8ccfa6..c3b93c6 100644
--- a/cuda/3d/darthelper3d.cu
+++ b/cuda/3d/darthelper3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 456694f..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
- cudaTextForceKernelsCompletion();
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+ return false;
if (bShortScan && dims.iProjAngles > 1) {
ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+ return false;
}
- cudaTextForceKernelsCompletion();
return true;
}
diff --git a/cuda/3d/mem3d.cu b/cuda/3d/mem3d.cu
index 50cfe75..ad2a0f3 100644
--- a/cuda/3d/mem3d.cu
+++ b/cuda/3d/mem3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -58,15 +58,13 @@ struct SMemHandle3D_internal
int maxBlockDimension()
{
int dev;
- cudaError_t err = cudaGetDevice(&dev);
- if (err != cudaSuccess) {
+ if (!checkCuda(cudaGetDevice(&dev), "maxBlockDimension getDevice")) {
ASTRA_WARN("Error querying device");
return 0;
}
cudaDeviceProp props;
- err = cudaGetDeviceProperties(&props, dev);
- if (err != cudaSuccess) {
+ if (!checkCuda(cudaGetDeviceProperties(&props, dev), "maxBlockDimension getDviceProps")) {
ASTRA_WARN("Error querying device %d properties", dev);
return 0;
}
@@ -84,10 +82,7 @@ MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Me
size_t free = astraCUDA::availableGPUMemory();
- cudaError_t err;
- err = cudaMalloc3D(&hnd.ptr, make_cudaExtent(sizeof(float)*x, y, z));
-
- if (err != cudaSuccess) {
+ if (!checkCuda(cudaMalloc3D(&hnd.ptr, make_cudaExtent(sizeof(float)*x, y, z)), "allocateGPUMemory malloc3d")) {
return MemHandle3D();
}
@@ -98,8 +93,7 @@ MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Me
if (zero == INIT_ZERO) {
- err = cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z));
- if (err != cudaSuccess) {
+ if (!checkCuda(cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z)), "allocateGPUMemory memset3d")) {
cudaFree(hnd.ptr.ptr);
return MemHandle3D();
}
@@ -116,23 +110,22 @@ bool zeroGPUMemory(MemHandle3D handle, unsigned int x, unsigned int y, unsigned
{
SMemHandle3D_internal& hnd = *handle.d.get();
assert(!hnd.arr);
- cudaError_t err = cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z));
- return err == cudaSuccess;
+ return checkCuda(cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z)), "zeroGPUMemory");
}
bool freeGPUMemory(MemHandle3D handle)
{
size_t free = astraCUDA::availableGPUMemory();
- cudaError_t err;
+ bool ok;
if (handle.d->arr)
- err = cudaFreeArray(handle.d->arr);
+ ok = checkCuda(cudaFreeArray(handle.d->arr), "freeGPUMemory array");
else
- err = cudaFree(handle.d->ptr.ptr);
+ ok = checkCuda(cudaFree(handle.d->ptr.ptr), "freeGPUMemory");
size_t free2 = astraCUDA::availableGPUMemory();
ASTRA_DEBUG("Freeing memory. (Pre: %lu, post: %lu)", free, free2);
- return err == cudaSuccess;
+ return ok;
}
bool copyToGPUMemory(const float *src, MemHandle3D dst, const SSubDimensions3D &pos)
@@ -160,9 +153,7 @@ bool copyToGPUMemory(const float *src, MemHandle3D dst, const SSubDimensions3D &
p.kind = cudaMemcpyHostToDevice;
- cudaError_t err = cudaMemcpy3D(&p);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyToGPUMemory");
}
@@ -197,10 +188,7 @@ bool copyFromGPUMemory(float *dst, MemHandle3D src, const SSubDimensions3D &pos)
p.kind = cudaMemcpyDeviceToHost;
- cudaError_t err = cudaMemcpy3D(&p);
-
- return err == cudaSuccess;
-
+ return checkCuda(cudaMemcpy3D(&p), "copyFromGPUMemory");
}
@@ -409,9 +397,7 @@ bool copyIntoArray(MemHandle3D handle, MemHandle3D subdata, const SSubDimensions
p.kind = cudaMemcpyHostToDevice;
- cudaError_t err = cudaMemcpy3D(&p);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyIntoArray");
}
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 857a314..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -41,8 +41,7 @@ static texture3D gT_par3DProjTexture;
namespace astraCUDA3d {
-#define ZSIZE 6
-static const unsigned int g_volBlockZ = ZSIZE;
+static const unsigned int g_volBlockZ = 6;
static const unsigned int g_anglesPerBlock = 32;
static const unsigned int g_volBlockX = 16;
@@ -77,6 +76,7 @@ static bool bindProjDataTexture(const cudaArray* array)
}
+template<unsigned int ZSIZE>
__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
{
float* volData = (float*)D_volData;
@@ -281,13 +281,19 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);
- if (params.iRaysPerVoxelDim == 1)
- dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
- else
+ if (params.iRaysPerVoxelDim == 1) {
+ if (dims.iVolZ == 1) {
+ dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ } else {
+ dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+ }
+ } else
dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 0a4a5cc..5daddc1 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -175,6 +175,8 @@ __global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
int endDetectorV = startDetectorV + g_detBlockV;
if (endDetectorV > dims.iProjV)
@@ -251,7 +253,10 @@ __global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch,
const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
const float fDistCorr = sc.scale(a1, a2);
+
const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
int endDetectorV = startDetectorV + g_detBlockV;
if (endDetectorV > dims.iProjV)
@@ -359,6 +364,8 @@ __global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch,
const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+ if (detectorU >= dims.iProjU)
+ return;
const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
int endDetectorV = startDetectorV + g_detBlockV;
if (endDetectorV > dims.iProjV)
@@ -501,8 +508,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -545,17 +552,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +732,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu
index e68bde8..746a96b 100644
--- a/cuda/3d/sirt3d.cu
+++ b/cuda/3d/sirt3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 41eb9d2..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -1,7 +1,7 @@
/*
-----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
- 2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+ 2014-2021, CWI, Amsterdam
Contact: astra@astra-toolbox.com
Website: http://www.astra-toolbox.com/
@@ -46,12 +46,9 @@ cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
cudaPitchedPtr volData;
- cudaError err = cudaMalloc3D(&volData, extentV);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3D(&volData, extentV), "allocateVolumeData 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iVolX, dims.iVolY, dims.iVolZ);
volData.ptr = 0;
- // TODO: return 0 somehow?
}
return volData;
@@ -65,12 +62,9 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
cudaPitchedPtr projData;
- cudaError err = cudaMalloc3D(&projData, extentP);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3D(&projData, extentP), "allocateProjectionData 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iProjU, dims.iProjAngles, dims.iProjV);
projData.ptr = 0;
- // TODO: return 0 somehow?
}
return projData;
@@ -78,11 +72,11 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
- cudaError err;
for (unsigned int z = 0; z < dims.iVolZ; ++z) {
- err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
- ASTRA_CUDA_ASSERT(err);
+ if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY), "zeroVolumeData 3D")) {
+ return false;
+ }
t += D_data.pitch * dims.iVolY;
}
return true;
@@ -90,11 +84,11 @@ bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
{
char* t = (char*)D_data.ptr;
- cudaError err;
for (unsigned int z = 0; z < dims.iProjV; ++z) {
- err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
- ASTRA_CUDA_ASSERT(err);
+ if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles), "zeroProjectionData 3D")) {
+ return false;
+ }
t += D_data.pitch * dims.iProjAngles;
}
@@ -128,11 +122,7 @@ bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimens
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyVolumeToDevice 3D");
}
bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -163,11 +153,7 @@ bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SD
p.extent = extentV;
p.kind = cudaMemcpyHostToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyProjectionsToDevice 3D");
}
bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -198,12 +184,9 @@ bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDime
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyVolumeFromDevice 3D");
}
+
bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
{
if (!pitch)
@@ -232,11 +215,7 @@ bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const
p.extent = extentV;
p.kind = cudaMemcpyDeviceToHost;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "copyProjectionsFromDevice 3D");
}
bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
@@ -258,12 +237,9 @@ bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, con
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "duplicateVolumeData 3D");
}
+
bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
{
cudaExtent extentV;
@@ -283,11 +259,7 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,
p.extent = extentV;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err;
- err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- return err == cudaSuccess;
+ return checkCuda(cudaMemcpy3D(&p), "duplicateProjectionData 3D");
}
@@ -303,9 +275,8 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)
extentA.width = dims.iVolX;
extentA.height = dims.iVolY;
extentA.depth = dims.iVolZ;
- cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+
+ if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateVolumeArray 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iVolX, dims.iVolY, dims.iVolZ);
return 0;
}
@@ -320,10 +291,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
extentA.width = dims.iProjU;
extentA.height = dims.iProjAngles;
extentA.depth = dims.iProjV;
- cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
- if (err != cudaSuccess) {
- astraCUDA::reportCudaError(err);
+ if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateProjectionArray 3D")) {
ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iProjU, dims.iProjAngles, dims.iProjV);
return 0;
}
@@ -352,12 +321,9 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferVolumeToArray 3D");
}
+
bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
@@ -379,13 +345,9 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
p.extent = extentA;
p.kind = cudaMemcpyDeviceToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferProjectionsToArray 3D");
}
+
bool transferHostProjectionsToArray(const float *projData, cudaArray* array, const SDimensions3D& dims)
{
cudaExtent extentA;
@@ -413,12 +375,7 @@ bool transferHostProjectionsToArray(const float *projData, cudaArray* array, con
p.extent = extentA;
p.kind = cudaMemcpyHostToDevice;
- cudaError err = cudaMemcpy3D(&p);
- ASTRA_CUDA_ASSERT(err);
-
- // TODO: check errors
-
- return true;
+ return checkCuda(cudaMemcpy3D(&p), "transferHostProjectionsToArray 3D");
}
@@ -430,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
}
-bool cudaTextForceKernelsCompletion()
-{
- cudaError_t returnedCudaError = cudaThreadSynchronize();
-
- if(returnedCudaError != cudaSuccess) {
- ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
- return false;
- }
-
- return true;
-}
-
int calcNextPowerOfTwo(int _iValue)
{
int iOutput = 1;