summaryrefslogtreecommitdiffstats
path: root/cuda/3d
diff options
context:
space:
mode:
Diffstat (limited to 'cuda/3d')
-rw-r--r--cuda/3d/arith3d.cu36
-rw-r--r--cuda/3d/cone_bp.cu4
-rw-r--r--cuda/3d/cone_fp.cu17
-rw-r--r--cuda/3d/fdk.cu6
-rw-r--r--cuda/3d/par3d_bp.cu4
-rw-r--r--cuda/3d/par3d_fp.cu30
-rw-r--r--cuda/3d/util3d.cu12
7 files changed, 51 insertions, 58 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index fbaa50c..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
pfOut += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
pfIn += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
pfIn2 += step;
}
- cudaTextForceKernelsCompletion();
+ checkCuda(cudaThreadSynchronize(), __FUNCTION__);
}
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7c3fc8d..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -357,7 +357,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 4937d24..fede53b 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -402,8 +402,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -446,16 +447,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
+ bool ok = true;
- cudaTextForceKernelsCompletion();
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 7b36c93..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
- cudaTextForceKernelsCompletion();
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+ return false;
if (bShortScan && dims.iProjAngles > 1) {
ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
+ if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+ return false;
}
- cudaTextForceKernelsCompletion();
return true;
}
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index d356b9f..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -291,7 +291,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
}
- cudaTextForceKernelsCompletion();
+ // TODO: Consider not synchronizing here, if possible.
+ if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+ return false;
angles = angles + angleCount;
// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 1f58516..cf8336c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -501,8 +501,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
dim3 dimGrid(
((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
- // TODO: check if we can't immediately
- // destroy the stream after use
+ // TODO: consider limiting number of handle (chaotic) geoms
+ // with many alternating directions
cudaStream_t stream;
cudaStreamCreate(&stream);
streams.push_back(stream);
@@ -545,17 +545,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +725,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
}
}
- for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
- cudaStreamDestroy(*iter);
-
- streams.clear();
-
- cudaTextForceKernelsCompletion();
+ bool ok = true;
+ for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+ ok = ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+ cudaStreamDestroy(*iter);
+ }
// printf("%f\n", toc(t));
- return true;
+ return ok;
}
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 4f5d134..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -387,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
}
-bool cudaTextForceKernelsCompletion()
-{
- cudaError_t returnedCudaError = cudaThreadSynchronize();
-
- if(returnedCudaError != cudaSuccess) {
- ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
- return false;
- }
-
- return true;
-}
-
int calcNextPowerOfTwo(int _iValue)
{
int iOutput = 1;