From 39582115bc93b5435d25e56891815ae7cb1898fd Mon Sep 17 00:00:00 2001
From: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>
Date: Tue, 16 Nov 2021 13:44:13 +0100
Subject: Remove cudaTextForceKernelsCompletion

---
 cuda/2d/arith.cu               | 14 +++++++-------
 cuda/2d/fan_bp.cu              | 15 +++++----------
 cuda/2d/fan_fp.cu              | 11 ++++++-----
 cuda/2d/par_bp.cu              | 10 +++-------
 cuda/2d/par_fp.cu              | 19 ++++++++-----------
 cuda/2d/sart.cu                |  2 +-
 cuda/2d/util.cu                | 17 ++---------------
 cuda/3d/arith3d.cu             | 36 ++++++++++++++++++------------------
 cuda/3d/cone_bp.cu             |  4 +++-
 cuda/3d/cone_fp.cu             | 17 +++++++++--------
 cuda/3d/fdk.cu                 |  6 ++++--
 cuda/3d/par3d_bp.cu            |  4 +++-
 cuda/3d/par3d_fp.cu            | 30 ++++++++++++++----------------
 cuda/3d/util3d.cu              | 12 ------------
 include/astra/cuda/2d/util.h   |  5 -----
 include/astra/cuda/3d/util3d.h |  2 --
 16 files changed, 83 insertions(+), 121 deletions(-)
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index aa0edae..45622d0 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned
 
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi
 
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns
 
 	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i
 
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit
 
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa
 
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned
 
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index fc42456..2068d03 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
@@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 		devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 // D_projData is a pointer to one padded sinogram line
@@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
 	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale);
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
-
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "FanBP_SART");
 }
 
 bool FanBP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index 9bbb401..342ca4c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
 		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
 
-	cudaStreamDestroy(stream1);
-	cudaStreamDestroy(stream2);
+	bool ok = true;
 
-	cudaThreadSynchronize();
+	ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor");
+	cudaStreamDestroy(stream1);
 
-	cudaTextForceKernelsCompletion();
+	ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver");
+	cudaStreamDestroy(stream2);
 
 	cudaFreeArray(D_dataArray);
 
-	return true;
+	return ok;
 }
 
 bool FanFP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index b50b5a5..d7c3ab0 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 bool BP(float* D_volumeData, unsigned int volumePitch,
@@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
 	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale);
-	cudaThreadSynchronize();
-
-	cudaTextForceKernelsCompletion();
 
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "BP_SART");
 }
 
 
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index 8c48280..e947428 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
 				             (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
 
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -323,19 +323,16 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaThreadSynchronize();
+	bool ok = true;
 
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	cudaFreeArray(D_dataArray);
-		
 
-	return true;
+	return ok;
 }
 
 bool FP_simple(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 29f5b43..89d58c2 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int
 
 	devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "MUL_SART");
 }
 
 
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index a75e5ab..ac360f0 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -216,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	// Step 1: reduce 2D from image to a single vector, taking sum of squares
 
 	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D");
 
 	// Step 2: reduce 1D: add up elements in vector
 	if (bx * by > 512)
@@ -233,26 +233,13 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	float x;
 	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D");
 
 	cudaFree(D_buf);
 
 	return x;
 }
 
-
-bool cudaTextForceKernelsCompletion()
-{
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
-		return false;
-	}
-
-	return true;
-}
-
 bool checkCuda(cudaError_t err, const char *msg)
 {
 	if (err != cudaSuccess) {
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index fbaa50c..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
 
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
 
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
 
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
 
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7c3fc8d..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -357,7 +357,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
 				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
 
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 4937d24..fede53b 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -402,8 +402,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 				dim3 dimGrid(
 				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -446,16 +447,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
+	bool ok = true;
 
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 7b36c93..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 
 	devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
 
-	cudaTextForceKernelsCompletion();
+	if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+		return false;
 
 	if (bShortScan && dims.iProjAngles > 1) {
 		ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 
 		devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
 
+		if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+			return false;
 	}
 
-	cudaTextForceKernelsCompletion();
 	return true;
 }
 
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index d356b9f..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -291,7 +291,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
 				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
 
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 1f58516..cf8336c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -501,8 +501,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 				dim3 dimGrid(
 				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -545,17 +545,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
 
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +725,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
 
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok = ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 4f5d134..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -387,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
 }
 
 
-bool cudaTextForceKernelsCompletion()
-{
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
-		return false;
-	}
-
-	return true;
-}
-
 int calcNextPowerOfTwo(int _iValue)
 {
 	int iOutput = 1;
diff --git a/include/astra/cuda/2d/util.h b/include/astra/cuda/2d/util.h
index d504355..0fab9b1 100644
--- a/include/astra/cuda/2d/util.h
+++ b/include/astra/cuda/2d/util.h
@@ -66,13 +66,8 @@ bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dim
 void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims);
 void duplicateProjectionData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims);
 
-
-
-bool cudaTextForceKernelsCompletion();
-
 bool checkCuda(cudaError_t err, const char *msg);
 
-
 float dotProduct2D(float* D_data, unsigned int pitch,
                    unsigned int width, unsigned int height);
 
diff --git a/include/astra/cuda/3d/util3d.h b/include/astra/cuda/3d/util3d.h
index e902a85..e147e42 100644
--- a/include/astra/cuda/3d/util3d.h
+++ b/include/astra/cuda/3d/util3d.h
@@ -60,8 +60,6 @@ bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims);
 cudaArray* allocateProjectionArray(const SDimensions3D& dims);
 cudaArray* allocateVolumeArray(const SDimensions3D& dims);
 
-bool cudaTextForceKernelsCompletion();
-
 float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z);
 
 int calcNextPowerOfTwo(int _iValue);
-- 
cgit v1.2.3