diff options
Diffstat (limited to 'patches')
14 files changed, 3551 insertions, 0 deletions
| diff --git a/patches/astra-toolbox-approximate-projectors/cone_bp.cu b/patches/astra-toolbox-approximate-projectors/cone_bp.cu new file mode 100644 index 0000000..01edcb9 --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/cone_bp.cu @@ -0,0 +1,397 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp +           2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 6; + +static const unsigned int g_anglesPerBlock = 32; +static const unsigned int g_volBlockX = 16; +static const unsigned int g_volBlockY = 32; + +static const unsigned g_MaxAngles = 1024; + +struct DevConeParams { +	float4 fNumU; +	float4 fNumV; +	float4 fDen; +}; + +__constant__ DevConeParams gC_C[g_MaxAngles]; + +#include "rounding.h" + +//__launch_bounds__(32*16, 4) +template<bool FDKWEIGHT, unsigned int ZSIZE> +__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, +                            cudaTextureObject_t tex, +                            int startAngle, int angleOffset, +                            const astraCUDA3d::SDimensions3D dims, +                            float fOutputScale) +{ +	float* volData = (float*)D_volData; + +	int endAngle = startAngle + g_anglesPerBlock; +	if (endAngle > dims.iProjAngles - angleOffset) +		endAngle = dims.iProjAngles - angleOffset; + +	// threadIdx: x = rel x +	//            y = rel y + +	// blockIdx:  x = x + y +	//            y = z + + + +	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; +	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + +	if (X >= dims.iVolX) +		return; +	if (Y >= dims.iVolY) +		return; + +	const int startZ = blockIdx.y * g_volBlockZ; +	const float fX = X - 0.5f*dims.iVolX + 0.5f; +	const float fY = Y - 0.5f*dims.iVolY + 0.5f; +	const float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + +	float Z[ZSIZE]; +	for(int i=0; i < ZSIZE; i++) +		Z[i] = 0.0f; + + +	{ +		float fAngle = startAngle + angleOffset + 0.5f; + +		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) +		{ +			float4 fCu  = gC_C[angle].fNumU; +			float4 fCv  = gC_C[angle].fNumV; +			float4 fCd  = gC_C[angle].fDen; + +			float fUNum = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z; +			float fVNum = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z; +			float fDen  = (FDKWEIGHT ? 1.0f : fCd.w) + fX * fCd.x + fY * fCd.y + fZ * fCd.z; + +			float fU,fV, fr; + +			for (int idx = 0; idx < ZSIZE; idx++) +			{ +				fr = __fdividef(1.0f, fDen); +				fU = fUNum * fr; +				fV = fVNum * fr; + +				float fUf = round(fU) - 0.5f; +				float fVf = round(fV) - 0.5f; + +				textype fU_ = texto(fU); +				textype fV_ = texto(fV); +				textype fUf_ = texto(fUf); +				textype fVf_ = texto(fVf); +				 +				textype fVal0_0; textocheck(fVal0_0, "bp", tex3D<float>(tex, fUf, fAngle, fVf)); +				textype fVal1_0; textocheck(fVal1_0, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf)); +				textype fVal0_1; textocheck(fVal0_1, "bp", tex3D<float>(tex, fUf, fAngle, fVf + 1.0f)); +				textype fVal1_1; textocheck(fVal1_1, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf + 1.0f)); + +				textype fVal0 = interpolate(fVal0_0, fVal0_1, (fV_ - fVf_)); +				textype fVal1 = interpolate(fVal1_0, fVal1_1, (fV_ - fVf_)); +				float fVal = texfrom(interpolate(fVal0, fVal1, (fU_ - fUf_))); + +//				float fVal = tex3D<float>(tex, fU, fAngle, fV); +				Z[idx] += fr*fr*fVal; + +				fUNum += fCu.z; +				fVNum += fCv.z; +				fDen  += fCd.z; +			} +		} +	} + +	int endZ = ZSIZE; +	if (endZ > dims.iVolZ - startZ) +		endZ = dims.iVolZ - startZ; + +	for(int i=0; i < endZ; i++) +		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale; +} //End kernel + + + +// supersampling version +__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale) +{ +	float* volData = (float*)D_volData; + +	int endAngle = startAngle + g_anglesPerBlock; +	if (endAngle > dims.iProjAngles - angleOffset) +		endAngle = dims.iProjAngles - angleOffset; + +	// threadIdx: x = rel x +	//            y = rel y + +	// blockIdx:  x = x + y +    //            y = z + + +	// TO TRY: precompute part of detector intersection formulas in shared mem? +	// TO TRY: inner loop over z, gather ray values in shared mem + +	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; +	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + +	if (X >= dims.iVolX) +		return; +	if (Y >= dims.iVolY) +		return; + +	const int startZ = blockIdx.y * g_volBlockZ; +	int endZ = startZ + g_volBlockZ; +	if (endZ > dims.iVolZ) +		endZ = dims.iVolZ; + +	float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; +	float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; +	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; +	const float fSubStep = 1.0f/iRaysPerVoxelDim; + +	fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim); + + +	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) +	{ + +		float fVal = 0.0f; +		float fAngle = startAngle + angleOffset + 0.5f; + +		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) +		{ +			float4 fCu  = gC_C[angle].fNumU; +			float4 fCv  = gC_C[angle].fNumV; +			float4 fCd  = gC_C[angle].fDen; + +			float fXs = fX; +			for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) { +			float fYs = fY; +			for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) { +			float fZs = fZ; +			for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) { + +				const float fUNum = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z; +				const float fVNum = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z; +				const float fDen  = fCd.w + fXs * fCd.x + fYs * fCd.y + fZs * fCd.z; + +				const float fr = __fdividef(1.0f, fDen); +				const float fU = fUNum * fr; +				const float fV = fVNum * fr; + +				fVal += tex3D<float>(tex, fU, fAngle, fV) * fr * fr; + +				fZs += fSubStep; +			} +			fYs += fSubStep; +			} +			fXs += fSubStep; +			} + +		} + +		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale; +	} +} + + +bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params) +{ +	DevConeParams *p = new DevConeParams[iProjAngles]; + +	// We need three things in the kernel: +	// projected coordinates of pixels on the detector: + +	// u: || (x-s) v (s-d) || / || u v (s-x) || +	// v: -|| u (x-s) (s-d) || / || u v (s-x) || + +	// ray density weighting factor for the adjoint +	// || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 ) + +	// FDK weighting factor +	// ( || u v s || / || u v (s-x) || ) ^ 2 + +	// Since u and v are ratios with the same denominator, we have +	// a degree of freedom to scale the denominator. We use that to make +	// the square of the denominator equal to the relevant weighting factor. + + +	for (unsigned int i = 0; i < iProjAngles; ++i) { +		Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ); +		Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ); +		Vec3 s(angles[i].fSrcX, angles[i].fSrcY, angles[i].fSrcZ); +		Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ); + + + +		double fScale; +		if (!params.bFDKWeighting) { +			// goal: 1/fDen^2 = || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 ) +			// fDen = ( sqrt(|cross(u,v)|) * || u v (s-x) || ) / || u v (s-d) ||  +			// i.e. scale = sqrt(|cross(u,v)|) * / || u v (s-d) || + + +			// NB: for cross(u,v) we invert the volume scaling (for the voxel +			// size normalization) to get the proper dimensions for +			// the scaling of the adjoint + +			fScale = sqrt(scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm()) / det3(u, v, s-d); +		} else { +			// goal: 1/fDen = || u v s || / || u v (s-x) || +			// fDen = || u v (s-x) || / || u v s || +			// i.e., scale = 1 / || u v s || + +			fScale = 1.0 / det3(u, v, s); +		} + +		p[i].fNumU.w = fScale * det3(s,v,d); +		p[i].fNumU.x = fScale * det3x(v,s-d); +		p[i].fNumU.y = fScale * det3y(v,s-d); +		p[i].fNumU.z = fScale * det3z(v,s-d); +		p[i].fNumV.w = -fScale * det3(s,u,d); +		p[i].fNumV.x = -fScale * det3x(u,s-d); +		p[i].fNumV.y = -fScale * det3y(u,s-d); +		p[i].fNumV.z = -fScale * det3z(u,s-d); +		p[i].fDen.w = fScale * det3(u, v, s); // == 1.0 for FDK +		p[i].fDen.x = -fScale * det3x(u, v); +		p[i].fDen.y = -fScale * det3y(u, v); +		p[i].fDen.z = -fScale * det3z(u, v); +	} + +	// TODO: Check for errors +	cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevConeParams), 0, cudaMemcpyHostToDevice); + +	delete[] p; + +	return true; +} + + +bool ConeBP_Array(cudaPitchedPtr D_volumeData, +                  cudaArray *D_projArray, +                  const SDimensions3D& dims, const SConeProjection* angles, +                  const SProjectorParams3D& params) +{ +	cudaTextureObject_t D_texObj; +	if (!createTextureObject3D(D_projArray, D_texObj)) +		return false; + +	float fOutputScale; +	if (params.bFDKWeighting) { +		// NB: assuming cube voxels here +		fOutputScale = params.fOutputScale / (params.fVolScaleX); +	} else { +		fOutputScale = params.fOutputScale * (params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ); +	} + +	bool ok = true; + +	for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) { +		unsigned int angleCount = g_MaxAngles; +		if (th + angleCount > dims.iProjAngles) +			angleCount = dims.iProjAngles - th; + +		ok = transferConstants(angles, angleCount, params); +		if (!ok) +			break; + +		dim3 dimBlock(g_volBlockX, g_volBlockY); + +		dim3 dimGrid(((dims.iVolX/1+g_volBlockX-1)/(g_volBlockX))*((dims.iVolY/1+1*g_volBlockY-1)/(1*g_volBlockY)), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + +		// timeval t; +		// tic(t); + +		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) { +		// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);  +			if (params.bFDKWeighting) { +				if (dims.iVolZ == 1) { +					dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} else { +					dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} +			} else if (params.iRaysPerVoxelDim == 1) { +				if (dims.iVolZ == 1) { +					dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} else { +					dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} +			} else +				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale); +		} + +		// TODO: Consider not synchronizing here, if possible. +		ok = checkCuda(cudaThreadSynchronize(), "cone_bp"); +		if (!ok) +			break; + +		angles = angles + angleCount; +		// printf("%f\n", toc(t)); + +	} + +	cudaDestroyTextureObject(D_texObj); + +	return ok; +} + +bool ConeBP(cudaPitchedPtr D_volumeData, +            cudaPitchedPtr D_projData, +            const SDimensions3D& dims, const SConeProjection* angles, +            const SProjectorParams3D& params) +{ +	// transfer projections to array + +	cudaArray* cuArray = allocateProjectionArray(dims); +	transferProjectionsToArray(D_projData, cuArray, dims); + +	bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles, params); + +	cudaFreeArray(cuArray); + +	return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/cone_fp.cu b/patches/astra-toolbox-approximate-projectors/cone_fp.cu new file mode 100644 index 0000000..f11813c --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/cone_fp.cu @@ -0,0 +1,516 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp +           2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 4; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_SrcZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +// x=0, y=1, z=2 +struct DIR_X { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float c0(float x, float y, float z) const { return x; } +	__device__ float c1(float x, float y, float z) const { return y; } +	__device__ float c2(float x, float y, float z) const { return z; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); } +	__device__ float x(float f0, float f1, float f2) const { return f0; } +	__device__ float y(float f0, float f1, float f2) const { return f1; } +	__device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// y=0, x=1, z=2 +struct DIR_Y { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float c0(float x, float y, float z) const { return y; } +	__device__ float c1(float x, float y, float z) const { return x; } +	__device__ float c2(float x, float y, float z) const { return z; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); } +	__device__ float x(float f0, float f1, float f2) const { return f1; } +	__device__ float y(float f0, float f1, float f2) const { return f0; } +	__device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// z=0, x=1, y=2 +struct DIR_Z { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float c0(float x, float y, float z) const { return z; } +	__device__ float c1(float x, float y, float z) const { return x; } +	__device__ float c2(float x, float y, float z) const { return y; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); } +	__device__ float x(float f0, float f1, float f2) const { return f1; } +	__device__ float y(float f0, float f1, float f2) const { return f2; } +	__device__ float z(float f0, float f1, float f2) const { return f0; } +}; + +struct SCALE_CUBE { +	float fOutputScale; +	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; } +}; + +struct SCALE_NONCUBE { +	float fScale1; +	float fScale2; +	float fOutputScale; +	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; } +}; + + +bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles) +{ +	// transfer angles to constant memory +	float* tmp = new float[iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + +	TRANSFER_TO_CONSTANT(SrcX); +	TRANSFER_TO_CONSTANT(SrcY); +	TRANSFER_TO_CONSTANT(SrcZ); +	TRANSFER_TO_CONSTANT(DetSX); +	TRANSFER_TO_CONSTANT(DetSY); +	TRANSFER_TO_CONSTANT(DetSZ); +	TRANSFER_TO_CONSTANT(DetUX); +	TRANSFER_TO_CONSTANT(DetUY); +	TRANSFER_TO_CONSTANT(DetUZ); +	TRANSFER_TO_CONSTANT(DetVX); +	TRANSFER_TO_CONSTANT(DetVY); +	TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + +	delete[] tmp; + +	return true; +} + + +#include "rounding.h" + +	// threadIdx: x = ??? detector  (u?) +	//            y = relative angle + +	// blockIdx:  x = ??? detector  (u+v?) +    //            y = angle block + +template<class COORD, class SCALE> +__global__ void cone_FP_t(float* D_projData, unsigned int projPitch, +                          cudaTextureObject_t tex, +                          unsigned int startSlice, +                          unsigned int startAngle, unsigned int endAngle, +                          const SDimensions3D dims, +                          SCALE sc) +{ +	COORD c; + +	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; +	if (angle >= endAngle) +		return; + +	const float fSrcX = gC_SrcX[angle]; +	const float fSrcY = gC_SrcY[angle]; +	const float fSrcZ = gC_SrcZ[angle]; +	const float fDetUX = gC_DetUX[angle]; +	const float fDetUY = gC_DetUY[angle]; +	const float fDetUZ = gC_DetUZ[angle]; +	const float fDetVX = gC_DetVX[angle]; +	const float fDetVY = gC_DetVY[angle]; +	const float fDetVZ = gC_DetVZ[angle]; +	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; +	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; +	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + +	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; +	if (detectorU >= dims.iProjU) +		return; +	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; +	int endDetectorV = startDetectorV + g_detBlockV; +	if (endDetectorV > dims.iProjV) +		endDetectorV = dims.iProjV; + +	int endSlice = startSlice + g_blockSlices; +	if (endSlice > c.nSlices(dims)) +		endSlice = c.nSlices(dims); + +	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) +	{ +		/* Trace ray from Src to (detectorU,detectorV) from */ +		/* X = startSlice to X = endSlice                   */ + +		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; +		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; +		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + +		/*        (x)   ( 1)       ( 0) */ +		/* ray:   (y) = (ay) * x + (by) */ +		/*        (z)   (az)       (bz) */ + +		const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); +		const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); +		const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ); +		const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ); + +		const float fDistCorr = sc.scale(a1, a2); + +		float fVal = 0.0f; + +		float f0 = startSlice + 0.5f; +		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; +		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + +		for (int s = startSlice; s < endSlice; ++s) +		{ + +			float f1f = round(f1) - 0.5f; +			float f2f = round(f2) - 0.5f; + +			textype f1_ = texto(f1); +			textype f2_ = texto(f2); +			textype f1f_ = texto(f1f); +			textype f2f_ = texto(f2f); + +			textype fVal0_0; textocheck(fVal0_0, "fp", c.tex(tex, f0, f1f, f2f)); +			textype fVal1_0; textocheck(fVal1_0, "fp", c.tex(tex, f0, f1f + 1.0f, f2f)); +			textype fVal0_1; textocheck(fVal0_1, "fp", c.tex(tex, f0, f1f, f2f + 1.0f)); +			textype fVal1_1; textocheck(fVal1_1, "fp", c.tex(tex, f0, f1f + 1.0f, f2f + 1.0f)); + +			textype fVal0 = interpolate(fVal0_0, fVal0_1, (f2_ - f2f_)); +			textype fVal1 = interpolate(fVal1_0, fVal1_1, (f2_ - f2f_)); +			fVal += texfrom(interpolate(fVal0, fVal1, (f1_ - f1f_))); + +//			fVal += c.tex(tex, f0, f1, f2); +			f0 += 1.0f; +			f1 += a1; +			f2 += a2; +		} + +		fVal *= fDistCorr; + +		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; +	} +} + +template<class COORD> +__global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch, +                             cudaTextureObject_t tex, +                             unsigned int startSlice, +                             unsigned int startAngle, unsigned int endAngle, +                             const SDimensions3D dims, int iRaysPerDetDim, +                             SCALE_NONCUBE sc) +{ +	COORD c; + +	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; +	if (angle >= endAngle) +		return; + +	const float fSrcX = gC_SrcX[angle]; +	const float fSrcY = gC_SrcY[angle]; +	const float fSrcZ = gC_SrcZ[angle]; +	const float fDetUX = gC_DetUX[angle]; +	const float fDetUY = gC_DetUY[angle]; +	const float fDetUZ = gC_DetUZ[angle]; +	const float fDetVX = gC_DetVX[angle]; +	const float fDetVY = gC_DetVY[angle]; +	const float fDetVZ = gC_DetVZ[angle]; +	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; +	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; +	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + +	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; +	if (detectorU >= dims.iProjU) +		return; +	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; +	int endDetectorV = startDetectorV + g_detBlockV; +	if (endDetectorV > dims.iProjV) +		endDetectorV = dims.iProjV; + +	int endSlice = startSlice + g_blockSlices; +	if (endSlice > c.nSlices(dims)) +		endSlice = c.nSlices(dims); + +	const float fSubStep = 1.0f/iRaysPerDetDim; + +	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) +	{ +		/* Trace ray from Src to (detectorU,detectorV) from */ +		/* X = startSlice to X = endSlice                   */ + +		float fV = 0.0f; + +		float fdU = detectorU - 0.5f + 0.5f*fSubStep; +		for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { +		float fdV = detectorV - 0.5f + 0.5f*fSubStep; +		for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { + +		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; +		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; +		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; + +		/*        (x)   ( 1)       ( 0) */ +		/* ray:   (y) = (ay) * x + (by) */ +		/*        (z)   (az)       (bz) */ + +		const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); +		const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); +		const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ); +		const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ); + +		const float fDistCorr = sc.scale(a1, a2); + +		float fVal = 0.0f; + +		float f0 = startSlice + 0.5f; +		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; +		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + +		for (int s = startSlice; s < endSlice; ++s) +		{ +			fVal += c.tex(tex, f0, f1, f2); +			f0 += 1.0f; +			f1 += a1; +			f2 += a2; +		} + +		fVal *= fDistCorr; +		fV += fVal; + +		} +		} + +		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim); +	} +} + + +bool ConeFP_Array_internal(cudaPitchedPtr D_projData, +                  cudaTextureObject_t D_texObj, +                  const SDimensions3D& dims, +                  unsigned int angleCount, const SConeProjection* angles, +                  const SProjectorParams3D& params) +{ +	if (!transferConstants(angles, angleCount)) +		return false; + +	std::list<cudaStream_t> streams; +	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + +	// Run over all angles, grouping them into groups of the same +	// orientation (roughly horizontal vs. roughly vertical). +	// Start a stream of grids for each such group. + +	unsigned int blockStart = 0; +	unsigned int blockEnd = 0; +	int blockDirection = 0; + +	bool cube = true; +	if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001) +		cube = false; +	if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001) +		cube = false; + +	SCALE_CUBE scube; +	scube.fOutputScale = params.fOutputScale * params.fVolScaleX; + +	SCALE_NONCUBE snoncubeX; +	float fS1 = params.fVolScaleY / params.fVolScaleX; +	snoncubeX.fScale1 = fS1 * fS1; +	float fS2 = params.fVolScaleZ / params.fVolScaleX; +	snoncubeX.fScale2 = fS2 * fS2; +	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + +	SCALE_NONCUBE snoncubeY; +	fS1 = params.fVolScaleX / params.fVolScaleY; +	snoncubeY.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleZ / params.fVolScaleY; +	snoncubeY.fScale2 = fS2 * fS2; +	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + +	SCALE_NONCUBE snoncubeZ; +	fS1 = params.fVolScaleX / params.fVolScaleZ; +	snoncubeZ.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleY / params.fVolScaleZ; +	snoncubeZ.fScale2 = fS2 * fS2; +	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + +	// timeval t; +	// tic(t); + +	for (unsigned int a = 0; a <= angleCount; ++a) { +		int dir = -1; +		if (a != angleCount) { +			float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f)); +			float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f)); +			float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f)); + +			if (dX >= dY && dX >= dZ) +				dir = 0; +			else if (dY >= dX && dY >= dZ) +				dir = 1; +			else +				dir = 2; +		} + +		if (a == angleCount || dir != blockDirection) { +			// block done + +			blockEnd = a; +			if (blockStart != blockEnd) { + +				dim3 dimGrid( +				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + +				// TODO: consider limiting number of handle (chaotic) geoms +				//       with many alternating directions +				cudaStream_t stream; +				cudaStreamCreate(&stream); +				streams.push_back(stream); + +				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + +				if (blockDirection == 0) { +					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							if (cube) +								cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); +							else +								cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeX); +						else +							cone_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX); +				} else if (blockDirection == 1) { +					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							if (cube) +								cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); +							else +								cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeY); +						else +							cone_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY); +				} else if (blockDirection == 2) { +					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							if (cube) +								cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); +							else +								cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeZ); +						else +							cone_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ); +				} + +			} + +			blockDirection = dir; +			blockStart = a; +		} +	} + +	bool ok = true; + +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp"); +		cudaStreamDestroy(*iter); +	} + +	// printf("%f\n", toc(t)); + +	return ok; +} + + +bool ConeFP(cudaPitchedPtr D_volumeData, +            cudaPitchedPtr D_projData, +            const SDimensions3D& dims, const SConeProjection* angles, +            const SProjectorParams3D& params) +{ +	// transfer volume to array +	cudaArray* cuArray = allocateVolumeArray(dims); +	transferVolumeToArray(D_volumeData, cuArray, dims); + +	cudaTextureObject_t D_texObj; +	if (!createTextureObject3D(cuArray, D_texObj)) { +		cudaFreeArray(cuArray); +		return false; +	} + +	bool ret; + +	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { +		unsigned int iEndAngle = iAngle + g_MaxAngles; +		if (iEndAngle >= dims.iProjAngles) +			iEndAngle = dims.iProjAngles; + +		cudaPitchedPtr D_subprojData = D_projData; +		D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch; + +		ret = ConeFP_Array_internal(D_subprojData, D_texObj, +		                            dims, iEndAngle - iAngle, angles + iAngle, +		                            params); +		if (!ret) +			break; +	} + +	cudaFreeArray(cuArray); + +	return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/par3d_bp.cu b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu new file mode 100644 index 0000000..7958ac9 --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu @@ -0,0 +1,327 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp +           2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 6; + +static const unsigned int g_anglesPerBlock = 32; +static const unsigned int g_volBlockX = 16; +static const unsigned int g_volBlockY = 32; + +static const unsigned g_MaxAngles = 1024; + +struct DevPar3DParams { +	float4 fNumU; +	float4 fNumV; +}; + +__constant__ DevPar3DParams gC_C[g_MaxAngles]; +__constant__ float gC_scale[g_MaxAngles]; + + +#include "rounding.h" + +template<unsigned int ZSIZE> +__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale) +{ +	float* volData = (float*)D_volData; + +	int endAngle = startAngle + g_anglesPerBlock; +	if (endAngle > dims.iProjAngles - angleOffset) +		endAngle = dims.iProjAngles - angleOffset; + +	// threadIdx: x = rel x +	//            y = rel y + +	// blockIdx:  x = x + y +	//            y = z + + +	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; +	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + +	if (X >= dims.iVolX) +		return; +	if (Y >= dims.iVolY) +		return; + +	const int startZ = blockIdx.y * g_volBlockZ; + +	float fX = X - 0.5f*dims.iVolX + 0.5f; +	float fY = Y - 0.5f*dims.iVolY + 0.5f; +	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + +	float Z[ZSIZE]; +	for(int i=0; i < ZSIZE; i++) +		Z[i] = 0.0f; + +	{ +		float fAngle = startAngle + angleOffset + 0.5f; + +		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) +		{ + +			float4 fCu = gC_C[angle].fNumU; +			float4 fCv = gC_C[angle].fNumV; +			float fS = gC_scale[angle]; + +			float fU = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z; +			float fV = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z; +//			printf("%f %f\n", fU, fV); + +			for (int idx = 0; idx < ZSIZE; ++idx) { +				float fVal; +				textype h5 = texto(0.5f); +				textype fU_ = texto(fU); +				textype fUf_ = texto(floor(fU)); +				float fUf = floor(fU); +				 +				if ((fU - fUf) < 0.5f) { +				    textype fVal1 = texto(tex3D<float>(tex, fUf - 0.5f, fAngle, fV)); +				    textype fVal2 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV)); +				    fVal = texfrom(fVal1 + (fU_ + h5 - fUf_) * (fVal2 - fVal1)); +				} else { +				    textype fVal1 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV)); +				    textype fVal2 = texto(tex3D<float>(tex, fUf + 1.5f, fAngle, fV)); +				    fVal = texfrom(fVal1 + (fU_ - h5 - fUf_) * (fVal2 - fVal1)); +				} + +//				float fVal = tex3D<float>(tex, fU, fAngle, fV); +				Z[idx] += fVal * fS; + +				fU += fCu.z; +				fV += fCv.z; +			} + +		} +	} + +	int endZ = ZSIZE; +	if (endZ > dims.iVolZ - startZ) +		endZ = dims.iVolZ - startZ; + +	for(int i=0; i < endZ; i++) +		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale; +} + +// supersampling version +__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale) +{ +	float* volData = (float*)D_volData; + +	int endAngle = startAngle + g_anglesPerBlock; +	if (endAngle > dims.iProjAngles - angleOffset) +		endAngle = dims.iProjAngles - angleOffset; + +	// threadIdx: x = rel x +	//            y = rel y + +	// blockIdx:  x = x + y +    //            y = z + + +	// TO TRY: precompute part of detector intersection formulas in shared mem? +	// TO TRY: inner loop over z, gather ray values in shared mem + +	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; +	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + +	if (X >= dims.iVolX) +		return; +	if (Y >= dims.iVolY) +		return; + +	const int startZ = blockIdx.y * g_volBlockZ; +	int endZ = startZ + g_volBlockZ; +	if (endZ > dims.iVolZ) +		endZ = dims.iVolZ; + +	float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; +	float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; +	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + +	const float fSubStep = 1.0f/iRaysPerVoxelDim; + +	fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim); + + +	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) +	{ + +		float fVal = 0.0f; +		float fAngle = startAngle + angleOffset + 0.5f; + +		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) +		{ +			float4 fCu = gC_C[angle].fNumU; +			float4 fCv = gC_C[angle].fNumV; +			float fS = gC_scale[angle]; + +			float fXs = fX; +			for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) { +			float fYs = fY; +			for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) { +			float fZs = fZ; +			for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) { + +				const float fU = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z; +				const float fV = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z; + +				fVal += tex3D<float>(tex, fU, fAngle, fV) * fS; +				fZs += fSubStep; +			} +			fYs += fSubStep; +			} +			fXs += fSubStep; +			} + +		} + +		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale; +	} + +} + +bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params) +{ +	DevPar3DParams *p = new DevPar3DParams[iProjAngles]; +	float *s = new float[iProjAngles]; + +	for (unsigned int i = 0; i < iProjAngles; ++i) { +		Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ); +		Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ); +		Vec3 r(angles[i].fRayX, angles[i].fRayY, angles[i].fRayZ); +		Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ); + +		double fDen = det3(r,u,v); +		p[i].fNumU.x = -det3x(r,v) / fDen; +		p[i].fNumU.y = -det3y(r,v) / fDen; +		p[i].fNumU.z = -det3z(r,v) / fDen; +		p[i].fNumU.w = -det3(r,d,v) / fDen; +		p[i].fNumV.x = det3x(r,u) / fDen; +		p[i].fNumV.y = det3y(r,u) / fDen; +		p[i].fNumV.z = det3z(r,u) / fDen; +		p[i].fNumV.w = det3(r,d,u) / fDen; + +		s[i] = 1.0 / scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm(); +	} + +	cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevPar3DParams), 0, cudaMemcpyHostToDevice); +	cudaMemcpyToSymbol(gC_scale, s, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + +	delete[] p; +	delete[] s; + +	return true; +} + +bool Par3DBP_Array(cudaPitchedPtr D_volumeData, +                   cudaArray *D_projArray, +                   const SDimensions3D& dims, const SPar3DProjection* angles, +                   const SProjectorParams3D& params) +{ +	cudaTextureObject_t D_texObj; +	if (!createTextureObject3D(D_projArray, D_texObj)) +		return false; + +	float fOutputScale = params.fOutputScale * params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ; + +	bool ok = true; + +	for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) { +		unsigned int angleCount = g_MaxAngles; +		if (th + angleCount > dims.iProjAngles) +			angleCount = dims.iProjAngles - th; + +		ok = transferConstants(angles, angleCount, params); +		if (!ok) +			break; + +		dim3 dimBlock(g_volBlockX, g_volBlockY); + +		dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + +		// timeval t; +		// tic(t); + +		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) { +			// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr);  +			if (params.iRaysPerVoxelDim == 1) { +				if (dims.iVolZ == 1) { +					dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} else { +					dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); +				} +			} else +				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale); +		} + +		// TODO: Consider not synchronizing here, if possible. +		ok = checkCuda(cudaThreadSynchronize(), "cone_bp"); +		if (!ok) +			break; + +		angles = angles + angleCount; +		// printf("%f\n", toc(t)); + +	} + +	cudaDestroyTextureObject(D_texObj); + +	return true; +} + +bool Par3DBP(cudaPitchedPtr D_volumeData, +            cudaPitchedPtr D_projData, +            const SDimensions3D& dims, const SPar3DProjection* angles, +            const SProjectorParams3D& params) +{ +	// transfer projections to array + +	cudaArray* cuArray = allocateProjectionArray(dims); +	transferProjectionsToArray(D_projData, cuArray, dims); + +	bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles, params); + +	cudaFreeArray(cuArray); + +	return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/par3d_fp.cu b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu new file mode 100644 index 0000000..075784b --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu @@ -0,0 +1,770 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp +           2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 32; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_RayX[g_MaxAngles]; +__constant__ float gC_RayY[g_MaxAngles]; +__constant__ float gC_RayZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +// x=0, y=1, z=2 +struct DIR_X { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float c0(float x, float y, float z) const { return x; } +	__device__ float c1(float x, float y, float z) const { return y; } +	__device__ float c2(float x, float y, float z) const { return z; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); } +	__device__ float x(float f0, float f1, float f2) const { return f0; } +	__device__ float y(float f0, float f1, float f2) const { return f1; } +	__device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// y=0, x=1, z=2 +struct DIR_Y { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float c0(float x, float y, float z) const { return y; } +	__device__ float c1(float x, float y, float z) const { return x; } +	__device__ float c2(float x, float y, float z) const { return z; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); } +	__device__ float x(float f0, float f1, float f2) const { return f1; } +	__device__ float y(float f0, float f1, float f2) const { return f0; } +	__device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// z=0, x=1, y=2 +struct DIR_Z { +	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; } +	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } +	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; } +	__device__ float c0(float x, float y, float z) const { return z; } +	__device__ float c1(float x, float y, float z) const { return x; } +	__device__ float c2(float x, float y, float z) const { return y; } +	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); } +	__device__ float x(float f0, float f1, float f2) const { return f1; } +	__device__ float y(float f0, float f1, float f2) const { return f2; } +	__device__ float z(float f0, float f1, float f2) const { return f0; } +}; + +struct SCALE_CUBE { +	float fOutputScale; +	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; } +}; + +struct SCALE_NONCUBE { +	float fScale1; +	float fScale2; +	float fOutputScale; +	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; } +}; + +bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles) +{ +	// transfer angles to constant memory +	float* tmp = new float[iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + +	TRANSFER_TO_CONSTANT(RayX); +	TRANSFER_TO_CONSTANT(RayY); +	TRANSFER_TO_CONSTANT(RayZ); +	TRANSFER_TO_CONSTANT(DetSX); +	TRANSFER_TO_CONSTANT(DetSY); +	TRANSFER_TO_CONSTANT(DetSZ); +	TRANSFER_TO_CONSTANT(DetUX); +	TRANSFER_TO_CONSTANT(DetUY); +	TRANSFER_TO_CONSTANT(DetUZ); +	TRANSFER_TO_CONSTANT(DetVX); +	TRANSFER_TO_CONSTANT(DetVY); +	TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + +	delete[] tmp; + +	return true; +} + + +// threadIdx: x = u detector +//            y = relative angle +// blockIdx:  x = u/v detector +//            y = angle block + +#include "rounding.h" + +template<class COORD, class SCALE> +__global__ void par3D_FP_t(float* D_projData, unsigned int projPitch, +                           cudaTextureObject_t tex, +                           unsigned int startSlice, +                           unsigned int startAngle, unsigned int endAngle, +                           const SDimensions3D dims, +                           SCALE sc) +{ +	COORD c; + +	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; +	if (angle >= endAngle) +		return; + +	const float fRayX = gC_RayX[angle]; +	const float fRayY = gC_RayY[angle]; +	const float fRayZ = gC_RayZ[angle]; +	const float fDetUX = gC_DetUX[angle]; +	const float fDetUY = gC_DetUY[angle]; +	const float fDetUZ = gC_DetUZ[angle]; +	const float fDetVX = gC_DetVX[angle]; +	const float fDetVY = gC_DetVY[angle]; +	const float fDetVZ = gC_DetVZ[angle]; +	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; +	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; +	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + +	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float fDistCorr = sc.scale(a1, a2); + + +	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; +	if (detectorU >= dims.iProjU) +		return; +	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; +	int endDetectorV = startDetectorV + g_detBlockV; +	if (endDetectorV > dims.iProjV) +		endDetectorV = dims.iProjV; + +	int endSlice = startSlice + g_blockSlices; +	if (endSlice > c.nSlices(dims)) +		endSlice = c.nSlices(dims); + +	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) +	{ +		/* Trace ray in direction Ray to (detectorU,detectorV) from  */ +		/* X = startSlice to X = endSlice                            */ + +		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; +		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; +		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + +		/*        (x)   ( 1)       ( 0)    */ +		/* ray:   (y) = (ay) * x + (by)    */ +		/*        (z)   (az)       (bz)    */ + +		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); +		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + +		float fVal = 0.0f; + +		float f0 = startSlice + 0.5f; +		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; +		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; +		//printf("%f, %f (%f), %f (%f)\n", f0, f1, a1, f2, a2); // Only f1 non linear + +		for (int s = startSlice; s < endSlice; ++s) +		{ + +			textype h5 = texto(0.5f); +			textype f1_ = texto(f1); +			textype f1f_ = texto(floor(f1)); +			float f1f = floor(f1); +				 +			if ((f1 - f1f) < 0.5f) { +			    textype fVal1 = texto(c.tex(tex, f0, f1f - 0.5f, f2)); +			    textype fVal2 = texto(c.tex(tex, f0, f1f + 0.5f, f2)); +			    fVal += texfrom(fVal1 + (f1_ + h5 - f1f_) * (fVal2 - fVal1)); +//			    fVal += texfrom(__hfma(__hadd(h5,__hsub(f1_, f1f_)), __hsub(fVal2, fVal1), fVal1)); +			} else { +			    textype fVal1 = texto(c.tex(tex, f0, f1f + 0.5f, f2)); +			    textype fVal2 = texto(c.tex(tex, f0, f1f + 1.5f, f2)); +			    fVal += texfrom(fVal1 + (f1_ - h5 - f1f_) * (fVal2 - fVal1)); +			} + +//			fVal += c.tex(tex, f0, f1, f2); +			f0 += 1.0f; +			f1 += a1; +			f2 += a2; +		} + +		fVal *= fDistCorr; + +		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; +	} +} + +// Supersampling version +template<class COORD> +__global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch, +                              cudaTextureObject_t tex, +                              unsigned int startSlice, +                              unsigned int startAngle, unsigned int endAngle, +                              const SDimensions3D dims, int iRaysPerDetDim, +                              SCALE_NONCUBE sc) +{ +	COORD c; + +	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; +	if (angle >= endAngle) +		return; + +	const float fRayX = gC_RayX[angle]; +	const float fRayY = gC_RayY[angle]; +	const float fRayZ = gC_RayZ[angle]; +	const float fDetUX = gC_DetUX[angle]; +	const float fDetUY = gC_DetUY[angle]; +	const float fDetUZ = gC_DetUZ[angle]; +	const float fDetVX = gC_DetVX[angle]; +	const float fDetVY = gC_DetVY[angle]; +	const float fDetVZ = gC_DetVZ[angle]; +	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; +	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; +	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + +	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float fDistCorr = sc.scale(a1, a2); + + +	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; +	if (detectorU >= dims.iProjU) +		return; +	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; +	int endDetectorV = startDetectorV + g_detBlockV; +	if (endDetectorV > dims.iProjV) +		endDetectorV = dims.iProjV; + +	int endSlice = startSlice + g_blockSlices; +	if (endSlice > c.nSlices(dims)) +		endSlice = c.nSlices(dims); + +	const float fSubStep = 1.0f/iRaysPerDetDim; + +	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) +	{ + +		float fV = 0.0f; + +		float fdU = detectorU - 0.5f + 0.5f*fSubStep; +		for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { +		float fdV = detectorV - 0.5f + 0.5f*fSubStep; +		for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { + +		/* Trace ray in direction Ray to (detectorU,detectorV) from  */ +		/* X = startSlice to X = endSlice                            */ + +		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; +		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; +		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; + +		/*        (x)   ( 1)       ( 0)    */ +		/* ray:   (y) = (ay) * x + (by)    */ +		/*        (z)   (az)       (bz)    */ + +		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); +		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + + +		float fVal = 0.0f; + +		float f0 = startSlice + 0.5f; +		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; +		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + +		for (int s = startSlice; s < endSlice; ++s) +		{ +			fVal += c.tex(tex, f0, f1, f2); + +			f0 += 1.0f; +			f1 += a1; +			f2 += a2; +		} + +		fV += fVal; + +		} +		} + +		fV *= fDistCorr; +		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim); +	} +} + + +__device__ float dirWeights(float fX, float fN) { +	if (fX <= -0.5f) // outside image on left +		return 0.0f; +	if (fX <= 0.5f) // half outside image on left +		return (fX + 0.5f) * (fX + 0.5f); +	if (fX <= fN - 0.5f) { // inside image +		float t = fX + 0.5f - floorf(fX + 0.5f); +		return t*t + (1-t)*(1-t); +	} +	if (fX <= fN + 0.5f) // half outside image on right +		return (fN + 0.5f - fX) * (fN + 0.5f - fX); +	return 0.0f; // outside image on right +} + +template<class COORD> +__global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch, +                                  unsigned int startSlice, +                                  unsigned int startAngle, unsigned int endAngle, +                                  const SDimensions3D dims, +                                  SCALE_NONCUBE sc) +{ +	COORD c; + +	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; +	if (angle >= endAngle) +		return; + +	const float fRayX = gC_RayX[angle]; +	const float fRayY = gC_RayY[angle]; +	const float fRayZ = gC_RayZ[angle]; +	const float fDetUX = gC_DetUX[angle]; +	const float fDetUY = gC_DetUY[angle]; +	const float fDetUZ = gC_DetUZ[angle]; +	const float fDetVX = gC_DetVX[angle]; +	const float fDetVY = gC_DetVY[angle]; +	const float fDetVZ = gC_DetVZ[angle]; +	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; +	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; +	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + +	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); +	const float fDistCorr = sc.scale(a1, a2); + + +	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; +	if (detectorU >= dims.iProjU) +		return; +	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; +	int endDetectorV = startDetectorV + g_detBlockV; +	if (endDetectorV > dims.iProjV) +		endDetectorV = dims.iProjV; + +	int endSlice = startSlice + g_blockSlices; +	if (endSlice > c.nSlices(dims)) +		endSlice = c.nSlices(dims); + +	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) +	{ +		/* Trace ray in direction Ray to (detectorU,detectorV) from  */ +		/* X = startSlice to X = endSlice                            */ + +		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; +		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; +		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + +		/*        (x)   ( 1)       ( 0)    */ +		/* ray:   (y) = (ay) * x + (by)    */ +		/*        (z)   (az)       (bz)    */ + +		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); +		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + +		float fVal = 0.0f; + +		float f0 = startSlice + 0.5f; +		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; +		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + +		for (int s = startSlice; s < endSlice; ++s) +		{ +			fVal += dirWeights(f1, c.nDim1(dims)) * dirWeights(f2, c.nDim2(dims)); +			f0 += 1.0f; +			f1 += a1; +			f2 += a2; +		} + +		fVal *= fDistCorr * fDistCorr; +		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; +	} +} + +// Supersampling version +// TODO + + +bool Par3DFP_Array_internal(cudaPitchedPtr D_projData, +                   cudaTextureObject_t D_texObj, +                   const SDimensions3D& dims, +                   unsigned int angleCount, const SPar3DProjection* angles, +                   const SProjectorParams3D& params) +{ +	if (!transferConstants(angles, angleCount)) +		return false; + +	std::list<cudaStream_t> streams; +	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + +	// Run over all angles, grouping them into groups of the same +	// orientation (roughly horizontal vs. roughly vertical). +	// Start a stream of grids for each such group. + +	unsigned int blockStart = 0; +	unsigned int blockEnd = 0; +	int blockDirection = 0; + +	bool cube = true; +	if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001) +		cube = false; +	if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001) +		cube = false; + +	SCALE_CUBE scube; +	scube.fOutputScale = params.fOutputScale * params.fVolScaleX; + +	SCALE_NONCUBE snoncubeX; +	float fS1 = params.fVolScaleY / params.fVolScaleX; +	snoncubeX.fScale1 = fS1 * fS1; +	float fS2 = params.fVolScaleZ / params.fVolScaleX; +	snoncubeX.fScale2 = fS2 * fS2; +	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + +	SCALE_NONCUBE snoncubeY; +	fS1 = params.fVolScaleX / params.fVolScaleY; +	snoncubeY.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleY / params.fVolScaleY; +	snoncubeY.fScale2 = fS2 * fS2; +	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + +	SCALE_NONCUBE snoncubeZ; +	fS1 = params.fVolScaleX / params.fVolScaleZ; +	snoncubeZ.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleY / params.fVolScaleZ; +	snoncubeZ.fScale2 = fS2 * fS2; +	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + +	// timeval t; +	// tic(t); + +	for (unsigned int a = 0; a <= angleCount; ++a) { +		int dir = -1; +		if (a != angleCount) { +			float dX = fabsf(angles[a].fRayX); +			float dY = fabsf(angles[a].fRayY); +			float dZ = fabsf(angles[a].fRayZ); + +			if (dX >= dY && dX >= dZ) +				dir = 0; +			else if (dY >= dX && dY >= dZ) +				dir = 1; +			else +				dir = 2; +		} + +		if (a == angleCount || dir != blockDirection) { +			// block done + +			blockEnd = a; +			if (blockStart != blockEnd) { + +				dim3 dimGrid( +				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); +				// TODO: consider limiting number of handle (chaotic) geoms +				//       with many alternating directions +				cudaStream_t stream; +				cudaStreamCreate(&stream); +				streams.push_back(stream); + +				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + +				if (blockDirection == 0) { +					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +								if (cube) +										par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); +								else +										par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeX); +						else +							par3D_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX); +				} else if (blockDirection == 1) { +					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +								if (cube) +										par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, scube); +								else +										par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeY); +						else +							par3D_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY); +				} else if (blockDirection == 2) { +					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +								if (cube) +										par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, scube); +								else +										par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeZ); +						else +							par3D_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ); +				} + +			} + +			blockDirection = dir; +			blockStart = a; +		} +	} + +	bool ok = true; + +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp"); +		cudaStreamDestroy(*iter); +	} + +	// printf("%f\n", toc(t)); + +	return ok; +} + +bool Par3DFP(cudaPitchedPtr D_volumeData, +             cudaPitchedPtr D_projData, +             const SDimensions3D& dims, const SPar3DProjection* angles, +             const SProjectorParams3D& params) +{ + +	// transfer volume to array +	cudaArray* cuArray = allocateVolumeArray(dims); +	transferVolumeToArray(D_volumeData, cuArray, dims); + +	cudaTextureObject_t D_texObj; +	if (!createTextureObject3D(cuArray, D_texObj)) { +		cudaFreeArray(cuArray); +		return false; +	} + +	bool ret; + +	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { +		unsigned int iEndAngle = iAngle + g_MaxAngles; +		if (iEndAngle >= dims.iProjAngles) +			iEndAngle = dims.iProjAngles; + +		cudaPitchedPtr D_subprojData = D_projData; +		D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch; + +		ret = Par3DFP_Array_internal(D_subprojData, D_texObj, +		                             dims, iEndAngle - iAngle, angles + iAngle, +		                             params); +		if (!ret) +			break; +	} + +	cudaFreeArray(cuArray); + +	cudaDestroyTextureObject(D_texObj); + +	return ret; +} + + + +bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, +                    cudaPitchedPtr D_projData, +                    const SDimensions3D& dims, const SPar3DProjection* angles, +                    const SProjectorParams3D& params) +{ +	// transfer angles to constant memory +	float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + +	TRANSFER_TO_CONSTANT(RayX); +	TRANSFER_TO_CONSTANT(RayY); +	TRANSFER_TO_CONSTANT(RayZ); +	TRANSFER_TO_CONSTANT(DetSX); +	TRANSFER_TO_CONSTANT(DetSY); +	TRANSFER_TO_CONSTANT(DetSZ); +	TRANSFER_TO_CONSTANT(DetUX); +	TRANSFER_TO_CONSTANT(DetUY); +	TRANSFER_TO_CONSTANT(DetUZ); +	TRANSFER_TO_CONSTANT(DetVX); +	TRANSFER_TO_CONSTANT(DetVY); +	TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + +	delete[] tmp; + +	std::list<cudaStream_t> streams; +	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + +	// Run over all angles, grouping them into groups of the same +	// orientation (roughly horizontal vs. roughly vertical). +	// Start a stream of grids for each such group. + +	unsigned int blockStart = 0; +	unsigned int blockEnd = 0; +	int blockDirection = 0; + +	SCALE_NONCUBE snoncubeX; +	float fS1 = params.fVolScaleY / params.fVolScaleX; +	snoncubeX.fScale1 = fS1 * fS1; +	float fS2 = params.fVolScaleZ / params.fVolScaleX; +	snoncubeX.fScale2 = fS2 * fS2; +	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + +	SCALE_NONCUBE snoncubeY; +	fS1 = params.fVolScaleX / params.fVolScaleY; +	snoncubeY.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleY / params.fVolScaleY; +	snoncubeY.fScale2 = fS2 * fS2; +	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + +	SCALE_NONCUBE snoncubeZ; +	fS1 = params.fVolScaleX / params.fVolScaleZ; +	snoncubeZ.fScale1 = fS1 * fS1; +	fS2 = params.fVolScaleY / params.fVolScaleZ; +	snoncubeZ.fScale2 = fS2 * fS2; +	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + + +	// timeval t; +	// tic(t); + +	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { +		int dir; +		if (a != dims.iProjAngles) { +			float dX = fabsf(angles[a].fRayX); +			float dY = fabsf(angles[a].fRayY); +			float dZ = fabsf(angles[a].fRayZ); + +			if (dX >= dY && dX >= dZ) +				dir = 0; +			else if (dY >= dX && dY >= dZ) +				dir = 1; +			else +				dir = 2; +		} + +		if (a == dims.iProjAngles || dir != blockDirection) { +			// block done + +			blockEnd = a; +			if (blockStart != blockEnd) { + +				dim3 dimGrid( +				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); +				// TODO: check if we can't immediately +				//       destroy the stream after use +				cudaStream_t stream; +				cudaStreamCreate(&stream); +				streams.push_back(stream); + +				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + +				if (blockDirection == 0) { +					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							par3D_FP_SumSqW_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeX); +						else +#if 0 +							par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else +							assert(false); +#endif +				} else if (blockDirection == 1) { +					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							par3D_FP_SumSqW_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeY); +						else +#if 0 +							par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else +							assert(false); +#endif +				} else if (blockDirection == 2) { +					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) +						if (params.iRaysPerDetDim == 1) +							par3D_FP_SumSqW_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeZ); +						else +#if 0 +							par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else +							assert(false); +#endif +				} + +			} + +			blockDirection = dir; +			blockStart = a; +		} +	} + +	bool ok = true; + +	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { +		ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW"); +		cudaStreamDestroy(*iter); +	} + +	// printf("%f\n", toc(t)); + +	return ok; +} + + + + + + + +} diff --git a/patches/astra-toolbox-approximate-projectors/rounding.h b/patches/astra-toolbox-approximate-projectors/rounding.h new file mode 100644 index 0000000..c1cbffb --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/rounding.h @@ -0,0 +1,50 @@ +#include <cuda_fp16.h> + +#define precision 8 +#define approximate_interpolation + +#ifdef approximate_interpolation +# ifdef precision +#  if precision == 16 +#   define texto(v) __float2half(v) +#   define texfrom(v) __half2float(v) +#   define textype half +#   define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +#   define textocheck(var,msg,val) var=texto(val); +#  else +#   define precision_mult ((1<<precision)-1) +#   define texto(v) ((unsigned char)(precision_mult*v)) +#   define texfrom(v) (1.f * v / precision_mult) +#   define textype unsigned +#   define interpolate(v0, v1, pos) (v0 + ((unsigned)(256 * pos)) * (v1 - v0) / 256) +#   define textocheck(var, msg, val) \ +	if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \ +	var=texto(val); +#  endif +# else +#  define texto(v) (v) +#  define texfrom(v) (v) +#  define textype float +#  define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +#  define textocheck(var,msg,val) var=texto(val); +# endif +#else +# ifdef precision +#  if precision == 16 +#   define texto(v) __half2float(__float2half(v)) +#   define textocheck(var,msg,val) var=texto(val); +#  else +#   define precision_mult ((1<<precision)-1) +#   define texto(v) (floor(precision_mult*v)/precision_mult) +#   define textocheck(var, msg, val) \ +	if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \ +	var=texto(val); +#  endif +# else +#  define texto(v) (v) +#  define textocheck(var,msg,val) var=texto(val); +# endif +# define texfrom(v) (v) +# define textype float +# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +#endif diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt new file mode 100644 index 0000000..893abc5 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt @@ -0,0 +1,141 @@ +#   Copyright 2018 Edoardo Pasca +#cmake_minimum_required (VERSION 3.0) + +project(RGL_core) +#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py + +# The version number. + +set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE) + +# conda orchestrated build +message("CIL_VERSION ${CIL_VERSION}") +#include (GenerateExportHeader) + +## Build the regularisers package as a library +message("Creating Regularisers as a shared library") + +set(CMAKE_BUILD_TYPE "Release") + +set (EXTRA_LIBRARIES "") +if(WIN32) +  set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp") +  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib") +  message("library lib: ${LIBRARY_LIB}") +elseif(APPLE) +  set (FLAGS "-DCCPiReconstructionIterative_EXPORTS ") +elseif(UNIX) +   set (FLAGS "-O3 -funsigned-char -Wall  -Wl,--no-undefined  -DCCPiReconstructionIterative_EXPORTS") +   set(EXTRA_LIBRARIES "m") +endif() + +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + +  set (CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}") +  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}") +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") + + +## Build the regularisers package as a library +message("Adding regularisers as a shared library") + +add_library(cilreg SHARED +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c +	    ) +target_link_libraries(cilreg ${OpenMP_EXE_LINKER_FLAGS} ${EXTRA_LIBRARIES}) +#set (CMAKE_SHARED_LINKER_FLAGS "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") +#target_link_options(cilreg PUBLIC  +include_directories(cilreg PUBLIC +                      ${LIBRARY_INC}/include +					  ${CMAKE_CURRENT_SOURCE_DIR} +		              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ ) + +## Install + +if (UNIX) +message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +install(TARGETS cilreg +	LIBRARY DESTINATION lib +	CONFIGURATIONS ${CMAKE_BUILD_TYPE} +	) +elseif(WIN32) +message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") +  install(TARGETS cilreg +	RUNTIME DESTINATION bin +	ARCHIVE DESTINATION lib +	CONFIGURATIONS ${CMAKE_BUILD_TYPE} +	) +endif() + + + +# GPU Regularisers +if (BUILD_CUDA) +    find_package(CUDA) +    if (CUDA_FOUND) +      set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES") +      message("CUDA FLAGS ${CUDA_NVCC_FLAGS}") +      CUDA_ADD_LIBRARY(cilregcuda SHARED +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu +      ) +      if (UNIX) +        message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +        install(TARGETS cilregcuda +        LIBRARY DESTINATION lib +        CONFIGURATIONS ${CMAKE_BUILD_TYPE} +        ) +      elseif(WIN32) +        message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") +        install(TARGETS cilregcuda +        RUNTIME DESTINATION bin +        ARCHIVE DESTINATION lib +        CONFIGURATIONS ${CMAKE_BUILD_TYPE} +        ) +      endif() +    else() +      message("CUDA NOT FOUND") +    endif() +endif() + +if (${BUILD_MATLAB_WRAPPER}) +  if (WIN32) +        install(TARGETS cilreg DESTINATION ${MATLAB_DEST}) +        if (CUDA_FOUND) +            install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST}) +        endif() +  endif() +endif() diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c new file mode 100644 index 0000000..91fd746 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c @@ -0,0 +1,266 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2019 Daniil Kazantsev + * Copyright 2019 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "FGP_TV_core.h" + +/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ) +{ +    int ll; +    long j, DimTotal; +    float re, re1; +    re = 0.0f; re1 = 0.0f; +    float tk = 1.0f; +    float tkp1 =1.0f; +    int count = 0; + +    if (dimZ <= 1) { +        /*2D case */ +        float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; +        DimTotal = (long)(dimX*dimY); + +        if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); +        P1 = calloc(DimTotal, sizeof(float)); +        P2 = calloc(DimTotal, sizeof(float)); +        P1_prev = calloc(DimTotal, sizeof(float)); +        P2_prev = calloc(DimTotal, sizeof(float)); +        R1 = calloc(DimTotal, sizeof(float)); +        R2 = calloc(DimTotal, sizeof(float)); + +        /* begin iterations */ +        for(ll=0; ll<iterationsNumb; ll++) { + +            if ((epsil != 0.0f)  && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); +            /* computing the gradient of the objective function */ +            Obj_func2D(Input, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); + +            /* apply nonnegativity */ +            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + +            /*Taking a step towards minus of the gradient*/ +            Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); + +            /* projection step */ +            Proj_func2D(P1, P2, methodTV, DimTotal); + +            /*updating R and t*/ +            tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; +            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal); + +            /*storing old values*/ +            copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), 1l); +            copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), 1l); +            tk = tkp1; + +            /* check early stopping criteria */ +            if ((epsil != 0.0f)  && (ll % 5 == 0)) { +                re = 0.0f; re1 = 0.0f; +                for(j=0; j<DimTotal; j++) +                { +                    re += powf(Output[j] - Output_prev[j],2); +                    re1 += powf(Output[j],2); +                } +                re = sqrtf(re)/sqrtf(re1); +                if (re < epsil)  count++; +                if (count > 3) break; +            } +        } +        if (epsil != 0.0f) free(Output_prev); +        free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); +    } +    else { +        /*3D case*/ +        float *P1=NULL, *P2=NULL, *P3=NULL, *R1=NULL, *R2=NULL, *R3=NULL; +        DimTotal = (long)dimX*(long)dimY*(long)dimZ; + +        P1 = calloc(DimTotal, sizeof(float)); +        P2 = calloc(DimTotal, sizeof(float)); +        P3 = calloc(DimTotal, sizeof(float)); + +        R1 = calloc(DimTotal, sizeof(float)); +        R2 = calloc(DimTotal, sizeof(float)); +        R3 = calloc(DimTotal, sizeof(float)); + +        /* begin iterations */ +        for(ll=0; ll<iterationsNumb; ll++) { +            /* computing the gradient of the objective function */ +            re = Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, nonneg, (long)(dimX), (long)(dimY), (long)(dimZ)); + +            tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; +	    All_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, tkp1, tk, (long)(dimX), (long)(dimY), (long)(dimZ)); + +            // calculate norm - stopping rules +            if ((epsil != 0.0f)  && (ll % 5 == 0)) { +                // stop if the norm residual is less than the tolerance EPS +                if (re < epsil)  count++; +                if (count > 3) break; +            } + +            tk = tkp1; +        } + +        free(P1); free(P2); free(P3); free(R1); free(R2); free(R3); +//	printf("TV iters %i (of %i)\n", ll, iterationsNumb); +    } + +    /*adding info into info_vector */ +    infovector[0] = (float)(ll);  /*iterations number (if stopped earlier based on tolerance)*/ +    infovector[1] = re;  /* reached tolerance */ + +    return 0; +} + +float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY) +{ +    float val1, val2; +    long i,j,index; +#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2) +    for(j=0; j<dimY; j++) { +        for(i=0; i<dimX; i++) { +            index = j*dimX+i; +            /* boundary conditions  */ +            if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];} +            if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];} +            D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2); +        }} +    return *D; +} +float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda,  long dimX, long dimY) +{ +    float val1, val2, multip; +    long i,j,index; +    multip = (1.0f/(8.0f*lambda)); +#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2) +    for(j=0; j<dimY; j++) { +        for(i=0; i<dimX; i++) { +            index = j*dimX+i; +            /* boundary conditions */ +            if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)]; +            if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i]; +            P1[index] = R1[index] + multip*val1; +            P2[index] = R2[index] + multip*val2; +        }} +    return 1; +} +float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal) +{ +    long i; +    float multip; +    multip = ((tk-1.0f)/tkp1); +#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i) +    for(i=0; i<DimTotal; i++) { +        R1[i] = P1[i] + multip*(P1[i] - P1_old[i]); +        R2[i] = P2[i] + multip*(P2[i] - P2_old[i]); +    } +    return 1; +} + +    // too much threads poisoning caches. there is a sweet spot +#define n_threads 16 + +/* 3D-case related Functions */ +/*****************************************************************/ +float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ) +{ +    float D_new; +    float val1, val2, val3; +    float re = 0.0f, re1 = 0.0f; +    long i,j,k,index; + +#pragma omp parallel for reduction(+ : re, re1) shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,D_new) num_threads(n_threads) +    for(k=0; k<dimZ; k++) { +        for(j=0; j<dimY; j++) { +            for(i=0; i<dimX; i++) { +                index = (dimX*dimY)*k + j*dimX+i; +                /* boundary conditions */ +                val1 = (i == 0)?0.f:R1[(dimX*dimY)*k + j*dimX + (i-1)]; +                val2 = (j == 0)?0.f:R2[(dimX*dimY)*k + (j-1)*dimX + i]; +                val3 = (k == 0)?0.f:R3[(dimX*dimY)*(k-1) + j*dimX + i]; + +                D_new = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); +		if ((nonneg == 1)&&(D_new < 0.0f)) D_new = 0.0f; + +		re += powf(D_new - D[index], 2); +		re1 += powf(D_new, 2); +		D[index] = D_new; +            }}} +	 +    return sqrtf(re)/sqrtf(re1); +} + +float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ) +{ +    float val1, val2, val3, denom, sq_denom; +    float P1_new, P2_new, P3_new; +    long i,j,k, index; +    float multip = (1.0f/(26.0f*lambda)); +    float multip2 = ((tk-1.0f)/tkp1); + +#pragma omp parallel for shared(P1_old,P2_old,P3_old,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,denom,sq_denom,multip,multip2,P1_new,P2_new,P3_new) num_threads(n_threads) +    for(k=0; k<dimZ; k++) { +        for(j=0; j<dimY; j++) { +            for(i=0; i<dimX; i++) { +                index = (dimX*dimY)*k + j*dimX+i; + +                /* boundary conditions */ +                val1 = (i == dimX-1)?0.0f: D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)]; +                val2 = (j == dimY-1)?0.0f: D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i]; +                val3 = (k == dimZ-1)?0.0f: D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i]; + +                P1_new = R1[index] + multip*val1; +                P2_new = R2[index] + multip*val2; +                P3_new = R3[index] + multip*val3; + +		denom = powf(P1_new,2) + powf(P2_new,2) + powf(P3_new,2); +		if (denom > 1.0f) { +		    sq_denom = 1.0f/sqrtf(denom); +		    P1_new *= sq_denom; +		    P2_new *= sq_denom; +		    P3_new *= sq_denom; +        	} + +		R1[index] = P1_new + multip2*(P1_new - P1_old[index]); +		R2[index] = P2_new + multip2*(P2_new - P2_old[index]); +		R3[index] = P3_new + multip2*(P3_new - P3_old[index]); +		 +		P1_old[index] = P1_new; +		P2_old[index] = P2_new; +		P3_old[index] = P3_new; +            }}} + + +    return 1; +} diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h new file mode 100644 index 0000000..cda0f40 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h @@ -0,0 +1,62 @@ +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//#include <matrix.h> +#include <math.h> +#include <stdlib.h> +#include <memory.h> +#include <stdio.h> +#include "omp.h" +#include "utils.h" +#include "CCPiDefines.h" + +/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +#ifdef __cplusplus +extern "C" { +#endif +CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); + +CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); +CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); +CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal); + +CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ); +CCPI_EXPORT float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ); + +#ifdef __cplusplus +} +#endif diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt new file mode 100644 index 0000000..1a3b37a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt @@ -0,0 +1,169 @@ +#   Copyright 2018 Edoardo Pasca +#cmake_minimum_required (VERSION 3.0) + +project(RGL_core) +#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py + +# The version number. + +set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE) + +# conda orchestrated build +message("CIL_VERSION ${CIL_VERSION}") +#include (GenerateExportHeader) + +## Build the regularisers package as a library +message("Creating Regularisers as a shared library") + +find_package(GLIB2 REQUIRED) +find_package(OpenMP REQUIRED) +if (OPENMP_FOUND) +    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +   set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +   set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}") +message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}") +message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") + +set(CMAKE_BUILD_TYPE "Release") + +if(WIN32) +  set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp") +  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") +  set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") +  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib") + +  set (EXTRA_LIBRARIES) + +  message("library lib: ${LIBRARY_LIB}") + +elseif(UNIX) +   set (FLAGS "-O2 -funsigned-char -Wall  -Wl,--no-undefined  -DCCPiReconstructionIterative_EXPORTS ") +   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") +   set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + +   set (EXTRA_LIBRARIES +		"gomp" +		"m" +		) + +endif() +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") + +## Build the regularisers package as a library +message("Adding regularisers as a shared library") + +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC") +add_library(cilreg SHARED +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c +            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c +	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c +	    ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} ) +include_directories(cilreg PUBLIC +                      ${GLIB2_INCLUDE_DIR} +                      ${LIBRARY_INC}/include +					  ${CMAKE_CURRENT_SOURCE_DIR} +		              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ +		              ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/  ) + +## Install + +if (UNIX) +message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +install(TARGETS cilreg +	LIBRARY DESTINATION lib +	CONFIGURATIONS ${CMAKE_BUILD_TYPE} +	) +elseif(WIN32) +message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") +  install(TARGETS cilreg +	RUNTIME DESTINATION bin +	ARCHIVE DESTINATION lib +	CONFIGURATIONS ${CMAKE_BUILD_TYPE} +	) +endif() + + + +# GPU Regularisers +if (BUILD_CUDA) +    find_package(CUDA) +    if (CUDA_FOUND) +      set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES -allow-unsupported-compiler") +      message("CUDA FLAGS ${CUDA_NVCC_FLAGS}") +      CUDA_ADD_LIBRARY(cilregcuda SHARED +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu +        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu +      ) +      if (UNIX) +        message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +        install(TARGETS cilregcuda +        LIBRARY DESTINATION lib +        CONFIGURATIONS ${CMAKE_BUILD_TYPE} +        ) +      elseif(WIN32) +        message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") +        install(TARGETS cilregcuda +        RUNTIME DESTINATION bin +        ARCHIVE DESTINATION lib +        CONFIGURATIONS ${CMAKE_BUILD_TYPE} +        ) +      endif() +    else() +      message("CUDA NOT FOUND") +    endif() +endif() + +if (${BUILD_MATLAB_WRAPPER}) +  if (WIN32) +        install(TARGETS cilreg DESTINATION ${MATLAB_DEST}) +        if (CUDA_FOUND) +            install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST}) +        endif() +  endif() +endif() diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c new file mode 100755 index 0000000..cbce96a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c @@ -0,0 +1,668 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef _Float16 floatxx;	// Large arrays, allways float16 if we go mixed-precision. +//typedef _Float16 floatyy;	// Small arrays which we can do both ways. +typedef float floatyy; + +typedef struct { +    int offY, stepY, copY; +    floatxx *Input, *u, *qx, *qy, *gradx, *grady, *div; +    floatyy *div0, *udiff0; +    floatyy *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->div); +     +    free(ctx->div0); +    free(ctx->udiff0); + +    free(ctx->gradxdiff);  +    free(ctx->gradydiff); +    free(ctx->ubarx); +    free(ctx->ubary); +    free(ctx->udiff); + +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); +    long DimRow = (long)(dimX * padZ); +    long DimCell = (long)(padZ); + +    // Auxiliar vectors +    ctx->Input = memalign(64, Dim1Total * sizeof(floatxx)); +    ctx->u = memalign(64, Dim1Total * sizeof(floatxx)); +    ctx->qx = memalign(64, DimTotal * sizeof(floatxx)); +    ctx->qy = memalign(64, DimTotal * sizeof(floatxx)); +    ctx->gradx = memalign(64, DimTotal * sizeof(floatxx)); +    ctx->grady = memalign(64, DimTotal * sizeof(floatxx)); +    ctx->div = memalign(64, Dim1Total * sizeof(floatxx)); + +    ctx->div0 = memalign(64, DimRow * sizeof(floatyy)); +    ctx->udiff0 = memalign(64, DimRow * sizeof(floatyy)); + +    ctx->gradxdiff = memalign(64, DimCell * sizeof(floatyy)); +    ctx->gradydiff = memalign(64, DimCell * sizeof(floatyy)); +    ctx->ubarx = memalign(64, DimCell * sizeof(floatyy)); +    ctx->ubary = memalign(64, DimCell * sizeof(floatyy)); +    ctx->udiff = memalign(64, DimCell * sizeof(floatyy)); +     +    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); +    memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); +    memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); +    memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    floatxx *Input = ctx->Input; +    floatxx *u = ctx->u; +    floatxx *qx = ctx->qx; +    floatxx *qy = ctx->qy; +    floatxx *gradx = ctx->gradx; +    floatxx *grady = ctx->grady; +    floatxx *div = ctx->div; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual1 = 0.0f; +    float resdual2 = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    floatyy qxdiff; +    floatyy qydiff; +    floatyy divdiff; +    floatyy *gradxdiff = ctx->gradxdiff; +    floatyy *gradydiff = ctx->gradydiff; +    floatyy *ubarx = ctx->ubarx; +    floatyy *ubary = ctx->ubary; +    floatyy *udiff = ctx->udiff; + +    floatyy *udiff0 = ctx->udiff0; +    floatyy *div0 = ctx->div0; + + +    j = 0; { +#       define TNV_LOOP_FIRST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_FIRST_J +    } + + + +    for(int j = 1; j < (copY - 1); j++) { +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +    } + +    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { +        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { +            for(int j2 = 0; j2 < BLOCK; j2 ++) { +                j = j1 + j2; +                for(int i2 = 0; i2 < BLOCK; i2++) { +                    i = i1 + i2; +                     +                    if (i == (dimX - 1)) break; +                    if (j == (copY - 1)) { j2 = BLOCK; break; } +#           include "TNV_core_loop.h" +                }    +            } +        } // i + +    } + +    for(int j = 1; j < (copY - 1); j++) { +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +    } + + + +    for (j = copY - 1; j < stepY; j++) { +#       define TNV_LOOP_LAST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_LAST_J +    } + + + +    ctx->resprimal = resprimal; +    ctx->resdual = resdual1 + resdual2; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = dimZ; +//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); +    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    int started = 0; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = (ctx0->stepY - 1) * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    floatyy divdiff = ctx->div0[l] - ctx->div[l]; +                    floatyy udiff = ctx->udiff0[l]; + +                    ctx->div[l] -= ctx0->qy[l + m]; +                    ctx0->div[m + l + dimX*padZ] = ctx->div[l]; +                    ctx0->u[m + l + dimX*padZ] = ctx->u[l]; +                     +                    divdiff += ctx0->qy[l + m]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    floatyy divdiff = ctx->div0[l] - ctx->div[l]; +                    floatyy udiff = ctx->udiff0[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +             +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            if (started) { +                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); +            } else { +                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +                if (!err) err = hw_sched_wait_task(sched); +                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +            } +        } else { +            started = 1; +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +//    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h new file mode 100644 index 0000000..aa050a4 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h @@ -0,0 +1,47 @@ +#include <math.h> +#include <stdlib.h> +#include <memory.h> +#include <stdio.h> +#include "omp.h" +#include "utils.h" +#include "CCPiDefines.h" + +#define fTiny 0.00000001f +#define fLarge 100000000.0f +#define INFNORM -1 + +#define MAX(i,j) ((i)<(j) ? (j):(i)) +#define MIN(i,j) ((i)<(j) ? (i):(j)) + +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" { +#endif +CCPI_EXPORT float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ); + +/*float PDHG(float *A, float *B, float tau, float sigma, float theta, float lambda, int p, int q, int r, float tol, int maxIter, int d_c, int d_w, int d_h);*/ +CCPI_EXPORT float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ); +CCPI_EXPORT float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ); +CCPI_EXPORT float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ); +CCPI_EXPORT float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ); +#ifdef __cplusplus +} +#endif
\ No newline at end of file diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h new file mode 100644 index 0000000..86299cd --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h @@ -0,0 +1,107 @@ +    { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +            floatxx *__restrict u_next = u + l + padZ; +            floatxx *__restrict u_current = u + l; +            floatxx *__restrict u_next_row = u + l + m; + + +            floatxx *__restrict qx_current = qx + l; +            floatxx *__restrict qy_current = qy + l; +            floatxx *__restrict qx_prev = qx + l - padZ; +            floatxx *__restrict qy_prev = qy + l - m; + + +//  __assume(padZ%16==0); + +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads +                udiff[k] = u[l + k] - u_upd; // cache 1w +                u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J +                udiff0[l + k] = udiff[k]; +                div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I +                floatyy gradx_upd = 0; +#else +                floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads +                floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J +                floatyy grady_upd = 0; +#else +                floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads +                floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + +                gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w +                gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w +                gradx[l + k] = gradx_upd; // 1 write +                grady[l + k] = grady_upd; // 1 write +                 +                ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w +                ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + +                floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read +                floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < padZ; k++) { +                floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r +                floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r +                floatyy gx_upd = vx * t[0] + vy * t[1]; +                floatyy gy_upd = vx * t[1] + vy * t[2]; + +                qxdiff = sigma * (ubarx[k] - gx_upd); +                qydiff = sigma * (ubary[k] - gy_upd); +                 +                qx_current[k] += qxdiff; // write 1 +                qy_current[k] += qydiff; // write 1 + +                unorm += (udiff[k] * udiff[k]); +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                floatyy div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I +                div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J +                div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I +                div_upd += qx_current[k];  +#endif +#ifndef TNV_LOOP_LAST_J +                div_upd += qy_current[k];  +#endif + +                divdiff = div[l + k] - div_upd;  // 1 read +                div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J +                resprimal += fabs(divtau * udiff[k] + divdiff);  +#endif + +                    // We need to have two independent accumulators to allow gcc-autovectorization +                resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r +                resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r +                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); +            } +    } +    
\ No newline at end of file diff --git a/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch new file mode 100644 index 0000000..cc53b1d --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch @@ -0,0 +1,12 @@ +diff -dPNur CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c +--- CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c	2021-12-17 12:17:12.000000000 +0100 ++++ CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c	2022-09-06 18:52:32.460225737 +0200 +@@ -104,7 +104,7 @@ +     else { +         /*3D case*/ +         float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; +-        DimTotal = (long)(dimX*dimY*dimZ); ++        DimTotal = (long)dimX*(long)dimY*(long)dimZ; +  +         if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); +         P1 = calloc(DimTotal, sizeof(float)); diff --git a/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch new file mode 100644 index 0000000..09b5f51 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch @@ -0,0 +1,19 @@ +--- a/src/Core/regularisers_CPU/utils.c ++++ b/src/Core/regularisers_CPU/utils.c +@@ -176,6 +176,7 @@ float Proj_func2D(float *P1, float *P2, int methTV, long DimTotal) +     } +     return 1; + } ++ + /*3D Projection onto convex set for P (called in PD_TV, FGP_TV, FGP_dTV methods)*/ + float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) + { +@@ -183,7 +184,7 @@ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) +     long i; +     if (methTV == 0) { +         /* isotropic TV*/ +-#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom) ++#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,denom,sq_denom) +         for(i=0; i<DimTotal; i++) { +             denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2); +             if (denom > 1.0f) { | 
