Refactor rtcensus into separate algorithm components

6587787a · Nicolas Pope · a2851649 · 6587787a · 6587787a · 6587787a
Commit 6587787a authored 5 years ago by Nicolas Pope
--- a/cv-node/CMakeLists.txt
+++ b/cv-node/CMakeLists.txt
@@ -88,7 +88,12 @@ if (LIBSGM_FOUND)
 endif (LIBSGM_FOUND)

 if (CUDA_FOUND)
-	list(APPEND CVNODESRC "src/algorithms/opencv_cuda_bm.cpp" "src/algorithms/opencv_cuda_bp.cpp" "src/algorithms/rtcensus.cu")
+	list(APPEND CVNODESRC
+		"src/algorithms/opencv_cuda_bm.cpp"
+		"src/algorithms/opencv_cuda_bp.cpp"
+		"src/algorithms/rtcensus.cu"
+		"src/algorithms/consistency.cu"
+		"src/algorithms/sparse_census.cu")
 endif (CUDA_FOUND)

 add_executable(cv-node ${CVNODESRC})

--- a/cv-node/include/ftl/cuda_algorithms.hpp
+++ b/cv-node/include/ftl/cuda_algorithms.hpp
+#ifndef _FTL_CUDA_ALGORITHMS_HPP_
+#define _FTL_CUDA_ALGORITHMS_HPP_
+
+#include <ftl/cuda_common.hpp>
+
+namespace ftl {
+namespace cuda {
+
+	void consistency(const TextureObject<float> &dl, const TextureObject<float> &dr,
+			TextureObject<float> &disp);
+			
+	void sparse_census(const TextureObject<uchar4> &l, const TextureObject<uchar4> &r,
+			TextureObject<uint2> &cl, TextureObject<uint2> &cr);
+
+}
+}
+
+#endif // _FTL_CUDA_ALGORITHMS_HPP_
+
--- a/cv-node/include/ftl/cuda_common.hpp
+++ b/cv-node/include/ftl/cuda_common.hpp
@@ -19,12 +19,14 @@ class TextureObject {
 	TextureObject() : texobj_(0), ptr_(nullptr) {};
 	TextureObject(const cv::cuda::PtrStepSz<T> &d);
 	TextureObject(T *ptr, int pitch, int width, int height);
-	TextureObject(int width, int height);
+	TextureObject(size_t width, size_t height);
 	TextureObject(const TextureObject &t);
 	~TextureObject();
 	
-	int getPitch();
+	int pitch() const { return pitch_; }
 	T *devicePtr() { return ptr_; };
+	int width() const { return width_; }
+	int height() const { return height_; }
 	cudaTextureObject_t cudaTexture() const { return texobj_; }
 	__device__ inline T tex2D(int u, int v) { return ::tex2D<T>(texobj_, u, v); }
 	__device__ inline T tex2D(float u, float v) { return ::tex2D<T>(texobj_, u, v); }
@@ -41,7 +43,7 @@ class TextureObject {
 	
 	private:
 	cudaTextureObject_t texobj_;
-	int pitch_;
+	size_t pitch_;
 	int width_;
 	int height_;
 	T *ptr_;
@@ -109,8 +111,8 @@ TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
 }

 template <typename T>
-TextureObject<T>::TextureObject(int width, int height) {
-	cudaMallocPitch(&ptr_,&pitch_,width*sizeof(T),height);
+TextureObject<T>::TextureObject(size_t width, size_t height) {
+	cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height);

 	cudaResourceDesc resDesc;
 	memset(&resDesc, 0, sizeof(resDesc));

--- a/cv-node/src/algorithms/consistency.cu
+++ b/cv-node/src/algorithms/consistency.cu
+#include <ftl/cuda_common.hpp>
+
+#define CONSISTENCY_THRESHOLD	1.0f
+
+/*
+ * Check for consistency between the LR and RL disparity maps, only selecting
+ * those that are similar. Otherwise it sets the disparity to NAN.
+ */
+__global__ void consistency_kernel(cudaTextureObject_t d_sub_l,
+		cudaTextureObject_t d_sub_r, float *disp, int w, int h, int pitch) {
+
+	// TODO This doesn't work at either edge (+-max_disparities)
+	for (STRIDE_Y(v,h)) {
+	for (STRIDE_X(u,w)) {
+		float a = (int)tex2D<float>(d_sub_l, u, v);
+		if (u-a < 0) {
+			disp[v*pitch+u] = NAN; // TODO Check
+			continue;
+		}
+		
+		auto b = tex2D<float>(d_sub_r, u-a, v);
+		
+		if (abs(a-b) <= CONSISTENCY_THRESHOLD) disp[v*pitch+u] = abs((a+b)/2);
+		else disp[v*pitch+u] = NAN;
+	}
+	}
+
+}
+
+namespace ftl {
+namespace cuda {
+	void consistency(const TextureObject<float> &dl, const TextureObject<float> &dr,
+			TextureObject<float> &disp) {
+		dim3 grid(1,1,1);
+    	dim3 threads(128, 1, 1);
+    	grid.x = cv::cuda::device::divUp(disp.width(), 128);
+		grid.y = cv::cuda::device::divUp(disp.height(), 11);
+		consistency_kernel<<<grid, threads>>>(
+			dl.cudaTexture(),
+			dr.cudaTexture(),
+			disp.devicePtr(),
+			disp.width(),
+			disp.height(),
+			disp.pitch() / sizeof(float));
+		cudaSafeCall( cudaGetLastError() );
+	}
+}
+}
+
--- a/cv-node/src/algorithms/rtcensus.cu
+++ b/cv-node/src/algorithms/rtcensus.cu
@@ -12,6 +12,7 @@
 */
 
 #include <ftl/cuda_common.hpp>
+#include <ftl/cuda_algorithms.hpp>

 using namespace cv::cuda;
 using namespace cv;
@@ -22,35 +23,12 @@ using namespace cv;
 #define RADIUS2 2
 #define ROWSperTHREAD 1

-#define XHI(P1,P2) ((P1 <= P2) ? 0 : 1)
-
 namespace ftl {
 namespace gpu {

 // --- SUPPORT -----------------------------------------------------------------


-
-/*
- * Sparse 16x16 census (so 8x8) creating a 64bit mask
- * (14) & (15), based upon (9)
- */
-__device__ uint64_t sparse_census(cudaTextureObject_t tex, int u, int v) {
-	uint64_t r = 0;
-
-	unsigned char t = tex2D<uchar4>(tex, u,v).z;
-
-	for (int m=-7; m<=7; m+=2) {
-		//auto start_ix = (v + m)*w + u;
-		for (int n=-7; n<=7; n+=2) {
-			r <<= 1;
-			r |= XHI(t, tex2D<uchar4>(tex, u+n, v+m).z);
-		}
-	}
-
-	return r;
-}
-
 /*
 * Parabolic interpolation between matched disparities either side.
 * Results in subpixel disparity. (20).
@@ -63,30 +41,6 @@ __device__ float fit_parabola(size_t pi, uint16_t p, uint16_t pl, uint16_t pr) {

 // --- KERNELS -----------------------------------------------------------------

-/*
- * Calculate census mask for left and right images together.
- */
-__global__ void census_kernel(cudaTextureObject_t l, cudaTextureObject_t r,
-		int w, int h, uint64_t *censusL, uint64_t *censusR,
-		size_t pL, size_t pR) {	
-	
-	int u = (blockIdx.x * BLOCK_W + threadIdx.x + RADIUS);
-	int v_start = blockIdx.y * ROWSperTHREAD + RADIUS;
-	int v_end = v_start + ROWSperTHREAD;
-	
-	if (v_end+RADIUS >= h) v_end = h-RADIUS;
-	if (u+RADIUS >= w) return;
-	
-	for (int v=v_start; v<v_end; v++) {
-		//int ix = (u + v*pL);
-		uint64_t cenL = sparse_census(l, u, v);
-		uint64_t cenR = sparse_census(r, u, v);
-		
-		censusL[(u + v*pL)] = cenL;
-		censusR[(u + v*pR)] = cenR;
-	}
-}
-
 /* Convert vector uint2 (32bit x2) into a single uint64_t */
 __forceinline__ __device__ uint64_t uint2asull (uint2 a) {
 	uint64_t res;
@@ -218,31 +172,6 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 	}
 }

-/*
- * Check for consistency between the LR and RL disparity maps, only selecting
- * those that are similar. Otherwise it sets the disparity to NAN.
- */
-__global__ void consistency_kernel(cudaTextureObject_t d_sub_l,
-		cudaTextureObject_t d_sub_r, float *disp, int w, int h, int pitch) {
-
-	// TODO This doesn't work at either edge (+-max_disparities)
-	for (STRIDE_Y(v,h)) {
-	for (STRIDE_X(u,w)) {
-		float a = (int)tex2D<float>(d_sub_l, u, v);
-		if (u-a < 0) {
-			//disp[v*pitch+u] = a;
-			continue;
-		}
-		
-		auto b = tex2D<float>(d_sub_r, u-a, v);
-		
-		if (abs(a-b) <= 1.0) disp[v*pitch+u] = abs((a+b)/2);
-		else disp[v*pitch+u] = NAN;
-	}
-	}
-
-}
-


 template <typename T>
@@ -394,68 +323,37 @@ ftl::cuda::TextureObject<float> prevDisp;
 ftl::cuda::TextureObject<uchar4> prevImage;

 void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) {
-	dim3 grid(1,1,1);
-    dim3 threads(BLOCK_W, 1, 1);
-
-	grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS, BLOCK_W);
-	grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS, ROWSperTHREAD);
-	
-	// TODO, reduce allocations
-	uint64_t *censusL;
-	uint64_t *censusR;
-	size_t pitchL;
-	size_t pitchR;
-
-	float *disp_l;
-	float *disp_r;
-	size_t pitchDL;
-	size_t pitchDR;
-
-	float *disp_raw;
-	size_t pitchD;
-	
-	cudaSafeCall( cudaMallocPitch(&censusL, &pitchL, l.cols*sizeof(uint64_t), l.rows) );
-	cudaSafeCall( cudaMallocPitch(&censusR, &pitchR, r.cols*sizeof(uint64_t), r.rows) );
-	
-	//cudaMemset(census, 0, sizeof(uint64_t)*l.cols*l.rows*2);
-	cudaSafeCall( cudaMallocPitch(&disp_l, &pitchDL, sizeof(float)*l.cols, l.rows) );
-	cudaSafeCall( cudaMallocPitch(&disp_r, &pitchDR, sizeof(float)*l.cols, l.rows) );
-
-	cudaSafeCall( cudaMallocPitch(&disp_raw, &pitchD, sizeof(float)*l.cols, l.rows) );
-	
-	cudaTextureDesc texDesc;
-	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.readMode = cudaReadModeElementType;
-  
+	// Make all the required texture steps
+	// TODO Could reduce re-allocations by caching these
 	ftl::cuda::TextureObject<uchar4> texLeft(l);
 	ftl::cuda::TextureObject<uchar4> texRight(r);
-
-	//size_t smem_size = (2 * l.cols * l.rows) * sizeof(uint64_t);
-	
-	// Calculate L and R census
-	census_kernel<<<grid, threads>>>(texLeft.cudaTexture(), texRight.cudaTexture(), l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t));
-	cudaSafeCall( cudaGetLastError() );
-	
-	//cudaSafeCall( cudaDeviceSynchronize() );
-  
-	ftl::cuda::TextureObject<uint2> censusTexLeft((uint2*)censusL, pitchL, l.cols, l.rows);
-	ftl::cuda::TextureObject<uint2> censusTexRight((uint2*)censusR, pitchR, r.cols, r.rows);
+	ftl::cuda::TextureObject<uint2> censusTexLeft(l.cols, l.rows);
+	ftl::cuda::TextureObject<uint2> censusTexRight(r.cols, r.rows);
+	ftl::cuda::TextureObject<float> dispTexLeft(l.cols, l.rows);
+	ftl::cuda::TextureObject<float> dispTexRight(r.cols, r.rows);
+	ftl::cuda::TextureObject<float> dispTex(r.cols, r.rows);
 	
+	// Calculate the census for left and right
+	ftl::cuda::sparse_census(texLeft, texRight, censusTexLeft, censusTexRight);
+
+	dim3 grid(1,1,1);
+    dim3 threads(BLOCK_W, 1, 1);
 	grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W);
 	grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD);
 	
 	// Calculate L and R disparities
-	disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), num_disp);
+	disp_kernel<<<grid, threads>>>(
+		dispTexLeft.devicePtr(), dispTexRight.devicePtr(),
+		dispTexLeft.pitch()/sizeof(float), dispTexRight.pitch()/sizeof(float),
+		l.cols, l.rows,
+		censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(),
+		num_disp);
 	cudaSafeCall( cudaGetLastError() );
-
-	ftl::cuda::TextureObject<float> dispTexLeft(disp_l, pitchDL, l.cols, l.rows);
-	ftl::cuda::TextureObject<float> dispTexRight(disp_r, pitchDR, r.cols, r.rows);
 	
 	// Check consistency between L and R disparities.
-	consistency_kernel<<<grid, threads>>>(dispTexLeft.cudaTexture(), dispTexRight.cudaTexture(), disp_raw, l.cols, l.rows, pitchD/sizeof(float));
-	cudaSafeCall( cudaGetLastError() );
+	consistency(dispTexLeft, dispTexRight, dispTex);

-	ftl::cuda::TextureObject<float> dispTex(disp_raw, pitchD, r.cols, r.rows);
+	

 	grid.x = 4;
 	grid.y = l.rows;
@@ -480,11 +378,6 @@ void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const
 	dispTexLeft.free();
 	dispTexRight.free();
 	dispTex.free();
-	
-	cudaFree(disp_r);
-	cudaFree(disp_l);
-	cudaFree(censusL);
-	cudaFree(censusR);
 }

 };

--- a/cv-node/src/algorithms/sparse_census.cu
+++ b/cv-node/src/algorithms/sparse_census.cu
+/*
+ * Author: Nicolas Pope and Sebastian Hahta (2019)
+ * Implementation of algorithm presented in article(s):
+ *
+ * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable
+ *     for embedded real-time systems
+ * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based
+ *     Stereo Matching Algorithm on Embedded and Multi-Core Hardware
+ *
+ * Equation numbering uses [1] unless otherwise stated
+ *
+ */
+
+#include <ftl/cuda_common.hpp>
+
+#define XHI(P1,P2) ((P1 <= P2) ? 0 : 1)
+
+/*
+ * Sparse 16x16 census (so 8x8) creating a 64bit mask
+ * (14) & (15), based upon (9)
+ */
+__device__ uint64_t _sparse_census(cudaTextureObject_t tex, int u, int v) {
+	uint64_t r = 0;
+
+	unsigned char t = tex2D<uchar4>(tex, u,v).z;
+
+	for (int m=-7; m<=7; m+=2) {
+		//auto start_ix = (v + m)*w + u;
+		for (int n=-7; n<=7; n+=2) {
+			r <<= 1;
+			r |= XHI(t, tex2D<uchar4>(tex, u+n, v+m).z);
+		}
+	}
+
+	return r;
+}
+
+/*
+ * Calculate census mask for left and right images together.
+ */
+__global__ void census_kernel(cudaTextureObject_t l, cudaTextureObject_t r,
+		int w, int h, uint64_t *censusL, uint64_t *censusR,
+		size_t pL, size_t pR) {	
+	
+	//if (v_end+RADIUS >= h) v_end = h-RADIUS;
+	//if (u+RADIUS >= w) return;
+	
+	for (STRIDE_Y(v,h)) {
+	for (STRIDE_X(u,w)) {
+		//int ix = (u + v*pL);
+		uint64_t cenL = _sparse_census(l, u, v);
+		uint64_t cenR = _sparse_census(r, u, v);
+		
+		censusL[(u + v*pL)] = cenL;
+		censusR[(u + v*pR)] = cenR;
+	}
+	}
+}
+
+namespace ftl {
+namespace cuda {
+	void sparse_census(const TextureObject<uchar4> &l, const TextureObject<uchar4> &r,
+			TextureObject<uint2> &cl, TextureObject<uint2> &cr) {
+		dim3 grid(1,1,1);
+    	dim3 threads(128, 1, 1);
+    	grid.x = cv::cuda::device::divUp(l.width(), 128);
+		grid.y = cv::cuda::device::divUp(l.height(), 11);
+		
+		
+		census_kernel<<<grid, threads>>>(
+			l.cudaTexture(), r.cudaTexture(),
+			l.width(), l.height(),
+			(uint64_t*)cl.devicePtr(), (uint64_t*)cr.devicePtr(),
+			cl.pitch()/sizeof(uint64_t),
+			cr.pitch()/sizeof(uint64_t));
+		cudaSafeCall( cudaGetLastError() );
+	}
+}
+}
+
+
--- a/cv-node/src/algorithms/tex_filter.cu
+++ b/cv-node/src/algorithms/tex_filter.cu
+#include <ftl/cuda_common.hpp>
+
+#define FILTER_WINDOW 11
+#define FILTER_WINDOW_R	5
+
+__global__ void texture_filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d,
+		ftl::cuda::TextureObject<float> f, int num_disp, int thresh) { // Thresh = -5000000
+
+	float disp = tex2D<float>(d,u,v);
+	int neigh_sq = 0;
+	int neigh_sum = 0;
+
+	for (STRIDE_Y(v,h)) {
+	for (STRIDE_X(u,w)) {
+		for (int m=-FILTER_WINDOW_R; m<=FILTER_WINDOW_R; m++) {
+			for (int n=-FILTER_WINDOW_R; n<=FILTER_WINDOW_R; n++) {
+				uchar4 neigh = tex2D<uchar4>(t, u+n, v+m);
+				neigh_sq += neigh*neigh;
+				neigh_sum += neigh;
+			}	
+		}
+	}
+	}
+
+	// Texture map filtering
+	int tm = (neigh_sq / (FILTER_WINDOW*FILTER_WINDOW)) -
+			((neigh_sum*neigh_sum) / (FILTER_WINDOW*FILTER_WINDOW));
+
+	if (tm < thesh) {
+		f(u,v) = disp;
+	} else {
+		f(u,v) = NAN;
+	}
+}
+
+namespace ftl {
+namespace cuda {
+	void texture_filter(const TextureObject<uchar4> &t, const TextureObject<float> &d,
+			TextureObject<float> &f, int num_disp, int thresh) {
+		dim3 grid(1,1,1);
+    	dim3 threads(128, 1, 1);
+    	grid.x = cv::cuda::device::divUp(disp.width(), 128);
+		grid.y = cv::cuda::device::divUp(disp.height(), 1);
+		texture_filter_kernel<<<grid, threads>>>
+			t.cudaTexture(),
+			d.cudaTexture(),
+			f,
+			num_disp,
+			thresh);
+		cudaSafeCall( cudaGetLastError() );
+	}
+}
+}
+