From 3e5100ba621dfb624f0c33a2a8bab31cb4f5dedc Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Thu, 28 Mar 2019 10:33:21 +0200
Subject: [PATCH] Add initial files for SGM rtcensus

---
 .../include/ftl/algorithms/rtcensus_sgm.hpp   |  53 +++++
 cv-node/src/algorithms/rtcensus_sgm.cpp       | 108 ++++++++++
 cv-node/src/algorithms/rtcensus_sgm.cu        | 214 ++++++++++++++++++++
 3 files changed, 375 insertions(+)
 create mode 100644 cv-node/include/ftl/algorithms/rtcensus_sgm.hpp
 create mode 100644 cv-node/src/algorithms/rtcensus_sgm.cpp
 create mode 100644 cv-node/src/algorithms/rtcensus_sgm.cu

diff --git a/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp b/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp
new file mode 100644
index 000000000..83344afff
--- /dev/null
+++ b/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp
@@ -0,0 +1,53 @@
+#ifndef _FTL_ALGORITHMS_RTCENSUSSGM_HPP_
+#define _FTL_ALGORITHMS_RTCENSUSSGM_HPP_
+
+#include <opencv2/core.hpp>
+#include <opencv2/opencv.hpp>
+#include <ftl/disparity.hpp>
+#include <nlohmann/json.hpp>
+
+#if defined HAVE_CUDA
+#include <opencv2/core/cuda.hpp>
+#endif
+
+namespace ftl {
+namespace algorithms {
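+/* Census-based stereo matching variant intended to add a modified
+ * semi-global matching (SGM) aggregation step. Requires CUDA; see
+ * rtcensus_sgm.cpp for the algorithm references.
+ */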
+class RTCensusSGM : public ftl::Disparity {
+	public:
+	RTCensusSGM(nlohmann::json &config);
+	
+	void setGamma(float gamma) { gamma_ = gamma; }
+	void setTau(float tau) { tau_ = tau; }
+	
+	void compute(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp);
+
+	static inline Disparity *create(nlohmann::json &config) { return new RTCensusSGM(config); }
+	
+	private:
+	float gamma_;
+	float tau_;
+	bool use_cuda_;
+	bool alternate_;
+	
+	#if defined HAVE_CUDA
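+	// Persistent GPU buffers, allocated on first use in computeCUDA()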
+	cv::cuda::GpuMat disp_;
+	cv::cuda::GpuMat filtered_;
+	cv::cuda::GpuMat left_;
+	cv::cuda::GpuMat left2_;
+	cv::cuda::GpuMat right_;
+	#endif
+	
+	#if defined HAVE_CUDA
+	void computeCUDA(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp);
+	#endif
+};
+}  // namespace algorithms
+}  // namespace ftl
+
+#endif // _FTL_ALGORITHMS_RTCENSUSSGM_HPP_
+
diff --git a/cv-node/src/algorithms/rtcensus_sgm.cpp b/cv-node/src/algorithms/rtcensus_sgm.cpp
new file mode 100644
index 000000000..d6b93a08b
--- /dev/null
+++ b/cv-node/src/algorithms/rtcensus_sgm.cpp
@@ -0,0 +1,108 @@
+/* Created by Nicolas Pope and Sebastian Hahta
+ *
+ * Implementation of algorithm presented in article(s):
+ *
+ * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable
+ *     for embedded real-time systems
+ * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based
+ *     Stereo Matching Algorithm on Embedded and Multi-Core Hardware
+ * [3] Humenberger, Engelke, Kubinger: A Census-Based Stereo Vision Algorithm Using Modified Semi-Global Matching
+ *     and Plane Fitting to Improve Matching Quality.
+ *
+ * Equation numbering uses [1] unless otherwise stated
+ */
+
+
+#include <ftl/algorithms/rtcensus_sgm.hpp>
+#include <vector>
+#include <tuple>
+#include <bitset>
+#include <cmath>
+#include <chrono>
+#include <glog/logging.h>
+
+using ftl::algorithms::RTCensusSGM;
+using std::vector;
+using cv::Mat;
+using cv::Point;
+using cv::Size;
+using std::tuple;
+using std::get;
+using std::make_tuple;
+using std::bitset;
+
+static ftl::Disparity::Register rtcensussgm("rtcensus_sgm", RTCensusSGM::create);
+
+RTCensusSGM::RTCensusSGM(nlohmann::json &config)
+	:	Disparity(config),
+		gamma_(0.0f),
+		tau_(0.0f),
+		use_cuda_(config.value("use_cuda", true)),
+		alternate_(false) {}
+
+/*
+ * Choose the implementation and perform disparity calculation.
+ */
+void RTCensusSGM::compute(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) {
+	#if defined HAVE_CUDA
+	if (use_cuda_) {
+		computeCUDA(l,r,disp);
+	} else {
+		//computeCPU(l,r,disp);
+		LOG(ERROR) << "RTCensus SGM requires CUDA";
+	}
+	#else // !HAVE_CUDA
+	//computeCPU(l,r,disp);
+	LOG(ERROR) << "RTCensus SGM requires CUDA";
+	#endif
+}
+
+#if defined HAVE_CUDA
+
+using namespace cv::cuda;
+using namespace cv;
+
+#include <vector_types.h>
+
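+// Kernel launch wrapper, defined in rtcensus_sgm.cu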
+namespace ftl { namespace gpu {
+void rtcensus_sgm_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream=0);
+}}
+
+void RTCensusSGM::computeCUDA(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) {
+	// Initialise GPU memory here because the image size is needed
+	if (disp_.empty()) disp_ = cuda::GpuMat(l.size(), CV_32FC1);
+	if (left_.empty()) left_ = cuda::GpuMat(l.size(), CV_8UC4);
+	if (left2_.empty()) left2_ = cuda::GpuMat(l.size(), CV_8UC4);
+	if (right_.empty()) right_ = cuda::GpuMat(l.size(), CV_8UC4);
+	
+	Mat lhsv, rhsv;
+	cv::cvtColor(l, lhsv, COLOR_BGR2HSV);
+	cv::cvtColor(r, rhsv, COLOR_BGR2HSV);
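+	// Pack 3-channel HSV into a 4-channel image (fourth channel zeroed),
+	// since CUDA texture objects do not support 3-channel formats.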
+	int from_to[] = {0,0,1,1,2,2,-1,3};
+	Mat hsval(lhsv.size(), CV_8UC4);
+	Mat hsvar(rhsv.size(), CV_8UC4);
+	mixChannels(&lhsv, 1, &hsval, 1, from_to, 4);
+	mixChannels(&rhsv, 1, &hsvar, 1, from_to, 4);
+	
+	// Send images to GPU
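+	// Double-buffer the left image, alternating between frames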
+	if (alternate_) left_.upload(hsval);
+	else left2_.upload(hsval);
+	right_.upload(hsvar);
+
+	auto start = std::chrono::high_resolution_clock::now();
+	ftl::gpu::rtcensus_sgm_call((alternate_)?left_:left2_, right_, disp_, max_disp_);
+	std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
+	LOG(INFO) << "CUDA rtcensus_sgm in " << elapsed.count() << "s";
+	
+	alternate_ = !alternate_;
+	
+	// Read disparity from GPU
+	disp_.download(disp);
+}
+
+#endif
+
diff --git a/cv-node/src/algorithms/rtcensus_sgm.cu b/cv-node/src/algorithms/rtcensus_sgm.cu
new file mode 100644
index 000000000..94b36ed1c
--- /dev/null
+++ b/cv-node/src/algorithms/rtcensus_sgm.cu
@@ -0,0 +1,214 @@
+/*
+ * Author: Nicolas Pope and Sebastian Hahta (2019)
+ * Implementation of algorithm presented in article(s):
+ *
+ * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable
+ *     for embedded real-time systems
+ * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based
+ *     Stereo Matching Algorithm on Embedded and Multi-Core Hardware
+ * [3] Humenberger, Engelke, Kubinger: A Census-Based Stereo Vision Algorithm Using Modified Semi-Global Matching
+ *     and Plane Fitting to Improve Matching Quality.
+ *
+ * Equation numbering uses [1] unless otherwise stated
+ *
+ */
+ 
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_algorithms.hpp>
+
+using namespace cv::cuda;
+using namespace cv;
+
+
+#define BLOCK_W 60
+#define RADIUS 7
+#define RADIUS2 2
+#define ROWSperTHREAD 1
+
+namespace ftl {
+namespace gpu {
+
+// --- SUPPORT -----------------------------------------------------------------
+
+
+/*
+ * Parabolic interpolation between matched disparities either side.
+ * Results in subpixel disparity. (20).
+ */
+__device__ static float fit_parabola(size_t pi, uint16_t p, uint16_t pl, uint16_t pr) {
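+	// Vertex of the parabola through (pi-1,pl), (pi,p), (pi+1,pr):
+	// d = pi + (pr - pl) / (2*(2*p - pl - pr))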
+	float a = pr - pl;
+	float b = 2 * (2 * p - pl - pr);
+	return static_cast<float>(pi) + (a / b);
+}
+
+// --- KERNELS -----------------------------------------------------------------
+
+/* Convert vector uint2 (32bit x2) into a single uint64_t */
+__forceinline__ __device__ static uint64_t uint2asull (uint2 a) {
+	uint64_t res;
+	asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y));
+	return res;
+}
+
+/*
+ * Generate left and right disparity images from census data. (18)(19)(25)
+ */
+__global__ static void disp_kernel(float *disp_l, float *disp_r,
+		int pitchL, int pitchR,
+		size_t width, size_t height,
+		cudaTextureObject_t censusL, cudaTextureObject_t censusR,
+		size_t ds) {	
+	//extern __shared__ uint64_t cache[];
+
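+	// Confidence margin for the uniqueness filter below; currently
+	// hard-coded rather than taken from the gamma_ setting.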
+	const int gamma = 35;
+	
+	int u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2;
+	int v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2;
+	int v_end = v_start + ROWSperTHREAD;
+	int maxdisp = ds;
+	
+	// Local cache
+	uint64_t l_cache_l1[5][5];
+	uint64_t l_cache_l2[5][5];
+	
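+	// Clamp the row range and the disparity search to the image bounds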
+	if (v_end >= height) v_end = height;
+	if (u+maxdisp >= width) maxdisp = width-u;
+	
+	for (int v=v_start; v<v_end; v++) {
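+		// Cache the 5x5 census neighbourhoods of the current pixel from both images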
+		for (int m=-2; m<=2; m++) {
+			for (int n=-2; n<=2; n++) {
+				l_cache_l2[m+2][n+2] = uint2asull(tex2D<uint2>(censusL,u+n,v+m));
+				l_cache_l1[m+2][n+2] = uint2asull(tex2D<uint2>(censusR,u+n,v+m));
+			}
+		}
+		
+		uint16_t last_ham1 = 65535;
+		uint16_t last_ham2 = 65535;
+		uint16_t min_disp1 = 65535;
+		uint16_t min_disp2 = 65535;
+		uint16_t min_disp1b = 65535;
+		uint16_t min_disp2b = 65535;
+		uint16_t min_before1 = 0;
+		uint16_t min_before2 = 0;
+		uint16_t min_after1 = 0;
+		uint16_t min_after2 = 0;
+		int dix1 = 0;
+		int dix2 = 0;
+		
+		// (19)
+		for (int d=0; d<maxdisp; d++) {
+			uint16_t hamming1 = 0;
+			uint16_t hamming2 = 0;
+			
+			//if (u+2+ds >= width) break;
+		
+			for (int m=-2; m<=2; m++) {
+				const auto v_ = (v + m);
+				for (int n=-2; n<=2; n++) {
+					const auto u_ = u + n;
+					
+					// (18)
+					auto l1 = l_cache_l1[m+2][n+2];
+					auto l2 = l_cache_l2[m+2][n+2];
+					auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_));
+					auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_));
+					hamming1 += __popcll(r1^l1);
+					hamming2 += __popcll(r2^l2);
+				}
+			}
+			
+			// Find the two minimum costs, keeping the costs either side
+			// of the minimum for the parabola fit below
+			if (hamming1 < min_disp1) {
+				min_before1 = last_ham1;
+				min_disp1b = min_disp1;  // Previous best becomes second best
+				min_disp1 = hamming1;
+				dix1 = d;
+			} else if (hamming1 < min_disp1b) {
+				min_disp1b = hamming1;
+			}
+			if (dix1 == d-1) min_after1 = hamming1;  // Cost immediately after the minimum
+			last_ham1 = hamming1;
+			
+			if (hamming2 < min_disp2) {
+				min_before2 = last_ham2;
+				min_disp2b = min_disp2;
+				min_disp2 = hamming2;
+				dix2 = d;
+			} else if (hamming2 < min_disp2b) {
+				min_disp2b = hamming2;
+			}
+			if (dix2 == d-1) min_after2 = hamming2;
+			last_ham2 = hamming2;
+		
+		}
+		
+		// TODO Allow for discontinuities with threshold
+		// Subpixel disparity (20); fall back to the integer disparity at the
+		// search boundaries where a neighbouring cost is unavailable
+		float d1 = (dix1 == 0 || dix1 == maxdisp-1) ? (float)dix1 : fit_parabola(dix1, min_disp1, min_before1, min_after1);
+		float d2 = (dix2 == 0 || dix2 == maxdisp-1) ? (float)dix2 : fit_parabola(dix2, min_disp2, min_before2, min_after2);
+	
+		// Confidence filter based on (25)
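+		// Keep a disparity only if the best cost beats the second best by
+		// at least gamma; otherwise mark the pixel as invalid (NAN).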
+		disp_l[v*pitchL+u] = ((min_disp2b - min_disp2) >= gamma) ? d2 : NAN;
+		disp_r[v*pitchR+u] = ((min_disp1b - min_disp1) >= gamma) ? d1 : NAN;
+	}
+}
+
+void rtcensus_sgm_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) {
+	// Make all the required texture objects
+	// TODO Could reduce re-allocations by caching these
+	ftl::cuda::TextureObject<uchar4> texLeft(l);
+	ftl::cuda::TextureObject<uchar4> texRight(r);
+	ftl::cuda::TextureObject<uint2> censusTexLeft(l.cols, l.rows);
+	ftl::cuda::TextureObject<uint2> censusTexRight(r.cols, r.rows);
+	ftl::cuda::TextureObject<float> dispTexLeft(l.cols, l.rows);
+	ftl::cuda::TextureObject<float> dispTexRight(r.cols, r.rows);
+	ftl::cuda::TextureObject<float> dispTex(r.cols, r.rows);
+	ftl::cuda::TextureObject<float> output(disp);
+	
+	// Calculate the census for left and right (14)(15)(16)
+	ftl::cuda::sparse_census(texLeft, texRight, censusTexLeft, censusTexRight);
+
+	dim3 grid(1,1,1);
+	dim3 threads(BLOCK_W, 1, 1);
+	grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W);
+	grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD);
+	
+	// Calculate L and R disparities (18)(19)(20)(21)(22)(25)
+	disp_kernel<<<grid, threads>>>(
+		dispTexLeft.devicePtr(), dispTexRight.devicePtr(),
+		dispTexLeft.pitch()/sizeof(float), dispTexRight.pitch()/sizeof(float),
+		l.cols, l.rows,
+		censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(),
+		num_disp);
+	cudaSafeCall( cudaGetLastError() );
+	
+	// Check consistency between L and R disparities. (23)(24)
+	consistency(dispTexLeft, dispTexRight, dispTex);
+
+	// TM in (7) of paper [3]. Eq (26) in [1] is wrong.
+	texture_filter(texLeft, dispTex, output, num_disp, 10.0);
+
+	cudaSafeCall( cudaDeviceSynchronize() );
+	
+	texLeft.free();
+	texRight.free();
+	censusTexLeft.free();
+	censusTexRight.free();
+	dispTexLeft.free();
+	dispTexRight.free();
+	dispTex.free();
+	output.free();
+}
+
+}  // namespace gpu
+}  // namespace ftl
-- 
GitLab