From 9ba418e6519842e178333b848cf98a7f26f0a0b0 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Tue, 3 Nov 2020 19:56:16 +0200
Subject: [PATCH] MLS multi with feature

---
 .../cpp/include/ftl/cuda_matrix_util.hpp      |   4 +-
 components/operators/CMakeLists.txt           |   1 +
 .../operators/cuda/mls/multi_intensity.hpp    |  76 ++++++
 .../fusion/smoothing/mls_multi_weighted.cu    | 256 ++++++++++++++++++
 components/renderers/cpp/src/colouriser.cpp   |   2 +-
 5 files changed, 336 insertions(+), 3 deletions(-)
 create mode 100644 components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp
 create mode 100644 components/operators/src/fusion/smoothing/mls_multi_weighted.cu

diff --git a/components/common/cpp/include/ftl/cuda_matrix_util.hpp b/components/common/cpp/include/ftl/cuda_matrix_util.hpp
index 4dd1db7fa..cbb82052d 100644
--- a/components/common/cpp/include/ftl/cuda_matrix_util.hpp
+++ b/components/common/cpp/include/ftl/cuda_matrix_util.hpp
@@ -750,7 +750,7 @@ public:
 	}
 
 	//! returns the 3x3 part of the matrix
-	inline __device__ __host__ float3x3 getFloat3x3() {
+	inline __device__ __host__ float3x3 getFloat3x3() const {
 		float3x3 ret;
 		ret.m11 = m11;	ret.m12 = m12;	ret.m13 = m13;
 		ret.m21 = m21;	ret.m22 = m22;	ret.m23 = m23;
@@ -1086,7 +1086,7 @@ public:
 
 
 	//! returns the 3x3 part of the matrix
-	inline __device__ __host__ float3x3 getFloat3x3() {
+	inline __device__ __host__ float3x3 getFloat3x3() const {
 		float3x3 ret;
 		ret.m11 = m11;	ret.m12 = m12;	ret.m13 = m13;
 		ret.m21 = m21;	ret.m22 = m22;	ret.m23 = m23;
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index 3ffb6a0c8..e1b756996 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -26,6 +26,7 @@ set(OPERSRC
 	src/fusion/correspondence_depth.cu
 	src/fusion/correspondence_util.cu
 	src/fusion/mls_aggr.cu
+	src/fusion/smoothing/mls_multi_weighted.cu
 	src/misc/clipping.cpp
 	src/disparity/depth.cpp
 	src/analysis/tracking/detectandtrack.cpp
diff --git a/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp
new file mode 100644
index 000000000..884fd0da4
--- /dev/null
+++ b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp
@@ -0,0 +1,76 @@
+#ifndef _FTL_CUDA_MLS_MULTIINTENSITY_HPP_
+#define _FTL_CUDA_MLS_MULTIINTENSITY_HPP_
+
+#include <ftl/cuda_common.hpp>
+#include <ftl/rgbd/camera.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+
+namespace ftl {
+namespace cuda {
+
+// TODO: 4th channel of normals could be used as curvature estimate? Then can
+// adaptively adjust the smoothing amount by the degree of curvature. This
+// curvature value must be low for noise data, is only high if a consistent
+// high curvature is detected over a radius. Other option is to also or instead
+// use the curvature value as a feature, to only smooth against points with
+// similar curvature?
+
+/**
+ * For a particular viewpoint, use a set of other viewpoints to estimate a
+ * smooth surface for the focus viewpoint. This version of MLS uses absolute
+ * difference of some 8-bit value as a weight, this can be raw intensity or
+ * a local contrast measure. This `prime`, `gather`, `adjust` cycle should
+ * be iterated 2-3 times, but perhaps better to do one iteration of each view
+ * first before repeating a view.
+ * 
+ * Note: Have one instance per frameset because the internal buffers are
+ * allocated based upon frame image size.
+ */
+class MLSMultiIntensity
+{
+public:
+	MLSMultiIntensity(int radius);
+	~MLSMultiIntensity();
+
+	void prime(
+		const cv::cuda::GpuMat &depth_prime,
+		const cv::cuda::GpuMat &intensity_prime,
+		const ftl::rgbd::Camera &cam_prime,
+		const float4x4 &pose_prime,
+		cudaStream_t stream
+	);
+
+	void gatherPrime(float smoothing, cudaStream_t stream);
+
+	void gather(
+		const cv::cuda::GpuMat &depth_src,
+		const cv::cuda::GpuMat &normals_src,
+		const cv::cuda::GpuMat &intensity_src,
+		const ftl::rgbd::Camera &cam_src,
+		const float4x4 &pose_src,
+		float smoothing,
+		cudaStream_t stream
+	);
+
+	void adjust(
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &normals_out,
+		cudaStream_t stream
+	);
+
+private:
+	cv::cuda::GpuMat depth_prime_;
+	cv::cuda::GpuMat intensity_prime_;
+	ftl::rgbd::Camera cam_prime_;
+	float4x4 pose_prime_;
+	int radius_;
+
+	cv::cuda::GpuMat normal_accum_;
+	cv::cuda::GpuMat centroid_accum_;
+	cv::cuda::GpuMat weight_accum_;
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/components/operators/src/fusion/smoothing/mls_multi_weighted.cu b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu
new file mode 100644
index 000000000..a5e5ded31
--- /dev/null
+++ b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu
@@ -0,0 +1,256 @@
+#include <ftl/operators/cuda/mls/multi_intensity.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#include <ftl/cuda/weighting.hpp>
+
+using ftl::cuda::MLSMultiIntensity;
+using cv::cuda::GpuMat;
+
+// ==== Multi image MLS ========================================================
+
+__device__ inline float featureWeight(int f1, int f2) {
+	const float w = (1.0f-(float(abs(f1 - f2)) / 255.0f));
+	return w*w;
+}
+
+/*
+ * Gather points for Moving Least Squares, from each source image
+ */
+ template <int SEARCH_RADIUS, typename T>
+ __global__ void mls_gather_intensity_kernel(
+	const half4* __restrict__ normals_in,
+	half4* __restrict__ normals_out,
+	const float* __restrict__ depth_origin,
+	const float* __restrict__ depth_in,
+	const T* __restrict__ feature_origin,
+	const T* __restrict__ feature_in,
+	float4* __restrict__ centroid_out,
+	float* __restrict__ contrib_out,
+	float smoothing,
+	float4x4 o_2_in,
+	float4x4 in_2_o,
+	float3x3 in_2_o33,
+	ftl::rgbd::Camera camera_origin,
+	ftl::rgbd::Camera camera_in,
+	int npitch_out,
+	int cpitch_out,
+	int wpitch_out,
+	int dpitch_o,
+	int dpitch_i,
+	int npitch_in,
+	int fpitch_o,
+	int fpitch_i
+) {        
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < 0 || y < 0 || x >= camera_origin.width || y >= camera_origin.height) return;
+
+	float3 nX = make_float3(normals_out[y*npitch_out+x]);
+	float3 aX = make_float3(centroid_out[y*cpitch_out+x]);
+    float contrib = contrib_out[y*wpitch_out+x];
+
+	float d0 = depth_origin[x+y*dpitch_o];
+	if (d0 <= camera_origin.minDepth || d0 >= camera_origin.maxDepth) return;
+
+	const int feature1 = feature_origin[x+y*fpitch_o];
+
+	float3 X = camera_origin.screenToCam((int)(x),(int)(y),d0);
+
+	int2 s = camera_in.camToScreen<int2>(o_2_in * X);
+
+    // Neighbourhood
+    for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
+    for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
+		const float d = (s.x+u >= 0 && s.x+u < camera_in.width && s.y+v >= 0 && s.y+v < camera_in.height) ? depth_in[s.x+u+(s.y+v)*dpitch_i] : 0.0f;
+		if (d <= camera_in.minDepth || d >= camera_in.maxDepth) continue;
+
+		// Point and normal of neighbour
+		const float3 Xi = in_2_o * camera_in.screenToCam(s.x+u, s.y+v, d);
+		const float3 Ni = make_float3(normals_in[s.x+u+(s.y+v)*npitch_in]);
+
+		const int feature2 = feature_in[s.x+y+(s.y+v)*fpitch_i];
+
+		// Gauss approx weighting function using point distance
+		const float w = (length(Ni) > 0.0f)
+			? ftl::cuda::spatialWeighting(X,Xi,smoothing) * featureWeight(feature1, feature2)
+			: 0.0f;
+
+		aX += Xi*w;
+		nX += (in_2_o33 * Ni)*w;
+		contrib += w;
+    }
+	}
+
+	normals_out[y*npitch_out+x] = make_half4(nX, 0.0f);
+	centroid_out[y*cpitch_out+x] = make_float4(aX, 0.0f);
+	contrib_out[y*wpitch_out+x] = contrib;
+}
+
+/**
+ * Convert accumulated values into estimate of depth and normals at pixel.
+ */
+ __global__ void mls_reduce_kernel_2(
+	const float4* __restrict__ centroid,
+	const half4* __restrict__ normals,
+	const float* __restrict__ contrib_out,
+	half4* __restrict__ normals_out,
+	float* __restrict__ depth,
+	ftl::rgbd::Camera camera,
+	int npitch_in,
+	int cpitch_in,
+	int wpitch,
+	int npitch,
+	int dpitch
+) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
+		float3 nX = make_float3(normals[y*npitch_in+x]);
+		float3 aX = make_float3(centroid[y*cpitch_in+x]);
+		float contrib = contrib_out[y*wpitch+x];
+
+		//depth[x+y*dpitch] = X.z;
+		normals_out[x+y*npitch] = make_half4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		float d0 = depth[x+y*dpitch];
+		//depth[x+y*dpitch] = 0.0f;
+		if (d0 <= camera.minDepth || d0 >= camera.maxDepth || contrib == 0.0f) return;
+		float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+		
+		nX /= contrib;  // Weighted average normal
+		aX /= contrib;  // Weighted average point (centroid)
+
+		// Signed-Distance Field function
+		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
+
+		// Calculate new point using SDF function to adjust depth (and position)
+		X = X - nX * fX;
+
+		depth[x+y*dpitch] = X.z;
+		normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f);
+	}
+}
+
+
+MLSMultiIntensity::MLSMultiIntensity(int radius)
+	: radius_(radius)
+{
+
+}
+
+MLSMultiIntensity::~MLSMultiIntensity()
+{
+
+}
+
+void MLSMultiIntensity::prime(
+	const GpuMat &depth_prime,
+	const GpuMat &intensity_prime,
+	const ftl::rgbd::Camera &cam_prime,
+	const float4x4 &pose_prime,
+	cudaStream_t stream)
+{
+	depth_prime_ = depth_prime;
+	intensity_prime_ = intensity_prime;
+	cam_prime_ = cam_prime;
+	pose_prime_ = pose_prime;
+
+	centroid_accum_.create(depth_prime.size(), CV_32FC4);
+	normal_accum_.create(depth_prime.size(), CV_16FC4);
+	weight_accum_.create(depth_prime.size(), CV_32F);
+
+	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+
+	// Reset buffers
+	centroid_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
+	weight_accum_.setTo(cv::Scalar(0), cvstream);
+	normal_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
+}
+
+void MLSMultiIntensity::gatherPrime(float smoothing, cudaStream_t stream)
+{
+	// Can use a simpler kernel without pose transformations
+}
+
+void MLSMultiIntensity::gather(
+	const GpuMat &depth_src,
+	const GpuMat &normals_src,
+	const GpuMat &intensity_src,
+	const ftl::rgbd::Camera &cam_src,
+	const float4x4 &pose_src,
+	float smoothing,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	float4x4 inv_pose_src = pose_src;
+	inv_pose_src.invert();
+	float4x4 o_2_in = inv_pose_src * pose_prime_;
+	float4x4 inv_pose_prime = pose_prime_;
+	inv_pose_prime.invert();
+	float4x4 in_2_o = inv_pose_prime * pose_src;
+	float3x3 in_2_o33 = inv_pose_prime.getFloat3x3() * pose_src.getFloat3x3();
+
+	mls_gather_intensity_kernel<3><<<gridSize, blockSize, 0, stream>>>(
+		normals_src.ptr<half4>(),
+		normal_accum_.ptr<half4>(),
+		depth_prime_.ptr<float>(),
+		depth_src.ptr<float>(),
+		intensity_prime_.ptr<uchar>(),
+		intensity_src.ptr<uchar>(),
+		centroid_accum_.ptr<float4>(),
+		weight_accum_.ptr<float>(),
+		smoothing,
+		o_2_in,
+		in_2_o,
+		in_2_o33,
+		cam_prime_,
+		cam_src,
+		normal_accum_.step1()/4,
+		centroid_accum_.step1()/4,
+		weight_accum_.step1(),
+		depth_prime_.step1(),
+		depth_src.step1(),
+		normals_src.step1()/4,
+		intensity_prime_.step1(),
+		intensity_src.step1()
+	);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+void MLSMultiIntensity::adjust(
+	GpuMat &depth_out,
+	GpuMat &normals_out,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	normals_out.create(depth_prime_.size(), CV_16FC4);
+	depth_out.create(depth_prime_.size(), CV_32F);
+
+	// FIXME: Depth prime assumed to be same as depth out
+
+	mls_reduce_kernel_2<<<gridSize, blockSize, 0, stream>>>(
+		centroid_accum_.ptr<float4>(),
+		normal_accum_.ptr<half4>(),
+		weight_accum_.ptr<float>(),
+		normals_out.ptr<half4>(),
+		depth_prime_.ptr<float>(),
+		cam_prime_,
+		normal_accum_.step1()/4,
+		centroid_accum_.step1()/4,
+		weight_accum_.step1(),
+		normals_out.step1()/4,
+		depth_prime_.step1()
+	);
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/renderers/cpp/src/colouriser.cpp b/components/renderers/cpp/src/colouriser.cpp
index a8399d016..2412aeb86 100644
--- a/components/renderers/cpp/src/colouriser.cpp
+++ b/components/renderers/cpp/src/colouriser.cpp
@@ -143,7 +143,7 @@ TextureObject<uchar4> &Colouriser::_processNormals(ftl::rgbd::Frame &f, Channel
 	light_diffuse.w = value("alpha", 0.5f)*255.0f;
 	light_ambient.w = light_diffuse.w;
 
-	auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", 1.0f));
+	auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", -1.0f));
 
 	ftl::cuda::normal_visualise(f.createTexture<half4>(c), buf,
 			light_pos,
-- 
GitLab