Merge branch 'feature/237/optmls2' into 'master'

Resolves #237 optimising MLS smoothing Closes #237 See merge request nicolas.pope/ftl!167

Merge branch 'feature/237/optmls2' into 'master'
3ec52ef6 · Nicolas Pope · 63ef42cf · dd857ff3 · 3ec52ef6 · 3ec52ef6
Commit 3ec52ef6 authored 5 years ago by Nicolas Pope
--- a/applications/reconstruct/src/main.cpp
+++ b/applications/reconstruct/src/main.cpp
@@ -319,7 +319,7 @@ static void run(ftl::Configurable *root) {
 	pipeline1->append<ftl::operators::CrossSupport>("cross");
 	pipeline1->append<ftl::operators::DiscontinuityMask>("discontinuity");
 	pipeline1->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
-	pipeline1->append<ftl::operators::ColourMLS>("mls");  // Perform MLS (using smoothing channel)
+	pipeline1->append<ftl::operators::AggreMLS>("mls");  // Perform MLS (using smoothing channel)
 	pipeline1->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false);
 	// Alignment


--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -183,7 +183,7 @@ TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) {
 	height_ = d.rows;
 	needsfree_ = false;
 	cvType_ = ftl::traits::OpenCVType<T>::value;
-	//needsdestroy_ = true;
+	needsdestroy_ = true;
 }

 #endif  // __CUDACC__
@@ -218,7 +218,7 @@ TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
 	height_ = d.rows;
 	needsfree_ = false;
 	cvType_ = ftl::traits::OpenCVType<T>::value;
-	//needsdestroy_ = true;
+	needsdestroy_ = true;
 }

 /**
@@ -252,7 +252,7 @@ TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
 	height_ = height;
 	needsfree_ = false;
 	cvType_ = ftl::traits::OpenCVType<T>::value;
-	//needsdestroy_ = true;
+	needsdestroy_ = true;
 }

 template <typename T>
@@ -285,7 +285,7 @@ TextureObject<T>::TextureObject(size_t width, size_t height) {
 	needsfree_ = true;
 	pitch2_ = pitch_ / sizeof(T);
 	cvType_ = ftl::traits::OpenCVType<T>::value;
-	//needsdestroy_ = true;
+	needsdestroy_ = true;
 }

 #ifndef __CUDACC__
@@ -312,6 +312,7 @@ TextureObject<T>::TextureObject(const TextureObject<T> &p) {
 	pitch2_ = pitch_ / sizeof(T);
 	cvType_ = ftl::traits::OpenCVType<T>::value;
 	needsfree_ = false;
+	needsdestroy_ = false;
 }

 template <typename T>
@@ -323,14 +324,17 @@ TextureObject<T>::TextureObject(TextureObject<T> &&p) {
 	pitch_ = p.pitch_;
 	pitch2_ = pitch_ / sizeof(T);
 	needsfree_ = p.needsfree_;
+	needsdestroy_ = p.needsdestroy_;
 	p.texobj_ = 0;
 	p.needsfree_ = false;
+	p.needsdestroy_ = false;
 	p.ptr_ = nullptr;
 	cvType_ = ftl::traits::OpenCVType<T>::value;
 }

 template <typename T>
 TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) {
+	free();
 	texobj_ = p.texobj_;
 	ptr_ = p.ptr_;
 	width_ = p.width_;
@@ -339,11 +343,13 @@ TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) {
 	pitch2_ = pitch_ / sizeof(T);
 	cvType_ = ftl::traits::OpenCVType<T>::value;
 	needsfree_ = false;
+	needsdestroy_ = false;
 	return *this;
 }

 template <typename T>
 TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) {
+	free();
 	texobj_ = p.texobj_;
 	ptr_ = p.ptr_;
 	width_ = p.width_;
@@ -351,8 +357,10 @@ TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) {
 	pitch_ = p.pitch_;
 	pitch2_ = pitch_ / sizeof(T);
 	needsfree_ = p.needsfree_;
+	needsdestroy_ = p.needsdestroy_;
 	p.texobj_ = 0;
 	p.needsfree_ = false;
+	p.needsdestroy_ = false;
 	p.ptr_ = nullptr;
 	cvType_ = ftl::traits::OpenCVType<T>::value;
 	return *this;

--- a/components/operators/include/ftl/operators/smoothing.hpp
+++ b/components/operators/include/ftl/operators/smoothing.hpp
@@ -79,6 +79,51 @@ class ColourMLS : public ftl::operators::Operator {
 	private:
 };

+/**
+ * Use cross aggregation of neighbors for faster MLS smoothing. Utilises the
+ * cross support regions and colour weighting. This algorithm is based upon the
+ * cross aggregation method in the papers below. MLS was adapted into an
+ * approximate form which is not identical to real MLS as the weights are not
+ * perfectly correct for each point. The errors are visually minimal and it
+ * has linear performance wrt radius, rather than quadratic.
+ * 
+ * Zhang K, Lu J, Lafruit G. Cross-based local stereo matching using orthogonal
+ * integral images. (2009).
+ * 
+ * Better explained in:
+ * X. Mei, X. Sun, M. Zhou et al. On building an accurate stereo matching system
+ * on graphics hardware. (2011).
+ * 
+ * The use of colour weighting is done as in:
+ * C. Kuster et al. Spatio-temporal geometry fusion for multiple hybrid cameras
+ * using moving least squares surfaces. (2014)
+ * 
+ * The above paper also indicates the importance of stopping MLS at depth
+ * boundaries. They use k-means clustering (K=2) of depths, but we are using
+ * an approach (discontinuity mask) indicated in the following paper combined
+ * with the cross support regions to define the extent of the MLS neighbourhood.
+ * 
+ * S. Orts-Escolano et al. Holoportation: Virtual 3D teleportation in real-time.
+ * (2016).
+ * 
+ * The use of all these approaches in this combination is novel I believe.
+ * 
+ */
+class AggreMLS : public ftl::operators::Operator {
+	public:
+	explicit AggreMLS(ftl::Configurable*);
+	~AggreMLS();
+
+	inline Operator::Type type() const override { return Operator::Type::OneToOne; }
+
+	bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgbd::Source *src, cudaStream_t stream) override;
+
+	private:
+	ftl::cuda::TextureObject<float4> centroid_horiz_;
+	ftl::cuda::TextureObject<float4> centroid_vert_;
+	ftl::cuda::TextureObject<float4> normals_horiz_;
+};
+
 /**
 * A version of Moving Least Squares where both the smoothing factor and
 * neighbourhood size are adapted over an extra large range using the colour

--- a/components/operators/src/mls.cu
+++ b/components/operators/src/mls.cu
@@ -5,6 +5,7 @@
 using ftl::cuda::TextureObject;

 #define T_PER_BLOCK 8
+#define WARP_SIZE 32

 // ===== MLS Smooth ============================================================

@@ -229,7 +230,7 @@ __device__ inline int segmentID(int u, int v) {
 * level of single pixels can be subject to noise. Colour noise should first
 * be removed from the image.
 */
- template <bool FILLING>
+ template <bool FILLING, int RADIUS>
 __global__ void colour_mls_smooth_csr_kernel(
 	 	TextureObject<uchar4> region,
 		TextureObject<float4> normals_in,
@@ -268,34 +269,38 @@ __device__ inline int segmentID(int u, int v) {
 	// TODO: Does using a fixed loop size with range test work better?
 	// Or with warp per pixel version, this would be less of a problem...
 	// TODO: Do a separate vote fill step?
-	for (int v=-base.z; v<=base.w; ++v) {
+	for (int v=-RADIUS; v<=RADIUS; ++v) {
 		uchar4 baseY = region.tex2D(x,y+v);

-		for (int u=-baseY.x; u<=baseY.y; ++u) {
+		#pragma unroll
+		for (int u=-RADIUS; u<=RADIUS; ++u) {
 			const float d = depth_in.tex2D(x+u, y+v);
-			if (d > camera.minDepth && d < camera.maxDepth) {
+			//if (d > camera.minDepth && d < camera.maxDepth) {
+
+				float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -baseY.x || u > baseY.y || v < -base.z || v > base.z) ? 0.0f : 1.0f;

 				// Point and normal of neighbour
 				const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
 				const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));

-				if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
+				// FIXME: Ensure bad normals are removed by setting depth invalid
+				//if (Ni.x+Ni.y+Ni.z == 0.0f) continue;

 				const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
-				const float cw = ftl::cuda::colourWeighting(c0,c,colour_smoothing);
+				w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);

 				// Allow missing point to borrow z value
 				// TODO: This is a bad choice of Z! Perhaps try histogram vote approach
-				if (FILLING && d0 == 0.0f) X = camera.screenToCam((int)(x),(int)(y),Xi.z);
+				//if (FILLING && d0 == 0.0f) X = camera.screenToCam((int)(x),(int)(y),Xi.z);

 				// Gauss approx weighting function using point distance
-				const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing*cw);
+				w = ftl::cuda::spatialWeighting(X,Xi,smoothing*w);

 				aX += Xi*w;
 				nX += Ni*w;
 				contrib += w;
-				if (FILLING && w > 0.0f && v > -base.z+1 && v < base.w-1 && u > -baseY.x+1 && u < baseY.y-1) segment_check |= segmentID(u,v);
-			}
+				//if (FILLING && w > 0.0f && v > -base.z+1 && v < base.w-1 && u > -baseY.x+1 && u < baseY.y-1) segment_check |= segmentID(u,v);
+			//}
 		}
 	}

@@ -341,11 +346,294 @@ void ftl::cuda::colour_mls_smooth_csr(
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

 	if (filling) {
-		colour_mls_smooth_csr_kernel<true><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
+		colour_mls_smooth_csr_kernel<true,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
 	} else {
-		colour_mls_smooth_csr_kernel<false><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
+		colour_mls_smooth_csr_kernel<false,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
+	}
+		
+	cudaSafeCall( cudaGetLastError() );
+
+
+	#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	#endif
+}
+
+
+// ===== Cross Aggregate MLS ===================================================
+
+/*
+ * Smooth depth map using Moving Least Squares. This version uses colour
+ * similarity weights to adjust the spatial smoothing factor. It also uses
+ * cross support windows to prevent unwanted edge crossing. This function does
+ * the weighted aggregation of centroids and normals in the horizontal
+ * direction.
+ */
+ template <int RADIUS>
+ __global__ void mls_aggr_horiz_kernel(
+	 	TextureObject<uchar4> region,
+		TextureObject<float4> normals_in,
+		TextureObject<float4> normals_out,
+        TextureObject<float> depth_in,        // Virtual depth map
+		TextureObject<float4> centroid_out,   // Accumulated output
+		TextureObject<uchar4> colour_in,
+		float smoothing,
+		float colour_smoothing,
+        ftl::rgbd::Camera camera) {
+        
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
+
+	float3 aX = make_float3(0.0f,0.0f,0.0f);
+	float3 nX = make_float3(0.0f,0.0f,0.0f);
+    float contrib = 0.0f;
+
+	float d0 = depth_in.tex2D(x, y);
+
+	// Note: x and y flipped as output is rotated.
+	centroid_out(y,x) = make_float4(0.0f);
+	normals_out(y,x) = normals_in(x,y);
+
+	if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return;
+	
+	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
+
+    // Cross-Support Neighbourhood
+	uchar4 base = region.tex2D(x,y);
+
+	#pragma unroll
+	for (int u=-RADIUS; u<=RADIUS; ++u) {
+		const float d = depth_in.tex2D(x+u, y);
+
+		// If outside of cross support range, set weight to 0 to ignore
+		float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f;
+
+		// Point and normal of neighbour
+		const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y),d);
+		const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)));
+
+		// Bad or missing normals should be ignored
+		if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
+
+		// Gauss approx colour weighting.
+		const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y) + 0.5f);
+		w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
+
+		// Gauss approx weighting function using point distance
+		w = ftl::cuda::spatialWeighting(X,Xi,smoothing*w);
+
+		aX += Xi*w;
+		nX += Ni*w;
+		contrib += w;
+	}
+
+	if (contrib > 0.0f) {
+		nX /= contrib;  // Weighted average normal
+		aX /= contrib;  // Weighted average point (centroid)
+
+		// Note: x and y flipped since output is rotated 90 degrees.
+		centroid_out(y,x) = make_float4(aX, 0.0f);
+		normals_out(y,x) = make_float4(nX / length(nX), 0.0f);
+	}
+}
+
+/**
+ * This function does a vertical weighted aggregation of the centroids and
+ * normals generated by the horizontal aggregation.
+ */
+template <int RADIUS>
+ __global__ void mls_aggr_vert_kernel(
+	 	TextureObject<uchar4> region,
+		TextureObject<float4> normals_in,
+		TextureObject<float4> normals_out,
+        TextureObject<float4> centroid_in,        // Virtual depth map
+		TextureObject<float4> centroid_out,   // Accumulated output
+		TextureObject<uchar4> colour_in,
+		TextureObject<float> depth_in,
+		float smoothing,
+		float colour_smoothing,
+        ftl::rgbd::Camera camera) {
+        
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
+
+	float3 aX = make_float3(0.0f,0.0f,0.0f);
+	float3 nX = make_float3(0.0f,0.0f,0.0f);
+	float contrib = 0.0f;
+	
+	float d0 = depth_in.tex2D(x, y);
+	if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return;
+	
+	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+
+	centroid_out(x,y) = make_float4(0.0f);
+	normals_out(x,y) = make_float4(0.0f);
+	
+	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
+
+    // Cross-Support Neighbourhood
+	uchar4 base = region.tex2D(x,y);
+
+	#pragma unroll
+	for (int v=-RADIUS; v<=RADIUS; ++v) {
+		const float d = depth_in.tex2D(x, y+v);
+		const float3 Xi = camera.screenToCam(x,y+v,d);
+
+		// Note: x and y flipped, input image is rotated.
+		float3 Ai = make_float3(centroid_in.tex2D(y+v, x));
+
+		// If outside the cross support range, set weight to 0 to ignore
+		float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f;
+
+		// Note: x and y flipped, input image is rotated.
+		const float3 Ni = make_float3(normals_in.tex2D(y+v, x));
+
+		// Bad normals should be ignored.
+		if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
+
+		// Gauss approx colour weighting.
+		const float4 c = colour_in.tex2D(float(x) + 0.5f, float(y+v) + 0.5f);
+		w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
+
+		// Gauss approx weighting function using point distance
+		w = ftl::cuda::spatialWeighting(X,Xi,smoothing*w);
+
+		aX += Ai*w;
+		nX += Ni*w;
+		contrib += w;
 	}
+
+	// Normalise the summed points and normals
+	if (contrib > 0.0f) {
+		nX /= contrib;  // Weighted average normal
+		aX /= contrib;  // Weighted average point (centroid)
+		centroid_out(x,y) = make_float4(aX, 0.0f);
+		normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+	}
+}
+
+/**
+ * Using the aggregated centroids and normals, calculate the signed-distance-
+ * field and move the depth value accordingly using the calculated normal.
+ */
+__global__ void mls_adjust_depth_kernel(
+		TextureObject<float4> normals_in,
+		TextureObject<float4> centroid_in,        // Virtual depth map
+		TextureObject<float> depth_in,
+		TextureObject<float> depth_out,
+		ftl::rgbd::Camera camera) {
+	
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < 0 || y < 0 || x >= depth_out.width() || y >= depth_out.height()) return;
+
+	float3 aX = make_float3(centroid_in(x,y));
+	float3 nX = make_float3(normals_in(x,y));
+
+	float d0 = depth_in.tex2D(x, y);
+	depth_out(x,y) = d0;
+
+	if (d0 > camera.minDepth && d0 < camera.maxDepth && aX.z > camera.minDepth && aX.z < camera.maxDepth) {
+		float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+
+		// Signed-Distance Field function
+		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
+
+		// Calculate new point using SDF function to adjust depth (and position)
+		X = X - nX * fX;
 		
+		depth_out(x,y) = X.z;
+	}
+}
+
+
+void ftl::cuda::mls_aggr_horiz(
+		ftl::cuda::TextureObject<uchar4> &region,
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<float> &depth_in,
+		ftl::cuda::TextureObject<float4> &centroid_out,
+		ftl::cuda::TextureObject<uchar4> &colour_in,
+		float smoothing,
+		float colour_smoothing,
+		int radius,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((normals_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	switch(radius) {
+	case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	default: return;
+	}
+	cudaSafeCall( cudaGetLastError() );
+
+
+	#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	#endif
+}
+
+void ftl::cuda::mls_aggr_vert(
+		ftl::cuda::TextureObject<uchar4> &region,
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<float4> &centroid_in,
+		ftl::cuda::TextureObject<float4> &centroid_out,
+		ftl::cuda::TextureObject<uchar4> &colour_in,
+		ftl::cuda::TextureObject<float> &depth_in,
+		float smoothing,
+		float colour_smoothing,
+		int radius,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((normals_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	switch(radius) {
+	case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	default: return;
+	}
+	cudaSafeCall( cudaGetLastError() );
+
+
+	#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	#endif
+}
+
+void ftl::cuda::mls_adjust_depth(
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &centroid_in,
+		ftl::cuda::TextureObject<float> &depth_in,
+		ftl::cuda::TextureObject<float> &depth_out,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_in, depth_out, camera);
 	cudaSafeCall( cudaGetLastError() );



--- a/components/operators/src/smoothing.cpp
+++ b/components/operators/src/smoothing.cpp
@@ -6,6 +6,7 @@
 using ftl::operators::HFSmoother;
 using ftl::operators::SimpleMLS;
 using ftl::operators::ColourMLS;
+using ftl::operators::AggreMLS;
 using ftl::operators::AdaptiveMLS;
 using ftl::operators::SmoothChannel;
 using ftl::codecs::Channel;
@@ -230,6 +231,102 @@ bool ColourMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgbd::So
 }


+// ====== Aggregating MLS ======================================================
+
+AggreMLS::AggreMLS(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) {
+
+}
+
+AggreMLS::~AggreMLS() {
+
+}
+
+bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgbd::Source *s, cudaStream_t stream) {
+	float thresh = config()->value("mls_threshold", 0.4f);
+	float col_smooth = config()->value("mls_colour_smoothing", 30.0f);
+	int iters = config()->value("mls_iterations", 3);
+	int radius = config()->value("mls_radius",5);
+	bool aggre = config()->value("aggregation", true);
+
+	if (!in.hasChannel(Channel::Normals)) {
+		LOG(ERROR) << "Required normals channel missing for MLS";
+		return false;
+	}
+
+	auto size = in.get<GpuMat>(Channel::Depth).size();
+	centroid_horiz_.create(size.height, size.width);
+	normals_horiz_.create(size.height, size.width);
+	centroid_vert_.create(size.width, size.height);
+
+	// FIXME: Assume in and out are the same frame.
+	for (int i=0; i<iters; ++i) {
+
+		if (aggre) {
+			ftl::cuda::mls_aggr_horiz(
+				in.createTexture<uchar4>(Channel::Support1),
+				in.createTexture<float4>(Channel::Normals),
+				normals_horiz_,
+				in.createTexture<float>(Channel::Depth),
+				centroid_horiz_,
+				in.createTexture<uchar4>(Channel::Colour),
+				thresh,
+				col_smooth,
+				radius,
+				s->parameters(),
+				stream
+			);
+
+			ftl::cuda::mls_aggr_vert(
+				in.createTexture<uchar4>(Channel::Support1),
+				normals_horiz_,
+				in.createTexture<float4>(Channel::Normals),
+				centroid_horiz_,
+				centroid_vert_,
+				in.createTexture<uchar4>(Channel::Colour),
+				in.createTexture<float>(Channel::Depth),
+				thresh,
+				col_smooth,
+				radius,
+				s->parameters(),
+				stream
+			);
+
+			ftl::cuda::mls_adjust_depth(
+				in.createTexture<float4>(Channel::Normals),
+				centroid_vert_,
+				in.createTexture<float>(Channel::Depth),
+				in.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)),
+				s->parameters(),
+				stream
+			);
+
+			in.swapChannels(Channel::Depth, Channel::Depth2);
+			//in.swapChannels(Channel::Normals, Channel::Points);
+
+		} else {
+			ftl::cuda::colour_mls_smooth_csr(
+				in.createTexture<uchar4>(Channel::Support1),
+				in.createTexture<float4>(Channel::Normals),
+				in.createTexture<float4>(Channel::Points, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+				in.createTexture<float>(Channel::Depth),
+				in.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+				in.createTexture<uchar4>(Channel::Colour),
+				thresh,
+				col_smooth,
+				false,
+				s->parameters(),
+				stream
+			);
+
+			in.swapChannels(Channel::Depth, Channel::Depth2);
+			in.swapChannels(Channel::Normals, Channel::Points);
+		}
+	}
+
+	return true;
+}
+
+
 // ====== Adaptive MLS =========================================================

 AdaptiveMLS::AdaptiveMLS(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) {

--- a/components/operators/src/smoothing_cuda.hpp
+++ b/components/operators/src/smoothing_cuda.hpp
@@ -42,6 +42,41 @@ void colour_mls_smooth_csr(
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);

+void mls_adjust_depth(
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &centroid_in,
+		ftl::cuda::TextureObject<float> &depth_in,
+		ftl::cuda::TextureObject<float> &depth_out,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream);
+
+void mls_aggr_horiz(
+		ftl::cuda::TextureObject<uchar4> &region,
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<float> &depth_in,
+		ftl::cuda::TextureObject<float4> &centroid_out,
+		ftl::cuda::TextureObject<uchar4> &colour_in,
+		float smoothing,
+		float colour_smoothing,
+		int radius,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream);
+
+void mls_aggr_vert(
+		ftl::cuda::TextureObject<uchar4> &region,
+		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<float4> &centroid_in,
+		ftl::cuda::TextureObject<float4> &centroid_out,
+		ftl::cuda::TextureObject<uchar4> &colour_in,
+		ftl::cuda::TextureObject<float> &depth_in,
+		float smoothing,
+		float colour_smoothing,
+		int radius,
+		const ftl::rgbd::Camera &camera,
+		cudaStream_t stream);
+
 void adaptive_mls_smooth(
 		ftl::cuda::TextureObject<float4> &normals_in,
 		ftl::cuda::TextureObject<float4> &normals_out,