diff --git a/components/common/cpp/include/ftl/cuda_matrix_util.hpp b/components/common/cpp/include/ftl/cuda_matrix_util.hpp
index 4dd1db7fa8a58c84a40f9d93abd8cc0a492b2792..cbb82052da87e2e5ddca271afe79233639a379bb 100644
--- a/components/common/cpp/include/ftl/cuda_matrix_util.hpp
+++ b/components/common/cpp/include/ftl/cuda_matrix_util.hpp
@@ -750,7 +750,7 @@ public:
 	}
 
 	//! returns the 3x3 part of the matrix
-	inline __device__ __host__ float3x3 getFloat3x3() {
+	inline __device__ __host__ float3x3 getFloat3x3() const {
 		float3x3 ret;
 		ret.m11 = m11;	ret.m12 = m12;	ret.m13 = m13;
 		ret.m21 = m21;	ret.m22 = m22;	ret.m23 = m23;
@@ -1086,7 +1086,7 @@ public:
 
 
 	//! returns the 3x3 part of the matrix
-	inline __device__ __host__ float3x3 getFloat3x3() {
+	inline __device__ __host__ float3x3 getFloat3x3() const {
 		float3x3 ret;
 		ret.m11 = m11;	ret.m12 = m12;	ret.m13 = m13;
 		ret.m21 = m21;	ret.m22 = m22;	ret.m23 = m23;
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index 5eb5cb71c41692fb95bbc6c9300396f36766d588..23a80cb7c0fd20bfd9d63842a467051d737f415c 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -1,39 +1,43 @@
 set(OPERSRC
-	src/smoothing.cpp
-	src/smoothing.cu
-	src/mls.cu
-	src/smoothchan.cu
+	src/surface/smoothing.cpp
+	src/surface/smoothing.cu
+	src/surface/mls/image_basic.cu
+	src/surface/mls.cu
+	src/analysis/local/smoothchan.cu
 	src/operator.cpp
-	src/colours.cpp
-	src/normals.cpp
-	src/filling.cpp
-	src/filling.cu
+	src/misc/colours.cpp
+	src/analysis/local/normals.cpp
+	src/surface/filling.cpp
+	src/surface/filling.cu
 	src/disparity/libstereo.cpp
 	src/disparity/disp2depth.cu
 	src/disparity/disparity_to_depth.cpp
 	src/disparity/bilateral_filter.cpp
 	src/disparity/opencv/disparity_bilateral_filter.cpp
 	src/disparity/opencv/disparity_bilateral_filter.cu
-	src/segmentation.cu
-	src/segmentation.cpp
-	src/mask.cu
-	src/mask.cpp
-	src/antialiasing.cpp
-	src/antialiasing.cu
+	src/analysis/segmentation/segmentation.cu
+	src/analysis/segmentation/segmentation.cpp
+	src/analysis/local/mask.cu
+	src/analysis/local/mask.cpp
+	src/misc/antialiasing.cpp
+	src/misc/antialiasing.cu
 	src/fusion/mvmls.cpp
 	src/fusion/correspondence.cu
 	src/fusion/correspondence_depth.cu
 	src/fusion/correspondence_util.cu
 	src/fusion/mls_aggr.cu
-	src/clipping.cpp
-	src/depth.cpp
-	src/detectandtrack.cpp
-	src/aruco.cpp
-	src/weighting.cpp
-	src/weighting.cu
-	src/poser.cpp
-	src/gt_analysis.cpp
-	src/gt_analysis.cu
+	src/fusion/smoothing/mls_multi_weighted.cu
+	src/fusion/carving/carver.cu
+	src/fusion/fusion.cpp
+	src/misc/clipping.cpp
+	src/disparity/depth.cpp
+	src/analysis/tracking/detectandtrack.cpp
+	src/analysis/tracking/aruco.cpp
+	src/analysis/local/weighting.cpp
+	src/analysis/local/weighting.cu
+	src/misc/poser.cpp
+	src/analysis/evaluation/gt_analysis.cpp
+	src/analysis/evaluation/gt_analysis.cu
 )
 
 if (HAVE_LIBSGM)
@@ -42,8 +46,8 @@ endif (HAVE_LIBSGM)
 
 if (HAVE_OPTFLOW)
 	list(APPEND OPERSRC
-		src/nvopticalflow.cpp
-		src/opticalflow.cu
+		src/analysis/tracking/nvopticalflow.cpp
+		src/analysis/tracking/opticalflow.cu
 		src/disparity/optflow_smoothing.cu
 		src/disparity/optflow_smoothing.cpp)
 endif()
diff --git a/components/operators/include/ftl/cuda/fixed.hpp b/components/operators/include/ftl/cuda/fixed.hpp
index efb0c09caabd5e41f024679e8e80a2c6ea2b6df3..16e1e6ae0769fbfa16bad8931a58ac23ff43cb7b 100644
--- a/components/operators/include/ftl/cuda/fixed.hpp
+++ b/components/operators/include/ftl/cuda/fixed.hpp
@@ -3,7 +3,7 @@
 
 template <int FRAC>
 __device__ inline float fixed2float(short v) {
-    return v / (1 << FRAC);
+    return float(v) / float(1 << FRAC);
 }
 
 template <int FRAC>
diff --git a/components/operators/include/ftl/operators/cuda/carver.hpp b/components/operators/include/ftl/operators/cuda/carver.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dd0c04a884610211ca7e6ffa8c19702ff1f25cb
--- /dev/null
+++ b/components/operators/include/ftl/operators/cuda/carver.hpp
@@ -0,0 +1,34 @@
+#ifndef _FTL_CUDA_CARVER_HPP_
+#define _FTL_CUDA_CARVER_HPP_
+
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+#include <ftl/rgbd/camera.hpp>
+
+namespace ftl {
+namespace cuda {
+
+/**
+ * Carve `in` using `ref` as visibility reference.
+ */
+void depth_carve(
+	cv::cuda::GpuMat &in,
+	const cv::cuda::GpuMat &ref,
+	//const cv::cuda::GpuMat &in_colour,
+	//const cv::cuda::GpuMat &ref_colour,
+	//cv::cuda::GpuMat &colour_scale,
+	const float4x4 &transform,
+	const ftl::rgbd::Camera &incam,
+	const ftl::rgbd::Camera &refcam,
+	cudaStream_t stream);
+
+void apply_colour_scaling(
+	const cv::cuda::GpuMat &scale,
+	cv::cuda::GpuMat &colour,
+	int radius,
+	cudaStream_t stream);
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..884fd0da46db2d61e68aba175603d7a04717f407
--- /dev/null
+++ b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp
@@ -0,0 +1,76 @@
+#ifndef _FTL_CUDA_MLS_MULTIINTENSITY_HPP_
+#define _FTL_CUDA_MLS_MULTIINTENSITY_HPP_
+
+#include <ftl/cuda_common.hpp>
+#include <ftl/rgbd/camera.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+
+namespace ftl {
+namespace cuda {
+
+// TODO: 4th channel of normals could be used as curvature estimate? Then can
+// adaptively adjust the smoothing amount by the degree of curvature. This
+// curvature value must be low for noise data, is only high if a consistent
+// high curvature is detected over a radius. Other option is to also or instead
+// use the curvature value as a feature, to only smooth against points with
+// similar curvature?
+
+/**
+ * For a particular viewpoint, use a set of other viewpoints to estimate a
+ * smooth surface for the focus viewpoint. This version of MLS uses absolute
+ * difference of some 8-bit value as a weight, this can be raw intensity or
+ * a local contrast measure. This `prime`, `gather`, `adjust` cycle should
+ * be iterated 2-3 times, but perhaps better to do one iteration of each view
+ * first before repeating a view.
+ * 
+ * Note: Have one instance per frameset because the internal buffers are
+ * allocated based upon frame image size.
+ */
+class MLSMultiIntensity
+{
+public:
+	MLSMultiIntensity(int radius);
+	~MLSMultiIntensity();
+
+	void prime(
+		const cv::cuda::GpuMat &depth_prime,
+		const cv::cuda::GpuMat &intensity_prime,
+		const ftl::rgbd::Camera &cam_prime,
+		const float4x4 &pose_prime,
+		cudaStream_t stream
+	);
+
+	void gatherPrime(float smoothing, cudaStream_t stream);
+
+	void gather(
+		const cv::cuda::GpuMat &depth_src,
+		const cv::cuda::GpuMat &normals_src,
+		const cv::cuda::GpuMat &intensity_src,
+		const ftl::rgbd::Camera &cam_src,
+		const float4x4 &pose_src,
+		float smoothing,
+		cudaStream_t stream
+	);
+
+	void adjust(
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &normals_out,
+		cudaStream_t stream
+	);
+
+private:
+	cv::cuda::GpuMat depth_prime_;
+	cv::cuda::GpuMat intensity_prime_;
+	ftl::rgbd::Camera cam_prime_;
+	float4x4 pose_prime_;
+	int radius_;
+
+	cv::cuda::GpuMat normal_accum_;
+	cv::cuda::GpuMat centroid_accum_;
+	cv::cuda::GpuMat weight_accum_;
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/components/operators/include/ftl/operators/cuda/mls_cuda.hpp b/components/operators/include/ftl/operators/cuda/mls_cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9790595d6aa6350369f4c4cb88adb8a1728531ca
--- /dev/null
+++ b/components/operators/include/ftl/operators/cuda/mls_cuda.hpp
@@ -0,0 +1,60 @@
+#ifndef _FTL_CUDA_MLS_HPP_
+#define _FTL_CUDA_MLS_HPP_
+
+#include <ftl/rgbd/camera.hpp>
+#include <ftl/cuda_common.hpp>
+
+namespace ftl {
+namespace cuda {
+
+/**
+ * Basic Moving Least Squares smoothing on a single depth map. Outputs boths
+ * normals and depth values. This is a single iteration of an algorithm that
+ * should iterate 2-3 times. Normals can use some basic estimation as initial
+ * input since they are smoothed by MLS.
+ * 
+ * @param normals_in 4 channel 16-bit float (half).
+ * @param normals_out 4 channel 16-bit float (half).
+ * @param depth_in 1 channel 32-bit float
+ * @param depth_out 1 channel 32-bit float
+ * @param smoothing Gaussian radius in depth units
+ * @param radius Window radius for smoothing
+ * @param camera Camera intrinsics
+ * @param stream Optional CUDA stream
+ */
+void mls_smooth(
+	const cv::cuda::GpuMat &normals_in,
+	cv::cuda::GpuMat &normals_out,
+	const cv::cuda::GpuMat &depth_in,
+	cv::cuda::GpuMat &depth_out,
+	float smoothing,
+	int radius,
+	const ftl::rgbd::Camera &camera,
+	cudaStream_t stream);
+
+/**
+ * Basic Moving Least Squares smoothing on a single depth map. Outputs just the
+ * smoothed normals. This is a single iteration of an algorithm that should
+ * iterate 2-3 times.
+ * 
+ * @param normals_in 4 channel 16-bit float (half).
+ * @param normals_out 4 channel 16-bit float (half).
+ * @param depth_in 1 channel 32-bit float
+ * @param smoothing Gaussian radius in depth units
+ * @param radius Window radius for smoothing
+ * @param camera Camera intrinsics
+ * @param stream Optional CUDA stream
+ */
+void mls_smooth(
+	const cv::cuda::GpuMat &normals_in,
+	cv::cuda::GpuMat &normals_out,
+	const cv::cuda::GpuMat &depth_in,
+	float smoothing,
+	int radius,
+	const ftl::rgbd::Camera &camera,
+	cudaStream_t stream);
+
+}
+}
+
+#endif  // _FTL_CUDA_MLS_HPP_
diff --git a/components/operators/src/smoothing_cuda.hpp b/components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp
similarity index 91%
rename from components/operators/src/smoothing_cuda.hpp
rename to components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp
index c44c7787b9fe8dd18d6e148d96c8d71803ba368c..f1934cdae28f4a3a90cf007c3572f3da017b9357 100644
--- a/components/operators/src/smoothing_cuda.hpp
+++ b/components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp
@@ -7,16 +7,6 @@
 namespace ftl {
 namespace cuda {
 
-void mls_smooth(
-		ftl::cuda::TextureObject<half4> &normals_in,
-		ftl::cuda::TextureObject<half4> &normals_out,
-		ftl::cuda::TextureObject<float> &depth_in,
-		ftl::cuda::TextureObject<float> &depth_out,
-		float smoothing,
-		int radius,
-		const ftl::rgbd::Camera &camera,
-		cudaStream_t stream);
-
 void colour_mls_smooth(
 		ftl::cuda::TextureObject<half4> &normals_in,
 		ftl::cuda::TextureObject<half4> &normals_out,
diff --git a/components/operators/include/ftl/operators/fusion.hpp b/components/operators/include/ftl/operators/fusion.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed0e5f9cd324f1e57979d9f20fc9d9a98f796ce0
--- /dev/null
+++ b/components/operators/include/ftl/operators/fusion.hpp
@@ -0,0 +1,29 @@
+#ifndef _FTL_OPERATORS_FUSION_HPP_
+#define _FTL_OPERATORS_FUSION_HPP_
+
+#include <ftl/operators/operator.hpp>
+#include <ftl/operators/cuda/mls/multi_intensity.hpp>
+#include <vector>
+
+namespace ftl {
+namespace operators {
+
+class Fusion : public ftl::operators::Operator {
+	public:
+	Fusion(ftl::operators::Graph *g, ftl::Configurable*);
+	~Fusion();
+
+	inline Operator::Type type() const override { return Operator::Type::ManyToMany; }
+
+	bool apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) override;
+
+	private:
+	ftl::cuda::MLSMultiIntensity mls_;
+	std::vector<cv::cuda::GpuMat> weights_;
+	cv::cuda::GpuMat temp_;
+};
+
+}
+}
+
+#endif
diff --git a/components/operators/src/gt_analysis.cpp b/components/operators/src/analysis/evaluation/gt_analysis.cpp
similarity index 100%
rename from components/operators/src/gt_analysis.cpp
rename to components/operators/src/analysis/evaluation/gt_analysis.cpp
diff --git a/components/operators/src/gt_analysis.cu b/components/operators/src/analysis/evaluation/gt_analysis.cu
similarity index 100%
rename from components/operators/src/gt_analysis.cu
rename to components/operators/src/analysis/evaluation/gt_analysis.cu
diff --git a/components/operators/src/mask.cpp b/components/operators/src/analysis/local/mask.cpp
similarity index 100%
rename from components/operators/src/mask.cpp
rename to components/operators/src/analysis/local/mask.cpp
diff --git a/components/operators/src/mask.cu b/components/operators/src/analysis/local/mask.cu
similarity index 100%
rename from components/operators/src/mask.cu
rename to components/operators/src/analysis/local/mask.cu
diff --git a/components/operators/src/normals.cpp b/components/operators/src/analysis/local/normals.cpp
similarity index 100%
rename from components/operators/src/normals.cpp
rename to components/operators/src/analysis/local/normals.cpp
diff --git a/components/operators/src/smoothchan.cu b/components/operators/src/analysis/local/smoothchan.cu
similarity index 98%
rename from components/operators/src/smoothchan.cu
rename to components/operators/src/analysis/local/smoothchan.cu
index ea40854fccd7579fc3973a6acb41543ea7bc75d5..2766e6d23ab37f92e3a56bc0938a91df9ac45bd0 100644
--- a/components/operators/src/smoothchan.cu
+++ b/components/operators/src/analysis/local/smoothchan.cu
@@ -1,4 +1,4 @@
-#include "smoothing_cuda.hpp"
+#include <ftl/operators/cuda/smoothing_cuda.hpp>
 
 #include <ftl/cuda/weighting.hpp>
 
diff --git a/components/operators/src/weighting.cpp b/components/operators/src/analysis/local/weighting.cpp
similarity index 100%
rename from components/operators/src/weighting.cpp
rename to components/operators/src/analysis/local/weighting.cpp
diff --git a/components/operators/src/weighting.cu b/components/operators/src/analysis/local/weighting.cu
similarity index 100%
rename from components/operators/src/weighting.cu
rename to components/operators/src/analysis/local/weighting.cu
diff --git a/components/operators/src/weighting_cuda.hpp b/components/operators/src/analysis/local/weighting_cuda.hpp
similarity index 100%
rename from components/operators/src/weighting_cuda.hpp
rename to components/operators/src/analysis/local/weighting_cuda.hpp
diff --git a/components/operators/src/segmentation.cpp b/components/operators/src/analysis/segmentation/segmentation.cpp
similarity index 100%
rename from components/operators/src/segmentation.cpp
rename to components/operators/src/analysis/segmentation/segmentation.cpp
diff --git a/components/operators/src/segmentation.cu b/components/operators/src/analysis/segmentation/segmentation.cu
similarity index 100%
rename from components/operators/src/segmentation.cu
rename to components/operators/src/analysis/segmentation/segmentation.cu
diff --git a/components/operators/src/segmentation_cuda.hpp b/components/operators/src/analysis/segmentation/segmentation_cuda.hpp
similarity index 100%
rename from components/operators/src/segmentation_cuda.hpp
rename to components/operators/src/analysis/segmentation/segmentation_cuda.hpp
diff --git a/components/operators/src/aruco.cpp b/components/operators/src/analysis/tracking/aruco.cpp
similarity index 100%
rename from components/operators/src/aruco.cpp
rename to components/operators/src/analysis/tracking/aruco.cpp
diff --git a/components/operators/src/detectandtrack.cpp b/components/operators/src/analysis/tracking/detectandtrack.cpp
similarity index 100%
rename from components/operators/src/detectandtrack.cpp
rename to components/operators/src/analysis/tracking/detectandtrack.cpp
diff --git a/components/operators/src/nvopticalflow.cpp b/components/operators/src/analysis/tracking/nvopticalflow.cpp
similarity index 100%
rename from components/operators/src/nvopticalflow.cpp
rename to components/operators/src/analysis/tracking/nvopticalflow.cpp
diff --git a/components/operators/src/opticalflow.cu b/components/operators/src/analysis/tracking/opticalflow.cu
similarity index 100%
rename from components/operators/src/opticalflow.cu
rename to components/operators/src/analysis/tracking/opticalflow.cu
diff --git a/components/operators/src/opticalflow_cuda.hpp b/components/operators/src/analysis/tracking/opticalflow_cuda.hpp
similarity index 100%
rename from components/operators/src/opticalflow_cuda.hpp
rename to components/operators/src/analysis/tracking/opticalflow_cuda.hpp
diff --git a/components/operators/src/depth.cpp b/components/operators/src/disparity/depth.cpp
similarity index 100%
rename from components/operators/src/depth.cpp
rename to components/operators/src/disparity/depth.cpp
diff --git a/components/operators/src/fusion/carving/carver.cu b/components/operators/src/fusion/carving/carver.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d1dd480ef0cbbf2ea4a75595fd6af4fee712297d
--- /dev/null
+++ b/components/operators/src/fusion/carving/carver.cu
@@ -0,0 +1,245 @@
+#include <ftl/operators/cuda/carver.hpp>
+#include <cudatl/fixed.hpp>
+#include <ftl/cuda/weighting.hpp>
+
+__device__ inline float depthErrorCoef(const ftl::rgbd::Camera &cam, float disps=1.0f) {
+	return disps / (cam.baseline*cam.fx);
+}
+
+// ==== Reverse Verify Result ==================================================
+
+// No colour scale calculations
+/*__global__ void reverse_check_kernel(
+	float* __restrict__ depth_in,
+	const float* __restrict__ depth_original,
+	int pitch4,
+	int opitch4,
+	float4x4 transformR,
+	ftl::rgbd::Camera vintrin,
+	ftl::rgbd::Camera ointrin
+) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < 0 || x >= vintrin.width || y < 0 || y >= vintrin.height) return;
+
+	float d = depth_in[y*pitch4+x];
+
+	const float err_coef = 0.001f; //depthErrorCoef(ointrin);
+	
+	int count = 10;  // Allow max 2cm of carving.
+	while (--count >= 0) {
+		float3 campos = transformR * vintrin.screenToCam(x,y,d);
+		int2 spos = ointrin.camToScreen<int2>(campos);
+		int ox = spos.x;
+		int oy = spos.y;
+
+		if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) {
+			float d2 = depth_original[oy*opitch4+ox];
+
+			// TODO: Threshold comes from depth error characteristics
+			// If the value is significantly further then carve. Depth error
+			// is not always easy to calculate, depends on source.
+			if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*d2*err_coef)) break;
+
+			d += 0.002f;
+		} else break;
+	}
+
+	// Too much carving means just outright remove the point.
+	depth_in[y*pitch4+x] = (count < 0) ? 0.0f : d;
+}*/
+
+__global__ void reverse_check_kernel(
+	float* __restrict__ depth_in,
+	const float* __restrict__ depth_original,
+	//const uchar4* __restrict__ in_colour,
+	//const uchar4* __restrict__ ref_colour,
+	//int8_t* __restrict__ colour_scale,
+	int pitch4,
+	//int pitch,
+	int opitch4,
+	//int in_col_pitch4,
+	//int o_col_pitch4,
+	//int cwidth,
+	//int cheight,
+	float4x4 transformR,
+	ftl::rgbd::Camera vintrin,
+	ftl::rgbd::Camera ointrin
+) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < 0 || x >= vintrin.width || y < 0 || y >= vintrin.height) return;
+
+	float d = depth_in[y*pitch4+x];
+
+	// TODO: Externally provide the error coefficient
+	const float err_coef = 0.0005f; //depthErrorCoef(ointrin);
+
+	int ox = 0;
+	int oy = 0;
+
+	bool match = false;
+	
+	int count = 10;  // Allow max 2cm of carving.
+	while (--count >= 0) {
+		float3 campos = transformR * vintrin.screenToCam(x,y,d);
+		int2 spos = ointrin.camToScreen<int2>(campos);
+		ox = spos.x;
+		oy = spos.y;
+
+		if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) {
+			float d2 = depth_original[oy*opitch4+ox];
+
+			// TODO: Threshold comes from depth error characteristics
+			// If the value is significantly further then carve. Depth error
+			// is not always easy to calculate, depends on source.
+			if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*d2*err_coef)) {
+				match = fabsf(campos.z - d2) < d2*d2*err_coef; break;
+			}
+
+			d += 0.002f;  // TODO: Should this be += error or what?
+		} else break;
+	}
+
+	// We found a match, so do a colour check
+	//float idiff = 127.0f;
+	//if (match) {
+	/*	// Generate colour scaling
+		const float ximgscale = float(cwidth) / float(ointrin.width);
+		ox = float(ox) * ximgscale;
+		const float yimgscale = float(cheight) / float(ointrin.height);
+		oy = float(oy) * yimgscale;
+
+		int cy = float(y) * yimgscale;
+		int cx = float(x) * ximgscale;
+
+		const uchar4 vcol = in_colour[cy*in_col_pitch4+cx];
+		const uchar4 ocol = (match) ? ref_colour[oy*o_col_pitch4+ox] : vcol;
+
+		float i1 = (0.2126f*float(vcol.z) + 0.7152f*float(vcol.y) + 0.0722f*float(vcol.x));
+		float i2 = (0.2126f*float(ocol.z) + 0.7152f*float(ocol.y) + 0.0722f*float(ocol.x));
+		idiff = i2-i1;
+
+		//const float scaleX = (vcol.x == 0) ? 1.0f : float(ocol.x) / float(vcol.x);
+		//const float scaleY = (vcol.y == 0) ? 1.0f : float(ocol.y) / float(vcol.y);
+		//const float scaleZ = (vcol.z == 0) ? 1.0f : float(ocol.z) / float(vcol.z);
+		//scale = (0.2126f*scaleZ + 0.7152f*scaleY + 0.0722f*scaleX);
+	//}
+	colour_scale[x+pitch*y] = int8_t(max(-127.0f,min(127.0f,idiff)));*/
+
+	// Too much carving means just outright remove the point.
+	depth_in[y*pitch4+x] = (count < 0) ? 0.0f : d;
+}
+
+void ftl::cuda::depth_carve(
+	cv::cuda::GpuMat &depth_in,
+	const cv::cuda::GpuMat &depth_original,
+	//const cv::cuda::GpuMat &in_colour,
+	//const cv::cuda::GpuMat &ref_colour,
+	//cv::cuda::GpuMat &colour_scale,
+	const float4x4 &transformR,
+	const ftl::rgbd::Camera &vintrin,
+	const ftl::rgbd::Camera &ointrin,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 16;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	//colour_scale.create(depth_in.size(), CV_8U);
+
+	reverse_check_kernel<<<gridSize, blockSize, 0, stream>>>(
+		depth_in.ptr<float>(),
+		depth_original.ptr<float>(),
+		//in_colour.ptr<uchar4>(),
+		//ref_colour.ptr<uchar4>(),
+		//colour_scale.ptr<int8_t>(),
+		depth_in.step1(),
+		//colour_scale.step1(),
+		depth_original.step1(),
+		//in_colour.step1()/4,
+		//ref_colour.step1()/4,
+		//in_colour.cols,
+		//in_colour.rows,
+		transformR,
+		vintrin, ointrin);
+
+	cudaSafeCall( cudaGetLastError() );
+}
+
+// ==== Apply colour scale =====================================================
+
+template <int RADIUS>
+__global__ void apply_colour_scaling_kernel(
+	const int8_t* __restrict__ scale,
+	uchar4* __restrict__ colour,
+	int spitch,
+	int cpitch,
+	int swidth,
+	int sheight,
+	int cwidth,
+	int cheight
+) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x >= 0 && x < cwidth && y >= 0 && y < cheight) {
+		int sx = (float(swidth) / float(cwidth)) * float(x);
+		int sy = (float(sheight) / float(cheight)) * float(y);
+
+		float s = 0.0f;
+		int count = 0;
+		//float mindiff = 100.0f;
+
+		for (int v=-RADIUS; v<=RADIUS; ++v) {
+			#pragma unroll
+			for (int u=-RADIUS; u<=RADIUS; ++u) {
+				float ns = (sx >= RADIUS && sy >= RADIUS && sx < swidth-RADIUS && sy < sheight-RADIUS) ? scale[sx+u+(sy+v)*spitch] : 0.0f;
+				if (fabsf(ns) < 30) {
+					s += ns;
+					++count;
+				}
+			}
+		}
+
+		if (count > 0) s /= float(count);
+
+		uchar4 c = colour[x+y*cpitch];
+		colour[x+y*cpitch] = make_uchar4(
+			max(0.0f, min(255.0f, float(c.x) + s)),
+			max(0.0f, min(255.0f, float(c.y) + s)),
+			max(0.0f, min(255.0f, float(c.z) + s)),
+			255.0f
+		);
+	}
+}
+
+void ftl::cuda::apply_colour_scaling(
+	const cv::cuda::GpuMat &scale,
+	cv::cuda::GpuMat &colour,
+	int radius,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 16;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((colour.cols + THREADS_X - 1)/THREADS_X, (colour.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	apply_colour_scaling_kernel<2><<<gridSize, blockSize, 0, stream>>>(
+		scale.ptr<int8_t>(),
+		colour.ptr<uchar4>(),
+		scale.step1(),
+		colour.step1()/4,
+		scale.cols,
+		scale.rows,
+		colour.cols,
+		colour.rows
+	);
+
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/fusion/fusion.cpp b/components/operators/src/fusion/fusion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..334820a3a12d590bbfcdee51390b2fef1f6b41a7
--- /dev/null
+++ b/components/operators/src/fusion/fusion.cpp
@@ -0,0 +1,124 @@
+#include <ftl/operators/fusion.hpp>
+#include <ftl/operators/cuda/carver.hpp>
+#include <ftl/utility/matrix_conversion.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
+
+#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/cudawarping.hpp>
+
+using ftl::operators::Fusion;
+using ftl::codecs::Channel;
+using cv::cuda::GpuMat;
+
+Fusion::Fusion(ftl::operators::Graph *g, ftl::Configurable *cfg) : ftl::operators::Operator(g, cfg), mls_(3) {
+
+}
+
+Fusion::~Fusion() {
+
+}
+
+bool Fusion::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) {
+	float mls_smoothing = config()->value("mls_smoothing", 0.01f);
+	int mls_iters = config()->value("mls_iterations", 2);
+
+	if (weights_.size() != in.frames.size()) weights_.resize(in.frames.size());
+
+	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+
+	for (size_t i=0; i<in.frames.size(); ++i) {
+		if (!in.hasFrame(i)) continue;
+		const GpuMat &col = in.frames[i].get<GpuMat>(Channel::Colour);
+		const GpuMat &d = in.frames[i].get<GpuMat>(Channel::Depth);
+
+		cv::cuda::cvtColor(col, temp_, cv::COLOR_BGRA2GRAY, 0, cvstream);
+		cv::cuda::resize(temp_, weights_[i], d.size(), 0, 0, cv::INTER_LINEAR, cvstream);
+	}
+
+	if (config()->value("visibility_carving", true)) {
+		for (size_t i=0; i < in.frames.size(); ++i) {
+			if (!in.hasFrame(i)) continue;
+
+			auto &f = in.frames[i].cast<ftl::rgbd::Frame>();
+
+			for (size_t j=0; j < in.frames.size(); ++j) {
+				if (i == j) continue;
+				if (!in.hasFrame(j)) continue;
+
+				auto &ref = in.frames[j].cast<ftl::rgbd::Frame>();
+
+				auto transformR = MatrixConversion::toCUDA(ref.getPose().cast<float>().inverse() * f.getPose().cast<float>());
+
+				ftl::cuda::depth_carve(
+					f.create<cv::cuda::GpuMat>(Channel::Depth),
+					ref.get<cv::cuda::GpuMat>(Channel::Depth),
+					//f.get<cv::cuda::GpuMat>(Channel::Colour),
+					//ref.get<cv::cuda::GpuMat>(Channel::Colour),
+					//colour_scale_,
+					transformR,
+					f.getLeft(),
+					ref.getLeft(),
+					stream
+				);
+			}
+
+			//ftl::cuda::apply_colour_scaling(colour_scale_, f.create<cv::cuda::GpuMat>(Channel::Colour), 3, stream_);
+		}
+	}
+
+	for (int iters=0; iters < mls_iters; ++iters) {
+	for (size_t i=0; i<in.frames.size(); ++i) {
+		if (!in.hasFrame(i)) continue;
+
+		auto &f1 = in.frames[i].cast<ftl::rgbd::Frame>();
+
+		Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
+		d1 = f1.getPose() * d1;
+
+		auto pose1 = MatrixConversion::toCUDA(f1.getPose().cast<float>());
+
+		mls_.prime(
+			f1.get<GpuMat>(Channel::Depth),
+			weights_[i],
+			f1.getLeft(),
+			pose1,
+			stream
+		);
+
+		for (size_t j=0; j<in.frames.size(); ++j) {
+			if (!in.hasFrame(j)) continue;
+			//if (i == j) continue;
+
+			//LOG(INFO) << "Running phase1";
+
+			auto &f2 = in.frames[j].cast<ftl::rgbd::Frame>();
+
+			// Are cameras facing similar enough direction?
+			Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
+			d2 = f2.getPose() * d2;
+			// No, so skip this combination
+			if (d1.dot(d2) <= 0.0) continue;
+
+			auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>());
+
+			mls_.gather(
+				f2.get<GpuMat>(Channel::Depth),
+				f2.get<GpuMat>(Channel::Normals),
+				weights_[j],
+				f2.getLeft(),
+				pose2,
+				mls_smoothing,
+				stream
+			);
+		}
+
+		mls_.adjust(
+			f1.create<GpuMat>(Channel::Depth),
+			f1.create<GpuMat>(Channel::Normals),
+			stream
+		);
+	}
+	}
+
+	return true;
+}
diff --git a/components/operators/src/fusion/mvmls.cpp b/components/operators/src/fusion/mvmls.cpp
index 38328f33ea724a1b687163bc961c2ccec1d6b4fa..15742b05544fea94846db4927ee0434a22c20967 100644
--- a/components/operators/src/fusion/mvmls.cpp
+++ b/components/operators/src/fusion/mvmls.cpp
@@ -1,5 +1,5 @@
 #include <ftl/operators/mvmls.hpp>
-#include "smoothing_cuda.hpp"
+#include <ftl/operators/cuda/smoothing_cuda.hpp>
 #include <ftl/utility/matrix_conversion.hpp>
 #include "mvmls_cuda.hpp"
 #include <ftl/cuda/normals.hpp>
diff --git a/components/operators/src/fusion/smoothing/mls_multi_weighted.cu b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a5e5ded31f193bda1c5b3d24b58994d5717ead32
--- /dev/null
+++ b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu
@@ -0,0 +1,256 @@
+#include <ftl/operators/cuda/mls/multi_intensity.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#include <ftl/cuda/weighting.hpp>
+
+using ftl::cuda::MLSMultiIntensity;
+using cv::cuda::GpuMat;
+
+// ==== Multi image MLS ========================================================
+
+__device__ inline float featureWeight(int f1, int f2) {
+	const float w = (1.0f-(float(abs(f1 - f2)) / 255.0f));
+	return w*w;
+}
+
+/*
+ * Gather points for Moving Least Squares, from each source image
+ */
+ template <int SEARCH_RADIUS, typename T>
+ __global__ void mls_gather_intensity_kernel(
+	const half4* __restrict__ normals_in,
+	half4* __restrict__ normals_out,
+	const float* __restrict__ depth_origin,
+	const float* __restrict__ depth_in,
+	const T* __restrict__ feature_origin,
+	const T* __restrict__ feature_in,
+	float4* __restrict__ centroid_out,
+	float* __restrict__ contrib_out,
+	float smoothing,
+	float4x4 o_2_in,
+	float4x4 in_2_o,
+	float3x3 in_2_o33,
+	ftl::rgbd::Camera camera_origin,
+	ftl::rgbd::Camera camera_in,
+	int npitch_out,
+	int cpitch_out,
+	int wpitch_out,
+	int dpitch_o,
+	int dpitch_i,
+	int npitch_in,
+	int fpitch_o,
+	int fpitch_i
+) {        
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < 0 || y < 0 || x >= camera_origin.width || y >= camera_origin.height) return;
+
+	float3 nX = make_float3(normals_out[y*npitch_out+x]);
+	float3 aX = make_float3(centroid_out[y*cpitch_out+x]);
+    float contrib = contrib_out[y*wpitch_out+x];
+
+	float d0 = depth_origin[x+y*dpitch_o];
+	if (d0 <= camera_origin.minDepth || d0 >= camera_origin.maxDepth) return;
+
+	const int feature1 = feature_origin[x+y*fpitch_o];
+
+	float3 X = camera_origin.screenToCam((int)(x),(int)(y),d0);
+
+	int2 s = camera_in.camToScreen<int2>(o_2_in * X);
+
+    // Neighbourhood
+    for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
+    for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
+		const float d = (s.x+u >= 0 && s.x+u < camera_in.width && s.y+v >= 0 && s.y+v < camera_in.height) ? depth_in[s.x+u+(s.y+v)*dpitch_i] : 0.0f;
+		if (d <= camera_in.minDepth || d >= camera_in.maxDepth) continue;
+
+		// Point and normal of neighbour
+		const float3 Xi = in_2_o * camera_in.screenToCam(s.x+u, s.y+v, d);
+		const float3 Ni = make_float3(normals_in[s.x+u+(s.y+v)*npitch_in]);
+
+		const int feature2 = feature_in[s.x+y+(s.y+v)*fpitch_i];
+
+		// Gauss approx weighting function using point distance
+		const float w = (length(Ni) > 0.0f)
+			? ftl::cuda::spatialWeighting(X,Xi,smoothing) * featureWeight(feature1, feature2)
+			: 0.0f;
+
+		aX += Xi*w;
+		nX += (in_2_o33 * Ni)*w;
+		contrib += w;
+    }
+	}
+
+	normals_out[y*npitch_out+x] = make_half4(nX, 0.0f);
+	centroid_out[y*cpitch_out+x] = make_float4(aX, 0.0f);
+	contrib_out[y*wpitch_out+x] = contrib;
+}
+
+/**
+ * Convert accumulated values into estimate of depth and normals at pixel.
+ */
+ __global__ void mls_reduce_kernel_2(
+	const float4* __restrict__ centroid,
+	const half4* __restrict__ normals,
+	const float* __restrict__ contrib_out,
+	half4* __restrict__ normals_out,
+	float* __restrict__ depth,
+	ftl::rgbd::Camera camera,
+	int npitch_in,
+	int cpitch_in,
+	int wpitch,
+	int npitch,
+	int dpitch
+) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
+		float3 nX = make_float3(normals[y*npitch_in+x]);
+		float3 aX = make_float3(centroid[y*cpitch_in+x]);
+		float contrib = contrib_out[y*wpitch+x];
+
+		//depth[x+y*dpitch] = X.z;
+		normals_out[x+y*npitch] = make_half4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		float d0 = depth[x+y*dpitch];
+		//depth[x+y*dpitch] = 0.0f;
+		if (d0 <= camera.minDepth || d0 >= camera.maxDepth || contrib == 0.0f) return;
+		float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+		
+		nX /= contrib;  // Weighted average normal
+		aX /= contrib;  // Weighted average point (centroid)
+
+		// Signed-Distance Field function
+		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
+
+		// Calculate new point using SDF function to adjust depth (and position)
+		X = X - nX * fX;
+
+		depth[x+y*dpitch] = X.z;
+		normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f);
+	}
+}
+
+
+MLSMultiIntensity::MLSMultiIntensity(int radius)
+	: radius_(radius)
+{
+
+}
+
+MLSMultiIntensity::~MLSMultiIntensity()
+{
+
+}
+
+void MLSMultiIntensity::prime(
+	const GpuMat &depth_prime,
+	const GpuMat &intensity_prime,
+	const ftl::rgbd::Camera &cam_prime,
+	const float4x4 &pose_prime,
+	cudaStream_t stream)
+{
+	depth_prime_ = depth_prime;
+	intensity_prime_ = intensity_prime;
+	cam_prime_ = cam_prime;
+	pose_prime_ = pose_prime;
+
+	centroid_accum_.create(depth_prime.size(), CV_32FC4);
+	normal_accum_.create(depth_prime.size(), CV_16FC4);
+	weight_accum_.create(depth_prime.size(), CV_32F);
+
+	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+
+	// Reset buffers
+	centroid_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
+	weight_accum_.setTo(cv::Scalar(0), cvstream);
+	normal_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
+}
+
+void MLSMultiIntensity::gatherPrime(float smoothing, cudaStream_t stream)
+{
+	// Can use a simpler kernel without pose transformations
+}
+
+void MLSMultiIntensity::gather(
+	const GpuMat &depth_src,
+	const GpuMat &normals_src,
+	const GpuMat &intensity_src,
+	const ftl::rgbd::Camera &cam_src,
+	const float4x4 &pose_src,
+	float smoothing,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	float4x4 inv_pose_src = pose_src;
+	inv_pose_src.invert();
+	float4x4 o_2_in = inv_pose_src * pose_prime_;
+	float4x4 inv_pose_prime = pose_prime_;
+	inv_pose_prime.invert();
+	float4x4 in_2_o = inv_pose_prime * pose_src;
+	float3x3 in_2_o33 = inv_pose_prime.getFloat3x3() * pose_src.getFloat3x3();
+
+	mls_gather_intensity_kernel<3><<<gridSize, blockSize, 0, stream>>>(
+		normals_src.ptr<half4>(),
+		normal_accum_.ptr<half4>(),
+		depth_prime_.ptr<float>(),
+		depth_src.ptr<float>(),
+		intensity_prime_.ptr<uchar>(),
+		intensity_src.ptr<uchar>(),
+		centroid_accum_.ptr<float4>(),
+		weight_accum_.ptr<float>(),
+		smoothing,
+		o_2_in,
+		in_2_o,
+		in_2_o33,
+		cam_prime_,
+		cam_src,
+		normal_accum_.step1()/4,
+		centroid_accum_.step1()/4,
+		weight_accum_.step1(),
+		depth_prime_.step1(),
+		depth_src.step1(),
+		normals_src.step1()/4,
+		intensity_prime_.step1(),
+		intensity_src.step1()
+	);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+void MLSMultiIntensity::adjust(
+	GpuMat &depth_out,
+	GpuMat &normals_out,
+	cudaStream_t stream)
+{
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	normals_out.create(depth_prime_.size(), CV_16FC4);
+	depth_out.create(depth_prime_.size(), CV_32F);
+
+	// FIXME: Depth prime assumed to be same as depth out
+
+	mls_reduce_kernel_2<<<gridSize, blockSize, 0, stream>>>(
+		centroid_accum_.ptr<float4>(),
+		normal_accum_.ptr<half4>(),
+		weight_accum_.ptr<float>(),
+		normals_out.ptr<half4>(),
+		depth_prime_.ptr<float>(),
+		cam_prime_,
+		normal_accum_.step1()/4,
+		centroid_accum_.step1()/4,
+		weight_accum_.step1(),
+		normals_out.step1()/4,
+		depth_prime_.step1()
+	);
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/antialiasing.cpp b/components/operators/src/misc/antialiasing.cpp
similarity index 100%
rename from components/operators/src/antialiasing.cpp
rename to components/operators/src/misc/antialiasing.cpp
diff --git a/components/operators/src/antialiasing.cu b/components/operators/src/misc/antialiasing.cu
similarity index 100%
rename from components/operators/src/antialiasing.cu
rename to components/operators/src/misc/antialiasing.cu
diff --git a/components/operators/src/antialiasing_cuda.hpp b/components/operators/src/misc/antialiasing_cuda.hpp
similarity index 100%
rename from components/operators/src/antialiasing_cuda.hpp
rename to components/operators/src/misc/antialiasing_cuda.hpp
diff --git a/components/operators/src/clipping.cpp b/components/operators/src/misc/clipping.cpp
similarity index 100%
rename from components/operators/src/clipping.cpp
rename to components/operators/src/misc/clipping.cpp
diff --git a/components/operators/src/colours.cpp b/components/operators/src/misc/colours.cpp
similarity index 100%
rename from components/operators/src/colours.cpp
rename to components/operators/src/misc/colours.cpp
diff --git a/components/operators/src/poser.cpp b/components/operators/src/misc/poser.cpp
similarity index 100%
rename from components/operators/src/poser.cpp
rename to components/operators/src/misc/poser.cpp
diff --git a/components/operators/src/filling.cpp b/components/operators/src/surface/filling.cpp
similarity index 100%
rename from components/operators/src/filling.cpp
rename to components/operators/src/surface/filling.cpp
diff --git a/components/operators/src/filling.cu b/components/operators/src/surface/filling.cu
similarity index 100%
rename from components/operators/src/filling.cu
rename to components/operators/src/surface/filling.cu
diff --git a/components/operators/src/filling_cuda.hpp b/components/operators/src/surface/filling_cuda.hpp
similarity index 100%
rename from components/operators/src/filling_cuda.hpp
rename to components/operators/src/surface/filling_cuda.hpp
diff --git a/components/operators/src/mls.cu b/components/operators/src/surface/mls.cu
similarity index 89%
rename from components/operators/src/mls.cu
rename to components/operators/src/surface/mls.cu
index 55813446ab7a62d0913b96d107187b2bb62f0543..0a6373719c5d4655db9d9a2f7bacbadcea63e72c 100644
--- a/components/operators/src/mls.cu
+++ b/components/operators/src/surface/mls.cu
@@ -1,4 +1,4 @@
-#include "smoothing_cuda.hpp"
+#include <ftl/operators/cuda/smoothing_cuda.hpp>
 
 #include <ftl/cuda/weighting.hpp>
 
@@ -7,98 +7,6 @@ using ftl::cuda::TextureObject;
 #define T_PER_BLOCK 8
 #define WARP_SIZE 32
 
-// ===== MLS Smooth ============================================================
-
-/*
- * Smooth depth map using Moving Least Squares
- */
- template <int SEARCH_RADIUS>
- __global__ void mls_smooth_kernel(
-		TextureObject<half4> normals_in,
-		TextureObject<half4> normals_out,
-        TextureObject<float> depth_in,        // Virtual depth map
-		TextureObject<float> depth_out,   // Accumulated output
-		float smoothing,
-        ftl::rgbd::Camera camera) {
-        
-    const int x = blockIdx.x*blockDim.x + threadIdx.x;
-    const int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
-
-	float3 aX = make_float3(0.0f,0.0f,0.0f);
-	float3 nX = make_float3(0.0f,0.0f,0.0f);
-    float contrib = 0.0f;
-
-	float d0 = depth_in.tex2D(x, y);
-	depth_out(x,y) = d0;
-	if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
-	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
-
-    // Neighbourhood
-    for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
-    for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
-		const float d = depth_in.tex2D(x+u, y+v);
-		if (d < camera.minDepth || d > camera.maxDepth) continue;
-
-		// Point and normal of neighbour
-		const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
-		const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
-
-		// Gauss approx weighting function using point distance
-		const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing);
-
-		aX += Xi*w;
-		nX += Ni*w;
-		contrib += w;
-    }
-	}
-	
-	nX /= contrib;  // Weighted average normal
-	aX /= contrib;  // Weighted average point (centroid)
-
-	// Signed-Distance Field function
-	float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
-
-	// Calculate new point using SDF function to adjust depth (and position)
-	X = X - nX * fX;
-	
-	//uint2 screen = camera.camToScreen<uint2>(X);
-
-    //if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
-    //    depth_out(screen.x,screen.y) = X.z;
-	//}
-	depth_out(x,y) = X.z;
-	normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
-}
-
-void ftl::cuda::mls_smooth(
-		ftl::cuda::TextureObject<half4> &normals_in,
-		ftl::cuda::TextureObject<half4> &normals_out,
-		ftl::cuda::TextureObject<float> &depth_in,
-		ftl::cuda::TextureObject<float> &depth_out,
-		float smoothing,
-		int radius,
-		const ftl::rgbd::Camera &camera,
-		cudaStream_t stream) {
-
-	const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	switch (radius) {
-		case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
-		case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
-		case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
-		case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
-		case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
-	}
-	cudaSafeCall( cudaGetLastError() );
-
-
-	#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-	#endif
-}
 
 
 // ===== Colour MLS Smooth =====================================================
diff --git a/components/operators/src/surface/mls/image_basic.cu b/components/operators/src/surface/mls/image_basic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..17d528391f1ca69ea6d92a24346be5c2590f1e98
--- /dev/null
+++ b/components/operators/src/surface/mls/image_basic.cu
@@ -0,0 +1,128 @@
+#include <ftl/operators/cuda/mls_cuda.hpp>
+#include <ftl/cuda/weighting.hpp>
+
+// ===== MLS Smooth ============================================================
+
+/*
+ * Smooth depth map using Moving Least Squares. This version is for a single
+ * depth image and does not use colour.
+ */
+ template <int RADIUS>
+ __global__ void mls_smooth_kernel(
+	const half4* __restrict__ normals_in,
+	half4* __restrict__ normals_out,	// Can be nullptr
+	const float* __restrict__ depth_in,
+	float* __restrict__ depth_out,		// Can be nullptr
+	int npitch,
+	int dpitch,
+	float smoothing,					// Radius of Gaussian in cm
+	ftl::rgbd::Camera camera
+) {	 
+   
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < RADIUS || y < RADIUS || x >= camera.width-RADIUS || y >= camera.height-RADIUS) return;
+
+	float3 aX = make_float3(0.0f,0.0f,0.0f);
+	float3 nX = make_float3(0.0f,0.0f,0.0f);
+    float contrib = 0.0f;
+
+	const float d0 = depth_in[x+y*dpitch];
+	if (depth_out) depth_out[x+y*dpitch] = d0;
+	if (normals_out) normals_out[x+y*npitch] = normals_in[x+y*npitch];
+	if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
+
+	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+
+    // Neighbourhood
+    for (int v=-RADIUS; v<=RADIUS; ++v) {
+    for (int u=-RADIUS; u<=RADIUS; ++u) {
+		const float d = depth_in[x+u+(y+v)*dpitch];
+		if (d < camera.minDepth || d > camera.maxDepth) continue;
+
+		// Point and normal of neighbour
+		const float3 Xi = camera.screenToCam(x+u, y+v, d);
+		const float3 Ni = make_float3(normals_in[x+u+(y+v)*npitch]);
+
+		// Gauss approx weighting function using point distance
+		const float w = (Ni.x+Ni.y+Ni.z == 0.0f) ? 0.0f : ftl::cuda::spatialWeighting(X,Xi,smoothing);
+
+		aX += Xi*w;
+		nX += Ni*w;
+		contrib += w;
+    }
+	}
+	
+	if (contrib > 0.0f) {
+		nX /= contrib;  // Weighted average normal
+		aX /= contrib;  // Weighted average point (centroid)
+
+		// Signed-Distance Field function
+		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
+
+		// Calculate new point using SDF function to adjust depth (and position)
+		X = X - nX * fX;
+	
+		if (depth_out) depth_out[x+y*dpitch] = X.z;
+		if (normals_out) normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f);
+	}
+}
+
+/* One iteration of MLS Smoothing, simple, single image and output depth also. */
+void ftl::cuda::mls_smooth(
+	const cv::cuda::GpuMat &normals_in,
+	cv::cuda::GpuMat &normals_out,
+	const cv::cuda::GpuMat &depth_in,
+	cv::cuda::GpuMat &depth_out,
+	float smoothing,
+	int radius,
+	const ftl::rgbd::Camera &camera,
+	cudaStream_t stream
+) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	normals_out.create(normals_in.size(), CV_16FC4);
+	depth_out.create(depth_in.size(), CV_32F);
+
+	switch (radius) {
+		case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+	}
+	cudaSafeCall( cudaGetLastError() );
+}
+
+/* One iteration of MLS Smoothing, simple, single image, normals only */
+void ftl::cuda::mls_smooth(
+	const cv::cuda::GpuMat &normals_in,
+	cv::cuda::GpuMat &normals_out,
+	const cv::cuda::GpuMat &depth_in,
+	float smoothing,
+	int radius,
+	const ftl::rgbd::Camera &camera,
+	cudaStream_t stream
+) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	normals_out.create(normals_in.size(), CV_16FC4);
+
+	switch (radius) {
+		case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+		case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break;
+	}
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/smoothing.cpp b/components/operators/src/surface/smoothing.cpp
similarity index 97%
rename from components/operators/src/smoothing.cpp
rename to components/operators/src/surface/smoothing.cpp
index 347b6b7754b49363be98578a94e1d049b6ffec1a..dd96fe6fcfaafac8c48b3d17af649c07257ca87a 100644
--- a/components/operators/src/smoothing.cpp
+++ b/components/operators/src/surface/smoothing.cpp
@@ -1,5 +1,6 @@
 #include <ftl/operators/smoothing.hpp>
-#include "smoothing_cuda.hpp"
+#include <ftl/operators/cuda/smoothing_cuda.hpp>
+#include <ftl/operators/cuda/mls_cuda.hpp>
 
 #define LOGURU_REPLACE_GLOG 1
 #include <loguru.hpp>
@@ -163,10 +164,10 @@ bool SimpleMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t
 	// FIXME: Assume in and out are the same frame.
 	for (int i=0; i<iters; ++i) {
 		ftl::cuda::mls_smooth(
-			in.createTexture<half4>(Channel::Normals),
-			temp.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
-			in.createTexture<float>(Channel::Depth),
-			temp.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			in.get<GpuMat>(Channel::Normals),
+			temp.create<GpuMat>(Channel::Normals),
+			in.get<GpuMat>(Channel::Depth),
+			temp.create<GpuMat>(Channel::Depth),
 			thresh,
 			radius,
 			in.getLeftCamera(),
diff --git a/components/operators/src/smoothing.cu b/components/operators/src/surface/smoothing.cu
similarity index 99%
rename from components/operators/src/smoothing.cu
rename to components/operators/src/surface/smoothing.cu
index a2b7377fba011243c08cf5347f3dea4800b8ab51..a1184475317a174e65c30ea087a4922f20e3ccb0 100644
--- a/components/operators/src/smoothing.cu
+++ b/components/operators/src/surface/smoothing.cu
@@ -1,4 +1,4 @@
-#include "smoothing_cuda.hpp"
+#include <ftl/operators/cuda/smoothing_cuda.hpp>
 
 #include <ftl/cuda/weighting.hpp>
 
diff --git a/components/renderers/cpp/CMakeLists.txt b/components/renderers/cpp/CMakeLists.txt
index 5346849e22b23a31471ca6b3a1b52dc8eb37209d..fe93b78bb84000a5a499eda8dd136a5f7c153fae 100644
--- a/components/renderers/cpp/CMakeLists.txt
+++ b/components/renderers/cpp/CMakeLists.txt
@@ -28,7 +28,7 @@ target_include_directories(ftlrender PUBLIC
 	$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/ext/nanogui/include>
 	$<INSTALL_INTERFACE:include>
 	PRIVATE src)
-target_link_libraries(ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS})
+target_link_libraries(ftlrender ftlrgbd ftlcommon cudatl Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS})
 
 target_precompile_headers(ftlrender REUSE_FROM ftldata)
 
diff --git a/components/renderers/cpp/include/ftl/render/CUDARender.hpp b/components/renderers/cpp/include/ftl/render/CUDARender.hpp
index a4e65bce6ba1aa40dbce3c4a8ba24747f258300a..59eb1377685af76a5eed1d78314dd5c9c76b698d 100644
--- a/components/renderers/cpp/include/ftl/render/CUDARender.hpp
+++ b/components/renderers/cpp/include/ftl/render/CUDARender.hpp
@@ -58,9 +58,13 @@ class CUDARender : public ftl::render::FSRenderer {
 	ftl::cuda::TextureObject<float4> accum_;		// 2 is number of channels can render together
 	ftl::cuda::TextureObject<int> contrib_;
 	//ftl::cuda::TextureObject<half4> normals_;
+	cv::cuda::GpuMat colour_scale_;
+	cv::cuda::GpuMat mls_contrib_;
+	cv::cuda::GpuMat mls_centroid_;
+	cv::cuda::GpuMat mls_normals_;
 
-	std::list<ftl::cuda::TextureObject<short2>*> screen_buffers_;
-	std::list<ftl::cuda::TextureObject<float>*> depth_buffers_;
+	std::list<cv::cuda::GpuMat*> screen_buffers_;
+	std::list<cv::cuda::GpuMat*> depth_buffers_;
 	ftl::cuda::TextureObject<float> depth_out_;
 
 	ftl::cuda::Collision *collisions_;
@@ -118,8 +122,8 @@ class CUDARender : public ftl::render::FSRenderer {
 	bool _alreadySeen() const { return last_frame_ == scene_->timestamp(); }
 	void _adjustDepthThresholds(const ftl::rgbd::Camera &fcam);
 
-	ftl::cuda::TextureObject<float> &_getDepthBuffer(const cv::Size &);
-	ftl::cuda::TextureObject<short2> &_getScreenBuffer(const cv::Size &);
+	cv::cuda::GpuMat &_getDepthBuffer(const cv::Size &);
+	cv::cuda::GpuMat &_getScreenBuffer(const cv::Size &);
 
 	inline ftl::codecs::Channel _getDepthChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Depth : ftl::codecs::Channel::Depth2; }
 	inline ftl::codecs::Channel _getNormalsChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Normals : ftl::codecs::Channel::Normals2; }
diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp
index dab897d5696f78ebffbec097485778bebd57c03b..0f888dec2b9b75b1de77c791f9295d1563c113b5 100644
--- a/components/renderers/cpp/src/CUDARender.cpp
+++ b/components/renderers/cpp/src/CUDARender.cpp
@@ -6,6 +6,8 @@
 #include <ftl/operators/cuda/mask.hpp>
 #include <ftl/render/colouriser.hpp>
 #include <ftl/cuda/transform.hpp>
+#include <ftl/operators/cuda/mls_cuda.hpp>
+#include <ftl/utility/image_debug.hpp>
 
 #include <ftl/cuda/colour_cuda.hpp>
 
@@ -218,20 +220,20 @@ void CUDARender::_adjustDepthThresholds(const ftl::rgbd::Camera &fcam) {
 	params_.depthCoef = fcam.baseline*fcam.fx;
 }
 
-ftl::cuda::TextureObject<float> &CUDARender::_getDepthBuffer(const cv::Size &size) {
+cv::cuda::GpuMat &CUDARender::_getDepthBuffer(const cv::Size &size) {
 	for (auto *b : depth_buffers_) {
-		if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b;
+		if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b;
 	}
-	auto *nb = new ftl::cuda::TextureObject<float>(size.width, size.height);
+	auto *nb = new cv::cuda::GpuMat(size, CV_16S);
 	depth_buffers_.push_back(nb);
 	return *nb;
 }
 
-ftl::cuda::TextureObject<short2> &CUDARender::_getScreenBuffer(const cv::Size &size) {
+cv::cuda::GpuMat &CUDARender::_getScreenBuffer(const cv::Size &size) {
 	for (auto *b : screen_buffers_) {
-		if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b;
+		if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b;
 	}
-	auto *nb = new ftl::cuda::TextureObject<short2>(size.width, size.height);
+	auto *nb = new cv::cuda::GpuMat(size, CV_16SC2);
 	screen_buffers_.push_back(nb);
 	return *nb;
 }
@@ -241,18 +243,20 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 
 	bool do_blend = value("mesh_blend", false);
 	float blend_alpha = value("blend_alpha", 0.02f);
-	if (do_blend) {
-		temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream);
-		temp_.set<GpuMat>(Channel::Weights).setTo(cv::Scalar(0.0f), cvstream);
-	} else {
+	//if (do_blend) {
+	//	temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream);
+	//	temp_.set<GpuMat>(Channel::Weights).setTo(cv::Scalar(0.0f), cvstream);
+	//} else {
 		temp_.set<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0x7FFFFFFF), cvstream);
-	}
+	//}
 
 	int valid_count = 0;
 
 	// FIXME: Is it possible to remember previously if there should be depth?
 	bool use_depth = scene_->anyHasChannel(Channel::Depth) || scene_->anyHasChannel(Channel::GroundTruth);
 
+	//if (!colour_scale_.empty()) ftl::utility::show_image(colour_scale_, "CScale", 1.0f, ftl::utility::ImageVisualisation::HEAT_MAPPED);
+
 	// For each source depth map
 	for (size_t i=0; i < scene_->frames.size(); ++i) {
 		//if (!scene_->hasFrame(i)) continue;
@@ -284,14 +288,14 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		if (use_depth) {
 			if (f.hasChannel(Channel::Depth)) {
 				ftl::cuda::screen_coord(
-					f.createTexture<float>(Channel::Depth),
+					f.get<cv::cuda::GpuMat>(Channel::Depth),
 					depthbuffer,
 					screenbuffer,
 					params_, transform, f.getLeftCamera(), stream
 				);
 			} else if (f.hasChannel(Channel::GroundTruth)) {
 				ftl::cuda::screen_coord(
-					f.createTexture<float>(Channel::GroundTruth),
+					f.get<cv::cuda::GpuMat>(Channel::GroundTruth),
 					depthbuffer,
 					screenbuffer,
 					params_, transform, f.getLeftCamera(), stream
@@ -307,16 +311,16 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		}
 
 		// Must reset depth channel if blending
-		if (do_blend) {
-			temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream);
-		}
+		//if (do_blend) {
+		//	temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream);
+		//}
 
 		depth_out_.to_gpumat().setTo(cv::Scalar(1000.0f), cvstream);
 
 		// Decide on and render triangles around each point
 		ftl::cuda::triangle_render1(
 			depthbuffer,
-			temp_.createTexture<int>((do_blend) ? Channel::Depth : Channel::Depth2),
+			temp_.create<cv::cuda::GpuMat>((do_blend) ? Channel::Depth : Channel::Depth2),
 			screenbuffer,
 			params_, stream
 		);
@@ -324,7 +328,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		// TODO: Reproject here
 		// And merge based upon weight adjusted distances
 
-		if (do_blend) {
+		/*if (do_blend) {
 			// Blend this sources mesh with previous meshes
 			ftl::cuda::mesh_blender(
 				temp_.getTexture<int>(Channel::Depth),
@@ -338,7 +342,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 				blend_alpha,
 				stream
 			);
-		}
+		}*/
 	}
 
 	if (valid_count == 0) return;
@@ -346,7 +350,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 	// Convert from int depth to float depth
 	//temp_.get<GpuMat>(Channel::Depth2).convertTo(out.get<GpuMat>(Channel::Depth), CV_32F, 1.0f / 100000.0f, cvstream);
 
-	if (do_blend) {
+	/*if (do_blend) {
 		ftl::cuda::dibr_normalise(
 			//out.getTexture<float>(_getDepthChannel()),
 			//out.getTexture<float>(_getDepthChannel()),
@@ -355,23 +359,69 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 			temp_.getTexture<float>(Channel::Weights),
 			stream_
 		);
-	} else {
+	} else {*/
 		//ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), out.createTexture<float>(_getDepthChannel()), 1.0f / 100000.0f, stream_);
-		ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_);
-	}
+		ftl::cuda::merge_convert_depth(temp_.createTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_);
+	//}
 
 	// Now merge new render to any existing frameset render, detecting collisions
 	ftl::cuda::touch_merge(depth_out_, out.createTexture<float>(_getDepthChannel()), collisions_, 1024, touch_dist_, stream_);
 
-	//filters_->filter(out, src, stream);
+	// Generate actual depth map using MLS with mesh as estimate
+	float mls_smoothing = value("mls_smooth", 0.05f);
+	int mls_iter = value("mls_iter", 3);
+	int mls_radius = value("mls_radius", 0);
 
-	// Generate normals for final virtual image
-	ftl::cuda::normals(
+	if (mls_radius == 0) {
+		ftl::cuda::normals(
+			out.createTexture<half4>(_getNormalsChannel()),
+			out.getTexture<float>(_getDepthChannel()),
+			params_.camera, stream_);
+	} else {
+		ftl::cuda::normals(
+			temp_.createTexture<half4>(Channel::Normals),
+			out.getTexture<float>(_getDepthChannel()),
+			params_.camera, stream_);
+
+		ftl::cuda::mls_smooth(
+			temp_.get<cv::cuda::GpuMat>(Channel::Normals),
+			out.create<cv::cuda::GpuMat>(_getNormalsChannel()),
+			out.get<cv::cuda::GpuMat>(_getDepthChannel()),
+			//out.getTexture<float>(_getDepthChannel()),
+			value("mls_smooth", 0.01f),
+			value("mls_radius", 2),
+			params_.camera,
+			stream_
+		);
+
+		/*ftl::cuda::mls_smooth(
+			out.createTexture<half4>(_getNormalsChannel()),
+			temp_.createTexture<half4>(Channel::Normals),
+			out.getTexture<float>(_getDepthChannel()),
+			//out.getTexture<float>(_getDepthChannel()),
+			value("mls_smooth", 0.01f),
+			value("mls_radius", 2),
+			params_.camera,
+			stream_
+		);
+
+		ftl::cuda::mls_smooth(
+			temp_.createTexture<half4>(Channel::Normals),
+			out.createTexture<half4>(_getNormalsChannel()),
+			out.getTexture<float>(_getDepthChannel()),
+			//out.getTexture<float>(_getDepthChannel()),
+			value("mls_smooth", 0.01f),
+			value("mls_radius", 2),
+			params_.camera,
+			stream_
+		);*/
+	}
+
+	ftl::cuda::transform_normals(
 		out.createTexture<half4>(_getNormalsChannel()),
-		temp_.createTexture<half4>(Channel::Normals),
-		out.getTexture<float>(_getDepthChannel()),
-		value("normal_radius", 1), value("normal_smoothing", 0.02f),
-		params_.camera, pose_.getFloat3x3(), poseInverse_.getFloat3x3(), stream_);
+		poseInverse_.getFloat3x3(),
+		stream_
+	);
 }
 
 void CUDARender::_allocateChannels(ftl::rgbd::Frame &out, ftl::codecs::Channel chan) {
diff --git a/components/renderers/cpp/src/colouriser.cpp b/components/renderers/cpp/src/colouriser.cpp
index a8399d016b3229fe04d009d49ca9b902f4e6e68e..2412aeb86addf8bd1c558f421a7e9e7230f20a27 100644
--- a/components/renderers/cpp/src/colouriser.cpp
+++ b/components/renderers/cpp/src/colouriser.cpp
@@ -143,7 +143,7 @@ TextureObject<uchar4> &Colouriser::_processNormals(ftl::rgbd::Frame &f, Channel
 	light_diffuse.w = value("alpha", 0.5f)*255.0f;
 	light_ambient.w = light_diffuse.w;
 
-	auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", 1.0f));
+	auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", -1.0f));
 
 	ftl::cuda::normal_visualise(f.createTexture<half4>(c), buf,
 			light_pos,
diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu
index 15473b9ad4781957b1b9f1c6949469db1c6d230c..3b3e4d3bf575f3d027b3edc36114cbd2eb3ae3c4 100644
--- a/components/renderers/cpp/src/screen.cu
+++ b/components/renderers/cpp/src/screen.cu
@@ -2,6 +2,7 @@
 #include "splatter_cuda.hpp"
 #include <ftl/rgbd/camera.hpp>
 #include <ftl/cuda_common.hpp>
+#include <cudatl/fixed.hpp>
 
 using ftl::rgbd::Camera;
 using ftl::cuda::TextureObject;
@@ -34,29 +35,23 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters
 	return make_uint2(params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos)));
 }
 
-/*template <>
-__device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters &params, const float3 &camPos) {
-	float2 pt =  params.camera.camToScreen<float2>(camPos); //params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos));
-	const float coeff = 1.0f / (params.viewport.warpMatrix.entries[6] * pt.x + params.viewport.warpMatrix.entries[7] * pt.y + params.viewport.warpMatrix.entries[8]);
-	const float xcoo = coeff * (params.viewport.warpMatrix.entries[0] * pt.x + params.viewport.warpMatrix.entries[1] * pt.y + params.viewport.warpMatrix.entries[2]);
-	const float ycoo = coeff * (params.viewport.warpMatrix.entries[3] * pt.x + params.viewport.warpMatrix.entries[4] * pt.y + params.viewport.warpMatrix.entries[5]);
-	return make_uint2(xcoo, ycoo);
-}*/
-
 /*
  * Convert source screen position to output screen coordinates.
  */
  template <ftl::render::ViewPortMode VPMODE, Projection PROJECT>
- __global__ void screen_coord_kernel(TextureObject<float> depth,
-        TextureObject<float> depth_out,
-		TextureObject<short2> screen_out, Parameters params, float4x4 pose, Camera camera) {
+ __global__ void screen_coord_kernel(
+	const float* __restrict__ depth,
+	short* __restrict__ depth_out,
+	short2* __restrict__ screen_out,
+    int pitch4, int pitch2, Parameters params, float4x4 pose, Camera camera)
+{
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	if (x >= 0 && y >= 0 && x < depth.width() && y < depth.height()) {
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
 		//uint2 screenPos = make_uint2(30000,30000);
 
-		const float d = depth.tex2D(x, y);
+		const float d = depth[y*pitch4+x];
 
 		// Find the virtual screen position of current point
 		const float3 camPos =  (d > camera.minDepth && d < camera.maxDepth) ? pose * camera.screenToCam(x,y,d) : make_float3(0.0f,0.0f,0.0f);
@@ -69,34 +64,42 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters
 				screenPos.x >= params.camera.width ||
 				screenPos.y >= params.camera.height)
 			screenPos = make_float3(30000,30000,0);
-		screen_out(x,y) = make_short2(screenPos.x, screenPos.y);
-		depth_out(x,y) = screenPos.z;
+
+		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
+		depth_out[y*pitch2+x] = cudatl::float2fixed<10>(screenPos.z);
 	}
 }
 
-void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &depth_out,
-		TextureObject<short2> &screen_out, const Parameters &params,
+void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out, const Parameters &params,
 		const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
-    const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+    const dim3 gridSize((depth.cols + THREADS_X - 1)/THREADS_X, (depth.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+	
+	depth_out.create(depth.size(), CV_16S);
+	screen_out.create(depth.size(), CV_16SC2);
 
 	if (params.projection == Projection::PERSPECTIVE) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	} else if (params.projection == Projection::EQUIRECTANGULAR) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	} else if (params.projection == Projection::ORTHOGRAPHIC) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	}
 	cudaSafeCall( cudaGetLastError() );
@@ -109,12 +112,16 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &
  * Convert source screen position to output screen coordinates. Assumes a
  * constant depth of 1m instead of using a depth channel input.
  */
- __global__ void screen_coord_kernel(TextureObject<float> depth_out,
-		TextureObject<short2> screen_out, Camera vcamera, float4x4 pose, Camera camera) {
+ __global__ void screen_coord_kernel(
+	 	short* __restrict__ depth_out,
+		short2* __restrict__ screen_out,
+		int pitch4, int pitch2,
+		Camera vcamera, float4x4 pose, Camera camera) {
+
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) {
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
 		//uint2 screenPos = make_uint2(30000,30000);
 		const float d = camera.maxDepth;
 
@@ -128,15 +135,21 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &
 				screenPos.y >= vcamera.height)
 			screenPos = make_uint2(30000,30000);
 
-		screen_out(x,y) = make_short2(screenPos.x, screenPos.y);
-		depth_out(x,y) = camPos.z;
+		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
+		depth_out[y*pitch2+x] = cudatl::float2fixed<10>(camPos.z);
 	}
 }
 
-void ftl::cuda::screen_coord(TextureObject<float> &depth_out, TextureObject<short2> &screen_out, const Parameters &params, const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
-	const dim3 gridSize((screen_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+void ftl::cuda::screen_coord(cv::cuda::GpuMat &depth_out, cv::cuda::GpuMat &screen_out, const Parameters &params, const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+    const dim3 gridSize((camera.width + THREADS_X - 1)/THREADS_X, (camera.height + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	depth_out.create(camera.height, camera.width, CV_16S);
+	screen_out.create(camera.height, camera.width, CV_16SC2);
 
-	screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, screen_out, params.camera, pose, camera);
+	screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out.ptr<short>(), screen_out.ptr<short2>(), screen_out.step1()/2, depth_out.step1(), params.camera, pose, camera);
 	cudaSafeCall( cudaGetLastError() );
 }
diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp
index 41088cc226e34f4916c6919d8edc465906dbd499..aee5d0bc7dab2357c34713ec4eb628d141b0f248 100644
--- a/components/renderers/cpp/src/splatter_cuda.hpp
+++ b/components/renderers/cpp/src/splatter_cuda.hpp
@@ -7,26 +7,26 @@
 namespace ftl {
 namespace cuda {
 	void screen_coord(
-		ftl::cuda::TextureObject<float> &depth,
-		ftl::cuda::TextureObject<float> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen_out,
+		const cv::cuda::GpuMat &depth,
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out,
 		const ftl::render::Parameters &params,
 		const float4x4 &pose,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);
 
 	void screen_coord(
-		ftl::cuda::TextureObject<float> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen_out,
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out,
 		const ftl::render::Parameters &params,
 		const float4x4 &pose,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);
 
 	void triangle_render1(
-		ftl::cuda::TextureObject<float> &depth_in,
-		ftl::cuda::TextureObject<int> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen,
+		const cv::cuda::GpuMat &depth_in,  // short
+		cv::cuda::GpuMat &depth_out, // int
+		const cv::cuda::GpuMat &screen, // short2
 		const ftl::render::Parameters &params,
 		cudaStream_t stream);
 
@@ -170,6 +170,15 @@ namespace cuda {
 		ftl::cuda::TextureObject<float> &d2,
         float factor, cudaStream_t stream);
 
+	void reverse_verify(
+		cv::cuda::GpuMat &depth_in,
+		const cv::cuda::GpuMat &depth_original,
+		const float4x4 &transformR,
+		const float4x4 &transform,
+		const ftl::rgbd::Camera &vintrin,
+		const ftl::rgbd::Camera &ointrin,
+		cudaStream_t stream);
+
 	void fix_bad_colour(
 		ftl::cuda::TextureObject<float> &depth,
 		ftl::cuda::TextureObject<uchar4> &out,
diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu
index dccdd276ad1ab8a6520a19a5b908c51db6b54823..301cf2a226d0cff47c9a1672fe7d78b124f3fc99 100644
--- a/components/renderers/cpp/src/triangle_render.cu
+++ b/components/renderers/cpp/src/triangle_render.cu
@@ -2,6 +2,7 @@
 #include "splatter_cuda.hpp"
 #include <ftl/rgbd/camera.hpp>
 #include <ftl/cuda_common.hpp>
+#include <ftl/cuda/fixed.hpp>
 
 #include <ftl/cuda/weighting.hpp>
 
@@ -107,7 +108,7 @@ float getZAtCoordinate(const float3 &barycentricCoord, const float (&tri)[3]) {
  * being inside or outside (using bary centric coordinate method). If inside
  * then atomically write to the depth map.
  */
-__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters &params, TextureObject<int> &depth_out) {
+__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters &params, int* depth_out, int out_pitch4) {
 	const int minX = min(v[0].x, min(v[1].x, v[2].x));
 	const int minY = min(v[0].y, min(v[1].y, v[2].y));
 	const int maxX = max(v[0].x, max(v[1].x, v[2].x));
@@ -124,7 +125,7 @@ __device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Pa
 
 				if (sx < params.camera.width && sx >= 0 && sy < params.camera.height && sy >= 0 && isBarycentricCoordInBounds(baryCentricCoordinate)) {
 					float new_depth = getZAtCoordinate(baryCentricCoordinate, d);
-					atomicMin(&depth_out(sx,sy), int(new_depth*100000.0f));
+					atomicMin(&depth_out[sx+sy*out_pitch4], int(new_depth*100000.0f));
 				}
 			}
 		}
@@ -145,11 +146,11 @@ __device__ inline bool isValidTriangle(const short2 (&v)[3]) {
  * which verticies to load.
  */
 template <int A, int B>
-__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const TextureObject<float> &depth_in, const TextureObject<short2> &screen) {
-    d[1] = depth_in.tex2D(x+A,y);
-    d[2] = depth_in.tex2D(x,y+B);
-    v[1] = screen.tex2D(x+A,y);
-	v[2] = screen.tex2D(x,y+B);
+__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const short* __restrict__ depth_in, const short2* __restrict__ screen, int pitch4, int pitch2) {
+    d[1] = fixed2float<10>(depth_in[y*pitch2+x+A]);
+    d[2] = fixed2float<10>(depth_in[(y+B)*pitch2+x]);
+    v[1] = screen[y*pitch4+x+A];
+	v[2] = screen[(y+B)*pitch4+x];
 	return isValidTriangle(v);
 }
 
@@ -157,37 +158,44 @@ __device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const
  * Convert source screen position to output screen coordinates.
  */
  __global__ void triangle_render_kernel(
-        TextureObject<float> depth_in,
-        TextureObject<int> depth_out,
-		TextureObject<short2> screen, Parameters params) {
+        const short* __restrict__ depth_in,
+        int* depth_out,
+		const short2* __restrict__ screen,
+		int width, int height, int pitch2, int pitch4, int out_pitch4, Parameters params) {
+
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-    if (x >= 1 && x < depth_in.width()-1 && y >= 1 && y < depth_in.height()-1) {
+    if (x >= 1 && x < width-1 && y >= 1 && y < height-1) {
 		float d[3];
-		d[0] = depth_in.tex2D(x,y);
+		d[0] = fixed2float<10>(depth_in[y*pitch2+x]);
 
 		short2 v[3];
-		v[0] = screen.tex2D(x,y);
+		v[0] = screen[y*pitch4+x];
 
 		if (v[0].x < 30000) {
 			// Calculate discontinuity threshold.
 			//const float threshold = (params.depthCoef / ((params.depthCoef / d[0]) - params.disconDisparities)) - d[0];
 
 			// Draw (optionally) 4 triangles as a diamond pattern around the central point.
-			if (loadTriangle<1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
+			if (loadTriangle<1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
 		}
 	}
 }
 
-void ftl::cuda::triangle_render1(TextureObject<float> &depth_in, TextureObject<int> &depth_out, TextureObject<short2> &screen, const Parameters &params, cudaStream_t stream) {
-    const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+void ftl::cuda::triangle_render1(const cv::cuda::GpuMat &depth_in, cv::cuda::GpuMat &depth_out, const cv::cuda::GpuMat &screen, const Parameters &params, cudaStream_t stream) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+	
+	depth_out.create(params.camera.height, params.camera.width, CV_32S);
 
-	triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in.ptr<short>(), depth_out.ptr<int>(), screen.ptr<short2>(), depth_in.cols, depth_in.rows, depth_in.step1(), screen.step1()/2, depth_out.step1(), params);
     cudaSafeCall( cudaGetLastError() );
 }
 
@@ -202,7 +210,8 @@ __global__ void merge_convert_kernel(
 
 	if (x < 0 || x >= depth_in.width() || y < 0 || y >= depth_in.height()) return;
 
-	float a = float(depth_in.tex2D(x,y))*alpha;
+	int d = depth_in.tex2D(x,y);
+	float a = float(d)*alpha;
 	float b = depth_out.tex2D(x,y);
 	depth_out(x,y) = min(a,b);
 }
diff --git a/components/streams/src/feed.cpp b/components/streams/src/feed.cpp
index d5d5533b6a23064f5efafda322b6675e47433f4d..f9b468605bcc336e3bf6d89f137eae7fe252a087 100644
--- a/components/streams/src/feed.cpp
+++ b/components/streams/src/feed.cpp
@@ -15,6 +15,7 @@
 #include <ftl/operators/detectandtrack.hpp>
 #include <ftl/operators/weighting.hpp>
 #include <ftl/operators/mvmls.hpp>
+#include <ftl/operators/fusion.hpp>
 #include <ftl/operators/clipping.hpp>
 #include <ftl/operators/poser.hpp>
 #include <ftl/operators/gt_analysis.hpp>
@@ -533,6 +534,7 @@ void Feed::_createPipeline(uint32_t fsid) {
 		p->append<ftl::operators::BorderMask>("border_mask");
 		p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
 		p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false);
+		p->append<ftl::operators::Fusion>("fusion")->value("enabled", false);
 		p->append<ftl::operators::DisplayMask>("display_mask")->value("enabled", false);
 		p->append<ftl::operators::Poser>("poser")->value("enabled", true);
 		p->append<ftl::operators::GTAnalysis>("gtanalyse");
diff --git a/lib/cudatl/include/cudatl/fixed.hpp b/lib/cudatl/include/cudatl/fixed.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12afa70213cbd9326703ada220fdab5e723595aa
--- /dev/null
+++ b/lib/cudatl/include/cudatl/fixed.hpp
@@ -0,0 +1,23 @@
+#ifndef _CUDATL_FIXED_HPP_
+#define _CUDATL_FIXED_HPP_
+
+namespace cudatl
+{
+
+template <int FRAC>
+__device__ inline float fixed2float(short v);
+
+template <int FRAC>
+__device__ inline short float2fixed(float v);
+
+template <int FRAC>
+__device__ inline float fixed2float8(int8_t v);
+
+template <int FRAC>
+__device__ inline int8_t float2fixed8(float v);
+
+}
+
+#include <cudatl/impl/fixed.hpp>
+
+#endif
diff --git a/lib/cudatl/include/cudatl/halfwarp.hpp b/lib/cudatl/include/cudatl/halfwarp.hpp
index 3085381ae4d6b8ca33f2e690232e7204828f6832..c615fe73507b21a41f9bc7b2acf46955eb17c74b 100644
--- a/lib/cudatl/include/cudatl/halfwarp.hpp
+++ b/lib/cudatl/include/cudatl/halfwarp.hpp
@@ -3,15 +3,18 @@
 
 #include <cuda_runtime.h>
 
-namespace cudatl {
+namespace cudatl
+{
 
 static constexpr int HALF_WARP_SIZE = 16;
 static constexpr unsigned int HALF_MASK1 = 0xFFFF0000;
 static constexpr unsigned int HALF_MASK2 = 0x0000FFFF;
 
 template <typename T>
-__device__ inline T halfWarpMin(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpMin(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = min(e, other);
 	}
@@ -19,8 +22,10 @@ __device__ inline T halfWarpMin(T e) {
 }
 
 template <typename T>
-__device__ inline T halfWarpMax(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpMax(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = max(e, other);
 	}
@@ -28,8 +33,10 @@ __device__ inline T halfWarpMax(T e) {
 }
 
 template <typename T>
-__device__ inline T halfWarpSum(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpSum(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e += other;
 	}
diff --git a/lib/cudatl/include/cudatl/host_utility.hpp b/lib/cudatl/include/cudatl/host_utility.hpp
index 39f066c3573f10db41a0d34b0bcbc0a93d93a971..16019260f89fe1cc643544e99a7a5fd8fb49c69d 100644
--- a/lib/cudatl/include/cudatl/host_utility.hpp
+++ b/lib/cudatl/include/cudatl/host_utility.hpp
@@ -4,9 +4,11 @@
 #include <cuda_runtime.hpp>
 #include <string>
 
-namespace cudatl {
+namespace cudatl
+{
 
-inline safeCall(cudaError_t e) {
+inline safeCall(cudaError_t e)
+{
 	if (e != cudaSuccess) throw new std::exception(std::string("Cuda Error "+std::to_string(int(e))));
 }
 
diff --git a/lib/cudatl/include/cudatl/impl/fixed.hpp b/lib/cudatl/include/cudatl/impl/fixed.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..083641732acb6735212e899a84e1e530dc37e1d7
--- /dev/null
+++ b/lib/cudatl/include/cudatl/impl/fixed.hpp
@@ -0,0 +1,23 @@
+template <int FRAC>
+__device__ inline float cudatl::fixed2float(short v)
+{
+    return float(v) / float(1 << FRAC);
+}
+
+template <int FRAC>
+__device__ inline short cudatl::float2fixed(float v)
+{
+    return short(v * float(1 << FRAC));
+}
+
+template <int FRAC>
+__device__ inline float cudatl::fixed2float8(int8_t v)
+{
+    return float(v) / float(1 << FRAC);
+}
+
+template <int FRAC>
+__device__ inline int8_t cudatl::float2fixed8(float v)
+{
+    return int8_t(v * float(1 << FRAC));
+}
diff --git a/lib/cudatl/include/cudatl/memory.hpp b/lib/cudatl/include/cudatl/memory.hpp
index 3b249dc043e1ad09365a8d8662b3280dc352710e..a46402a07de4ede20c43a46792dc524fd098c232 100644
--- a/lib/cudatl/include/cudatl/memory.hpp
+++ b/lib/cudatl/include/cudatl/memory.hpp
@@ -3,25 +3,27 @@
 
 #include <cudatl/host_utility.hpp>
 
-namespace cudatl {
+namespace cudatl
+{
 
 template <typename T>
-T *allocate(size_t size) {
-#ifdef USE_GPU
+T *allocate(size_t size)
+{
 	T *ptr;
 	cudatl::safeCall(cudaMalloc(&ptr, size*sizeof(T)));
 	return ptr;
-#else
-	return new T[size];
-#endif
 }
 
 template <typename T>
-T *allocate(size_t width, size_t height, uint &pitch) {
-	if (width == 1 || height == 1) {
+T *allocate(size_t width, size_t height, uint &pitch)
+{
+	if (width == 1 || height == 1)
+	{
 		pitch = width;
 		return allocateMemory<T>((width > height) ? width : height);
-	} else {
+	}
+	else
+	{
 		T *ptr;
 		size_t ptmp;
 		cudatl::safeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height));
@@ -31,7 +33,8 @@ T *allocate(size_t width, size_t height, uint &pitch) {
 }
 
 template <typename T>
-void free(T *ptr) {
+void free(T *ptr)
+{
 	cudatl::safeCall(cudaFree(ptr));
 }
 
diff --git a/lib/cudatl/include/cudatl/warp.hpp b/lib/cudatl/include/cudatl/warp.hpp
index 1a41181e6ec5ab2478dcfa7abcfedc8079d0b0b9..6a8dd898d77058d1f16ffc95fc0350efefd927b5 100644
--- a/lib/cudatl/include/cudatl/warp.hpp
+++ b/lib/cudatl/include/cudatl/warp.hpp
@@ -5,14 +5,17 @@
 
 #define __cuda__ __host__ __device__
 
-namespace cudatl {
+namespace cudatl
+{
 
 static constexpr int WARP_SIZE = 32;
 static constexpr unsigned int FULL_MASK = 0xFFFFFFFF;
 
 template <typename T>
-__device__ inline T warpMin(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpMin(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = min(e, other);
 	}
@@ -20,8 +23,10 @@ __device__ inline T warpMin(T e) {
 }
 
 template <typename T>
-__device__ inline T warpMax(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpMax(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = max(e, other);
 	}
@@ -29,8 +34,10 @@ __device__ inline T warpMax(T e) {
 }
 
 template <typename T>
-__device__ inline T warpSum(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpSum(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e += other;
 	}
@@ -43,9 +50,11 @@ __device__ inline T warpSum(T e) {
  * TODO: This could be more efficient, perhaps with _shfl_XXX
  */
 template <typename T>
-inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) {
+inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold)
+{
 	const int thread = tix%32;
-	for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) {
+	for (uint offset = 1; offset < WARP_SIZE; offset <<= 1)
+	{
 		__syncwarp();
 		const uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
 		__syncwarp();