From e6a15e01090692ba7271bf3e289f90016aa5e152 Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nicolas.pope@utu.fi> Date: Sun, 17 Nov 2019 13:59:23 +0200 Subject: [PATCH] Implements #232 all source MLS --- applications/reconstruct/src/ilw/ilw.cpp | 4 +- applications/reconstruct/src/main.cpp | 4 +- components/operators/CMakeLists.txt | 2 + .../operators/include/ftl/operators/mvmls.hpp | 28 + components/operators/src/correspondence.cu | 548 ++++++++++++++++++ components/operators/src/mvmls.cpp | 340 +++++++++++ components/operators/src/mvmls_cuda.hpp | 101 ++++ components/renderers/cpp/src/reprojection.cu | 23 +- .../renderers/cpp/src/splatter_cuda.hpp | 2 +- components/renderers/cpp/src/tri_render.cpp | 19 +- .../rgbd-sources/include/ftl/rgbd/frame.hpp | 2 +- 11 files changed, 1048 insertions(+), 25 deletions(-) create mode 100644 components/operators/include/ftl/operators/mvmls.hpp create mode 100644 components/operators/src/correspondence.cu create mode 100644 components/operators/src/mvmls.cpp create mode 100644 components/operators/src/mvmls_cuda.hpp diff --git a/applications/reconstruct/src/ilw/ilw.cpp b/applications/reconstruct/src/ilw/ilw.cpp index 55a9539a9..a686f35c3 100644 --- a/applications/reconstruct/src/ilw/ilw.cpp +++ b/applications/reconstruct/src/ilw/ilw.cpp @@ -25,7 +25,7 @@ static Eigen::Affine3d create_rotation_matrix(float ax, float ay, float az) { } ILW::ILW(nlohmann::json &config) : ftl::Configurable(config) { - enabled_ = value("ilw_align", true); + enabled_ = value("ilw_align", false); iterations_ = value("iterations", 4); motion_rate_ = value("motion_rate", 0.8f); motion_window_ = value("motion_window", 3); @@ -42,7 +42,7 @@ ILW::ILW(nlohmann::json &config) : ftl::Configurable(config) { }); on("ilw_align", [this](const ftl::config::Event &e) { - enabled_ = value("ilw_align", true); + enabled_ = value("ilw_align", false); }); on("iterations", [this](const ftl::config::Event &e) { diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp index 382f49293..0603dad40 100644 --- a/applications/reconstruct/src/main.cpp +++ b/applications/reconstruct/src/main.cpp @@ -37,6 +37,7 @@ #include <ftl/operators/segmentation.hpp> #include <ftl/operators/mask.hpp> #include <ftl/operators/antialiasing.hpp> +#include <ftl/operators/mvmls.hpp> #include <ftl/operators/clipping.hpp> #include <ftl/cuda/normals.hpp> @@ -330,8 +331,9 @@ static void run(ftl::Configurable *root) { pipeline1->append<ftl::operators::CrossSupport>("cross"); pipeline1->append<ftl::operators::DiscontinuityMask>("discontinuity"); pipeline1->append<ftl::operators::CullDiscontinuity>("remove_discontinuity"); - pipeline1->append<ftl::operators::AggreMLS>("mls"); // Perform MLS (using smoothing channel) + //pipeline1->append<ftl::operators::AggreMLS>("mls"); // Perform MLS (using smoothing channel) pipeline1->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false); + pipeline1->append<ftl::operators::MultiViewMLS>("mvmls"); // Alignment diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt index 20dbced5e..b025a39b3 100644 --- a/components/operators/CMakeLists.txt +++ b/components/operators/CMakeLists.txt @@ -17,6 +17,8 @@ set(OPERSRC src/mask.cpp src/antialiasing.cpp src/antialiasing.cu + src/mvmls.cpp + src/correspondence.cu src/clipping.cpp ) diff --git a/components/operators/include/ftl/operators/mvmls.hpp b/components/operators/include/ftl/operators/mvmls.hpp new file mode 100644 index 000000000..6b8eff1be --- /dev/null +++ 
b/components/operators/include/ftl/operators/mvmls.hpp @@ -0,0 +1,28 @@ +#ifndef _FTL_OPERATORS_MVMLS_HPP_ +#define _FTL_OPERATORS_MVMLS_HPP_ + +#include <ftl/operators/operator.hpp> + +namespace ftl { +namespace operators { + +class MultiViewMLS : public ftl::operators::Operator { + public: + explicit MultiViewMLS(ftl::Configurable*); + ~MultiViewMLS(); + + inline Operator::Type type() const override { return Operator::Type::ManyToMany; } + + bool apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) override; + + private: + std::vector<ftl::cuda::TextureObject<float4>> centroid_horiz_; + std::vector<ftl::cuda::TextureObject<float4>> centroid_vert_; + std::vector<ftl::cuda::TextureObject<float4>> normals_horiz_; + std::vector<ftl::cuda::TextureObject<float>> contributions_; +}; + +} +} + +#endif diff --git a/components/operators/src/correspondence.cu b/components/operators/src/correspondence.cu new file mode 100644 index 000000000..36e91d9f4 --- /dev/null +++ b/components/operators/src/correspondence.cu @@ -0,0 +1,548 @@ +#include "mvmls_cuda.hpp" +#include <ftl/cuda/weighting.hpp> +#include <ftl/cuda/mask.hpp> + +using ftl::cuda::TextureObject; +using ftl::rgbd::Camera; +using ftl::cuda::Mask; +using ftl::cuda::MvMLSParams; + +#define T_PER_BLOCK 8 + +template<int FUNCTION> +__device__ float weightFunction(const ftl::cuda::MvMLSParams ¶ms, float dweight, float cweight); + +template <> +__device__ inline float weightFunction<0>(const ftl::cuda::MvMLSParams ¶ms, float dweight, float cweight) { + return (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight); +} + +template <> +__device__ inline float weightFunction<1>(const ftl::cuda::MvMLSParams ¶m, float dweight, float cweight) { + return (cweight * cweight * dweight); +} + +template <> +__device__ inline float weightFunction<2>(const ftl::cuda::MvMLSParams ¶m, float dweight, float cweight) { + return (dweight * dweight * cweight); +} + +template <> +__device__ inline float weightFunction<3>(const ftl::cuda::MvMLSParams ¶ms, float dweight, float cweight) { + return (dweight == 0.0f) ? 0.0f : (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight); +} + +template <> +__device__ inline float weightFunction<4>(const ftl::cuda::MvMLSParams ¶ms, float dweight, float cweight) { + return cweight; +} + +template <> +__device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams ¶ms, float dweight, float cweight) { + return (cweight > 0.0f) ? 
dweight : 0.0f; +} + +template<int COR_STEPS, int FUNCTION> +__global__ void corresponding_point_kernel( + TextureObject<float> d1, + TextureObject<float> d2, + TextureObject<uchar4> c1, + TextureObject<uchar4> c2, + TextureObject<short2> screenOut, + TextureObject<float> conf, + TextureObject<int> mask, + float4x4 pose1, + float4x4 pose1_inv, + float4x4 pose2, // Inverse + Camera cam1, + Camera cam2, ftl::cuda::MvMLSParams params) { + + // Each warp picks point in p1 + //const int tid = (threadIdx.x + threadIdx.y * blockDim.x); + const int x = (blockIdx.x*blockDim.x + threadIdx.x); // / WARP_SIZE; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) { + screenOut(x,y) = make_short2(-1,-1); + + //const float3 world1 = make_float3(p1.tex2D(x, y)); + const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z; // Initial starting depth + if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return; + + // TODO: Temporary hack to ensure depth1 is present + //const float4 temp = vout.tex2D(x,y); + //vout(x,y) = make_float4(depth1, 0.0f, temp.z, temp.w); + + const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1); + + const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f); + + //float bestdepth = 0.0f; + short2 bestScreen = make_short2(-1,-1); + float bestdepth = 0.0f; + float bestdepth2 = 0.0f; + float bestweight = 0.0f; + float bestcolour = 0.0f; + float bestdweight = 0.0f; + float totalcolour = 0.0f; + int count = 0; + float contrib = 0.0f; + + const float step_interval = params.range / (COR_STEPS / 2); + + const float3 rayStep_world = pose1.getFloat3x3() * cam1.screenToCam(x,y,step_interval); + const float3 rayStart_2 = pose2 * world1; + const float3 rayStep_2 = pose2.getFloat3x3() * rayStep_world; + + // Project to p2 using cam2 + // Each thread takes a possible correspondence and calculates a weighting + //const int lane = tid % WARP_SIZE; + for (int i=0; i<COR_STEPS; ++i) { + const int j = i - (COR_STEPS/2); + const float depth_adjust = (float)j * step_interval; + + // Calculate adjusted depth 3D point in camera 2 space + const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust)); + const float3 camPos = rayStart_2 + j * rayStep_2; //pose2 * worldPos; + const float2 screen = cam2.camToScreen<float2>(camPos); + + float weight = (screen.x >= cam2.width || screen.y >= cam2.height) ? 0.0f : 1.0f; + + // Generate a colour correspondence value + const auto colour2 = c2.tex2D(screen.x, screen.y); + const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth); + + // Generate a depth correspondence value + const float depth2 = d2.tex2D(int(screen.x+0.5f), int(screen.y+0.5f)); + + if (FUNCTION == 1) { + weight *= ftl::cuda::weighting(fabs(depth2 - camPos.z), cweight*params.spatial_smooth); + } else { + const float dweight = ftl::cuda::weighting(fabs(depth2 - camPos.z), params.spatial_smooth); + weight *= weightFunction<FUNCTION>(params, dweight, cweight); + } + //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range); + + //weight *= weightFunction<FUNCTION>(params, dweight, cweight); + + ++count; + contrib += weight; + bestcolour = max(cweight, bestcolour); + //bestdweight = max(dweight, bestdweight); + totalcolour += cweight; + bestdepth = (weight > bestweight) ? depth_adjust : bestdepth; + //bestdepth2 = (weight > bestweight) ? camPos.z : bestdepth2; + //bestScreen = (weight > bestweight) ? 
make_short2(screen.x+0.5f, screen.y+0.5f) : bestScreen; + bestweight = max(bestweight, weight); + //bestweight = weight; + //bestdepth = depth_adjust; + //bestScreen = make_short2(screen.x+0.5f, screen.y+0.5f); + //} + } + + const float avgcolour = totalcolour/(float)count; + const float confidence = bestcolour / totalcolour; //bestcolour - avgcolour; + + //Mask m(mask.tex2D(x,y)); + + //if (bestweight > 0.0f) { + float old = conf.tex2D(x,y); + + if (bestweight * confidence > old) { + d1(x,y) = 0.4f*bestdepth + depth1; + //d2(bestScreen.x, bestScreen.y) = bestdepth2; + //screenOut(x,y) = bestScreen; + conf(x,y) = bestweight * confidence; + } + //} + + // If a good enough match is found, mark dodgy depth as solid + //if ((m.isFilled() || m.isDiscontinuity()) && (bestweight > params.match_threshold)) mask(x,y) = 0; + } +} + +void ftl::cuda::correspondence( + TextureObject<float> &d1, + TextureObject<float> &d2, + TextureObject<uchar4> &c1, + TextureObject<uchar4> &c2, + TextureObject<short2> &screen, + TextureObject<float> &conf, + TextureObject<int> &mask, + float4x4 &pose1, + float4x4 &pose1_inv, + float4x4 &pose2, + const Camera &cam1, + const Camera &cam2, const MvMLSParams ¶ms, int func, + cudaStream_t stream) { + + const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + //printf("COR SIZE %d,%d\n", p1.width(), p1.height()); + + switch (func) { + case 0: corresponding_point_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + case 1: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + case 2: corresponding_point_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + case 3: corresponding_point_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + case 4: corresponding_point_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + case 5: corresponding_point_kernel<16,5><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break; + } + + cudaSafeCall( cudaGetLastError() ); +} + +// ==== Remove zero-confidence ================================================= + +__global__ void zero_confidence_kernel( + TextureObject<float> conf, + TextureObject<float> depth) { + + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + const float c = conf.tex2D((int)x,(int)y); + + if (c == 0.0f) { + depth(x,y) = 1000.0f; + } + } +} + +void ftl::cuda::zero_confidence(TextureObject<float> &conf, TextureObject<float> &depth, cudaStream_t stream) { + const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + zero_confidence_kernel<<<gridSize, blockSize, 0, stream>>>(conf, depth); + cudaSafeCall( cudaGetLastError() ); +} + + +// ==== MultiViewMLS Aggregate ================================================= + +__device__ inline short3 getScreenPos(int x, int y, float d, 
const Camera &cam1, const Camera &cam2, const float4x4 &transform) { + const float3 campos = transform * cam1.screenToCam(x,y,d); + const int2 screen = cam2.camToScreen<int2>(campos); + return make_short3(screen.x, screen.y, campos.z); +} + +__device__ inline short2 packScreen(int x, int y, int id) { + return make_short2((id << 12) + x, y); +} + +__device__ inline short2 packScreen(const short3 &p, int id) { + return make_short2((id << 12) + p.x, p.y); +} + +__device__ inline int supportSize(uchar4 support) { + return (support.x+support.y) * (support.z+support.w); +} + +__device__ inline short2 choosePoint(uchar4 sup1, uchar4 sup2, float dot1, float dot2, short2 screen1, short2 screen2) { + //return (float(supportSize(sup2))*dot1 > float(supportSize(sup1))*dot2) ? screen2 : screen1; + return (dot1 > dot2) ? screen2 : screen1; +} + +__device__ inline int unpackCameraID(short2 p) { + return p.x >> 12; +} + +/** + * Identify which source has the best support region for a given pixel. + */ +__global__ void best_sources_kernel( + TextureObject<float4> normals1, + TextureObject<float4> normals2, + TextureObject<uchar4> support1, + TextureObject<uchar4> support2, + TextureObject<float> depth1, + TextureObject<float> depth2, + TextureObject<short2> screen, + float4x4 transform, + //float3x3 transformR, + ftl::rgbd::Camera cam1, + ftl::rgbd::Camera cam2, + int id1, + int id2) { + + const int x = (blockIdx.x*blockDim.x + threadIdx.x); + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && y >= 0 && x < screen.width() && y < screen.height()) { + const float d1 = depth1.tex2D(x,y); + + const short3 scr2 = getScreenPos(x, y, d1, cam1, cam2, transform); + short2 bestPoint = packScreen(x,y,0); + + if (scr2.x >= 0 && scr2.y >= 0 && scr2.x < cam2.width && scr2.y < cam2.height) { + uchar4 sup1 = support1.tex2D(x,y); + uchar4 sup2 = support2.tex2D(scr2.x,scr2.y); + const float d2 = depth2.tex2D(scr2.x,scr2.y); + float3 n1 = transform.getFloat3x3() * make_float3(normals1.tex2D(x,y)); + float3 n2 = make_float3(normals2.tex2D(scr2.x,scr2.y)); + + float3 camray = cam2.screenToCam(scr2.x,scr2.y,1.0f); + camray /= length(camray); + const float dot1 = dot(camray, n1); + const float dot2 = dot(camray, n2); + + bestPoint = (fabs(scr2.z - d2) < 0.04f) ? choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2)) : packScreen(x,y,6); + //bestPoint = choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2)); + //bestPoint = (d1 < d2) ? packScreen(x,y,id1) : packScreen(x,y,id2); + + bestPoint = (fabs(scr2.z - d2) < 0.04f) ? 
packScreen(scr2,id2) : packScreen(scr2,id1); + } + + screen(x,y) = bestPoint; + + /*if (s.x >= 0 && s.y >= 0) { + auto norm1 = make_float3(n1.tex2D(x,y)); + const auto norm2 = make_float3(n2.tex2D(s.x,s.y)); + //n2(s.x,s.y) = norm1; + + float3 cent1 = make_float3(c1.tex2D(x,y)); + const auto cent2 = make_float3(c2.tex2D(s.x,s.y)); + + if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f) { + norm1 += poseInv1.getFloat3x3() * (pose2.getFloat3x3() * norm2); + n1(x,y) = make_float4(norm1, 0.0f); + cent1 += poseInv1 * (pose2 * cent2); // FIXME: Transform between camera spaces + cent1 /= 2.0f; + c1(x,y) = make_float4(cent1, 0.0f); + //c2(s.x,s.y) = cent1; + + //contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f; + } + // contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f; + }*/ + } +} + +void ftl::cuda::best_sources( + ftl::cuda::TextureObject<float4> &normals1, + ftl::cuda::TextureObject<float4> &normals2, + ftl::cuda::TextureObject<uchar4> &support1, + ftl::cuda::TextureObject<uchar4> &support2, + ftl::cuda::TextureObject<float> &depth1, + ftl::cuda::TextureObject<float> &depth2, + ftl::cuda::TextureObject<short2> &screen, + const float4x4 &transform, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, + int id1, + int id2, + cudaStream_t stream) { + + const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(normals1, normals2, support1, support2, depth1, depth2, screen, transform, cam1, cam2, id1, id2); + cudaSafeCall( cudaGetLastError() ); +} + +/** + * Identify which source has the best support region for a given pixel. + */ + __global__ void aggregate_sources_kernel( + TextureObject<float4> n1, + TextureObject<float4> n2, + TextureObject<float4> c1, + TextureObject<float4> c2, + TextureObject<float> depth1, + //TextureObject<float> depth2, + //TextureObject<short2> screen, + float4x4 transform, + //float3x3 transformR, + ftl::rgbd::Camera cam1, + ftl::rgbd::Camera cam2) { + + const int x = (blockIdx.x*blockDim.x + threadIdx.x); + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && y >= 0 && x < n1.width() && y < n1.height()) { + const float d1 = depth1.tex2D(x,y); + + if (d1 > cam1.minDepth && d1 < cam1.maxDepth) { + //const short3 s = getScreenPos(x, y, d1, cam1, cam2, transform); + const float3 camPos = transform * cam1.screenToCam(x, y, d1); + const int2 s = cam2.camToScreen<int2>(camPos); + + if (s.x >= 0 && s.y >= 0 && s.x < n2.width() && s.y < n2.height()) { + auto norm1 = make_float3(n1.tex2D(x,y)); + const auto norm2 = make_float3(n2.tex2D(s.x,s.y)); + //n2(s.x,s.y) = norm1; + + float3 cent1 = make_float3(c1.tex2D(x,y)); + const auto cent2 = transform.getInverse() * make_float3(c2.tex2D(s.x,s.y)); + + //printf("MERGING %f\n", length(cent2-cent1)); + + if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f && length(cent2-cent1) < 0.04f) { + norm1 += norm2; + norm1 /= 2.0f; + n1(x,y) = make_float4(norm1, 0.0f); + cent1 += cent2; + cent1 /= 2.0f; + c1(x,y) = make_float4(cent1, 0.0f); + //c2(s.x,s.y) = cent1; + + //contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f; + } + // contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f; + } + } + } +} + +void ftl::cuda::aggregate_sources( + ftl::cuda::TextureObject<float4> &n1, + ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<float4> &c1, + ftl::cuda::TextureObject<float4> &c2, + 
ftl::cuda::TextureObject<float> &depth1, + //ftl::cuda::TextureObject<float> &depth2, + //ftl::cuda::TextureObject<short2> &screen, + const float4x4 &transform, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, + cudaStream_t stream) { + + const dim3 gridSize((n1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (n1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, depth1, transform, cam1, cam2); + cudaSafeCall( cudaGetLastError() ); +} + +__device__ static uchar4 HSVtoRGB(int H, float S, float V) { + const float C = S * V; + const float X = C * (1 - fabs(fmodf(H / 60.0f, 2) - 1)); + const float m = V - C; + float Rs, Gs, Bs; + + if(H >= 0 && H < 60) { + Rs = C; + Gs = X; + Bs = 0; + } + else if(H >= 60 && H < 120) { + Rs = X; + Gs = C; + Bs = 0; + } + else if(H >= 120 && H < 180) { + Rs = 0; + Gs = C; + Bs = X; + } + else if(H >= 180 && H < 240) { + Rs = 0; + Gs = X; + Bs = C; + } + else if(H >= 240 && H < 300) { + Rs = X; + Gs = 0; + Bs = C; + } + else { + Rs = C; + Gs = 0; + Bs = X; + } + + return make_uchar4((Bs + m) * 255, (Gs + m) * 255, (Rs + m) * 255, 0); +} + +/** + * Render each pixel is a colour corresponding to the source camera with the + * best support window. + */ + __global__ void vis_best_sources_kernel( + TextureObject<short2> screen, + TextureObject<uchar4> colour, + int myid, + int count) { + + const int x = (blockIdx.x*blockDim.x + threadIdx.x); + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && y >= 0 && x < colour.width() && y < colour.height()) { + short2 s = screen.tex2D(x,y); + int id = unpackCameraID(s); + + uchar4 c = HSVtoRGB((360 / count) * id, 0.6f, 0.85f); + if (myid != id) colour(x,y) = c; + //colour(x,y) = c; + } +} + +void ftl::cuda::vis_best_sources( + ftl::cuda::TextureObject<short2> &screen, + ftl::cuda::TextureObject<uchar4> &colour, + int myid, + int count, + cudaStream_t stream) { + + const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + vis_best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(screen, colour, myid, count); + cudaSafeCall( cudaGetLastError() ); +} + +/*void ftl::cuda::aggregate_sources( + ftl::cuda::TextureObject<float4> &n1, + ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<float4> &c1, + ftl::cuda::TextureObject<float4> &c2, + ftl::cuda::TextureObject<float> &contribs1, + ftl::cuda::TextureObject<float> &contribs2, + ftl::cuda::TextureObject<short2> &screen, + const float4x4 &poseInv1, + const float4x4 &pose2, + cudaStream_t stream) { + + const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, contribs1, contribs2, screen, poseInv1, pose2); + cudaSafeCall( cudaGetLastError() ); +}*/ + +// ==== Normalise aggregations ================================================= + +__global__ void normalise_aggregations_kernel( + TextureObject<float4> norms, + TextureObject<float4> cents, + TextureObject<float> contribs) { + + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < norms.width() && y < norms.height()) { + const float contrib = contribs.tex2D((int)x,(int)y); + + const auto a = 
norms.tex2D((int)x,(int)y); + const auto b = cents.tex2D(x,y); + //const float4 normal = normals.tex2D((int)x,(int)y); + + //out(x,y) = (contrib == 0.0f) ? make<B>(a) : make<B>(a / contrib); + + if (contrib > 0.0f) { + norms(x,y) = a / (contrib+1.0f); + cents(x,y) = b / (contrib+1.0f); + } + } +} + +void ftl::cuda::normalise_aggregations(TextureObject<float4> &norms, TextureObject<float4> ¢s, TextureObject<float> &contribs, cudaStream_t stream) { + const dim3 gridSize((norms.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norms.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + normalise_aggregations_kernel<<<gridSize, blockSize, 0, stream>>>(norms, cents, contribs); + cudaSafeCall( cudaGetLastError() ); +} + diff --git a/components/operators/src/mvmls.cpp b/components/operators/src/mvmls.cpp new file mode 100644 index 000000000..4c02a7607 --- /dev/null +++ b/components/operators/src/mvmls.cpp @@ -0,0 +1,340 @@ +#include <ftl/operators/mvmls.hpp> +#include "smoothing_cuda.hpp" +#include <ftl/utility/matrix_conversion.hpp> +#include "mvmls_cuda.hpp" + +using ftl::operators::MultiViewMLS; +using ftl::codecs::Channel; +using cv::cuda::GpuMat; +using ftl::rgbd::Format; + +MultiViewMLS::MultiViewMLS(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) { + +} + +MultiViewMLS::~MultiViewMLS() { + +} + +bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) { + cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream); + + float thresh = config()->value("mls_threshold", 0.4f); + float col_smooth = config()->value("mls_colour_smoothing", 30.0f); + int iters = config()->value("mls_iterations", 3); + int radius = config()->value("mls_radius",5); + //bool aggre = config()->value("aggregation", true); + int win = config()->value("cost_function",1); + bool do_corr = config()->value("merge_corresponding", true); + bool do_aggr = config()->value("merge_mls", false); + bool cull_zero = config()->value("cull_no_confidence", false); + //bool show_best_source = config()->value("show_pixel_source", false); + + ftl::cuda::MvMLSParams params; + params.range = config()->value("search_range", 0.05f); + params.fill_match = config()->value("fill_match", 50.0f); + params.fill_threshold = config()->value("fill_threshold", 0.0f); + params.match_threshold = config()->value("match_threshold", 0.3f); + params.colour_smooth = config()->value("colour_smooth", 150.0f); + params.spatial_smooth = config()->value("spatial_smooth", 0.04f); + params.cost_ratio = config()->value("cost_ratio", 0.2f); + params.cost_threshold = config()->value("cost_threshold", 1.0f); + + // Make sure we have enough buffers + if (normals_horiz_.size() < in.frames.size()) { + normals_horiz_.resize(in.frames.size()); + centroid_horiz_.resize(in.frames.size()); + centroid_vert_.resize(in.frames.size()); + contributions_.resize(in.frames.size()); + } + + // Make sure all buffers are at correct resolution and are allocated + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f = in.frames[i]; + auto size = f.get<GpuMat>(Channel::Depth).size(); + centroid_horiz_[i].create(size.height, size.width); + normals_horiz_[i].create(size.height, size.width); + centroid_vert_[i].create(size.width, size.height); + contributions_[i].create(size.width, size.height); + + if (!f.hasChannel(Channel::Normals)) { + LOG(ERROR) << "Required normals channel missing for MLS"; + return false; + } + if (!f.hasChannel(Channel::Support1)) { + LOG(ERROR) << "Required cross support 
channel missing for MLS"; + return false; + } + + // Create required channels + f.create<GpuMat>(Channel::Confidence, Format<float>(size)); + f.createTexture<float>(Channel::Confidence); + f.create<GpuMat>(Channel::Screen, Format<short2>(size)); + f.createTexture<short2>(Channel::Screen); + } + + for (int iter=0; iter<iters; ++iter) { + // Step 1: + // Calculate correspondences and adjust depth values + // Step 2: + // Find corresponding points and perform aggregation of any correspondences + // For each camera combination + if (do_corr) { + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f1 = in.frames[i]; + //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream); + f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream); + + Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0); + d1 = in.sources[i]->getPose() * d1; + + for (size_t j=0; j<in.frames.size(); ++j) { + if (i == j) continue; + + //LOG(INFO) << "Running phase1"; + + auto &f2 = in.frames[j]; + auto s1 = in.sources[i]; + auto s2 = in.sources[j]; + + // Are cameras facing similar enough direction? + Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0); + d2 = in.sources[j]->getPose() * d2; + // No, so skip this combination + if (d1.dot(d2) <= 0.0) continue; + + auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>()); + auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse()); + auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse()); + auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>()); + + auto transform = pose2 * pose1; + + //Calculate screen positions of estimated corresponding points + ftl::cuda::correspondence( + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<uchar4>(Channel::Colour), + f2.getTexture<uchar4>(Channel::Colour), + // TODO: Add normals and other things... + f1.getTexture<short2>(Channel::Screen), + f1.getTexture<float>(Channel::Confidence), + f1.getTexture<int>(Channel::Mask), + pose1, + pose1_inv, + pose2, + s1->parameters(), + s2->parameters(), + params, + win, + stream + ); + + // Also calculate best source for each point + /*ftl::cuda::best_sources( + f1.getTexture<uchar4>(Channel::Support1), + f2.getTexture<uchar4>(Channel::Support1), + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<short2>(Channel::Screen), + transform, + s1->parameters(), + s2->parameters(), + i, j, stream + );*/ + } + } + } + + // Find best source for every pixel + /*for (size_t i=0; i<in.frames.size(); ++i) { + auto &f1 = in.frames[i]; + //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream); + //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream); + + Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0); + d1 = in.sources[i]->getPose() * d1; + + for (size_t j=0; j<in.frames.size(); ++j) { + if (i == j) continue; + + //LOG(INFO) << "Running phase1"; + + auto &f2 = in.frames[j]; + auto s1 = in.sources[i]; + auto s2 = in.sources[j]; + + // Are cameras facing similar enough direction? 
+ Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0); + d2 = in.sources[j]->getPose() * d2; + // No, so skip this combination + if (d1.dot(d2) <= 0.0) continue; + + auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>()); + auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse()); + auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse()); + auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>()); + + auto transform = pose2 * pose1; + + // Also calculate best source for each point + ftl::cuda::best_sources( + f1.getTexture<float4>(Channel::Normals), + f2.getTexture<float4>(Channel::Normals), + f1.getTexture<uchar4>(Channel::Support1), + f2.getTexture<uchar4>(Channel::Support1), + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<short2>(Channel::Screen), + transform, + s1->parameters(), + s2->parameters(), + i, j, stream + ); + } + }*/ + + // Step 2: + // Do the horizontal and vertical MLS aggregations for each source + // But don't do the final move step. + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f = in.frames[i]; + auto *s = in.sources[i]; + + // Clear data + cv::cuda::GpuMat data(contributions_[i].height(), contributions_[i].width(), CV_32F, contributions_[i].pixelPitch()); + data.setTo(cv::Scalar(0.0f), cvstream); + + if (cull_zero && iter == iters-1) { + ftl::cuda::zero_confidence( + f.getTexture<float>(Channel::Confidence), + f.getTexture<float>(Channel::Depth), + stream + ); + } + + ftl::cuda::mls_aggr_horiz( + f.createTexture<uchar4>(Channel::Support1), + f.createTexture<float4>(Channel::Normals), + normals_horiz_[i], + f.createTexture<float>(Channel::Depth), + centroid_horiz_[i], + f.createTexture<uchar4>(Channel::Colour), + thresh, + col_smooth, + radius, + s->parameters(), + stream + ); + + ftl::cuda::mls_aggr_vert( + f.getTexture<uchar4>(Channel::Support1), + normals_horiz_[i], + f.getTexture<float4>(Channel::Normals), + centroid_horiz_[i], + centroid_vert_[i], + f.getTexture<uchar4>(Channel::Colour), + f.getTexture<float>(Channel::Depth), + thresh, + col_smooth, + radius, + s->parameters(), + stream + ); + } + + // Step 3: + // Find corresponding points and perform aggregation of any correspondences + // For each camera combination + if (do_aggr) { + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f1 = in.frames[i]; + //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream); + f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream); + + Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0); + d1 = in.sources[i]->getPose() * d1; + + for (size_t j=0; j<in.frames.size(); ++j) { + if (i == j) continue; + + //LOG(INFO) << "Running phase1"; + + auto &f2 = in.frames[j]; + auto s1 = in.sources[i]; + auto s2 = in.sources[j]; + + // Are cameras facing similar enough direction? 
+ Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0); + d2 = in.sources[j]->getPose() * d2; + // No, so skip this combination + if (d1.dot(d2) <= 0.0) continue; + + auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>()); + auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse()); + auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse()); + auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>()); + + auto transform = pose2 * pose1; + + // For the corresponding points, combine normals and centroids + ftl::cuda::aggregate_sources( + f1.getTexture<float4>(Channel::Normals), + f2.getTexture<float4>(Channel::Normals), + centroid_vert_[i], + centroid_vert_[j], + f1.getTexture<float>(Channel::Depth), + //contributions_[i], + //contributions_[j], + //f1.getTexture<short2>(Channel::Screen), + transform, + s1->parameters(), + s2->parameters(), + stream + ); + + //LOG(INFO) << "Correspondences done... " << i; + } + } + } + + // Step 3: + // Normalise aggregations and move the points + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f = in.frames[i]; + auto *s = in.sources[i]; + auto size = f.get<GpuMat>(Channel::Depth).size(); + + /*if (do_corr) { + ftl::cuda::normalise_aggregations( + f.getTexture<float4>(Channel::Normals), + centroid_vert_[i], + contributions_[i], + stream + ); + }*/ + + ftl::cuda::mls_adjust_depth( + f.getTexture<float4>(Channel::Normals), + centroid_vert_[i], + f.getTexture<float>(Channel::Depth), + f.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)), + s->parameters(), + stream + ); + + f.swapChannels(Channel::Depth, Channel::Depth2); + + /*if (show_best_source) { + ftl::cuda::vis_best_sources( + f.getTexture<short2>(Channel::Screen), + f.getTexture<uchar4>(Channel::Colour), + i, + 7, stream + ); + }*/ + } + } + + return true; +} diff --git a/components/operators/src/mvmls_cuda.hpp b/components/operators/src/mvmls_cuda.hpp new file mode 100644 index 000000000..93b1e8d88 --- /dev/null +++ b/components/operators/src/mvmls_cuda.hpp @@ -0,0 +1,101 @@ +#ifndef _FTL_CUDA_MVMLS_HPP_ +#define _FTL_CUDA_MVMLS_HPP_ + +#include <ftl/rgbd/camera.hpp> +#include <ftl/cuda_common.hpp> +#include <ftl/cuda_matrix_util.hpp> + +namespace ftl { +namespace cuda { + +struct MvMLSParams { + float spatial_smooth; + float colour_smooth; + float fill_match; + float fill_threshold; + float match_threshold; + float cost_ratio; + float cost_threshold; + float range; + uint flags; +}; + +void correspondence( + ftl::cuda::TextureObject<float> &d1, + ftl::cuda::TextureObject<float> &d2, + ftl::cuda::TextureObject<uchar4> &c1, + ftl::cuda::TextureObject<uchar4> &c2, + ftl::cuda::TextureObject<short2> &screen, + ftl::cuda::TextureObject<float> &conf, + ftl::cuda::TextureObject<int> &mask, + float4x4 &pose1, + float4x4 &pose1_inv, + float4x4 &pose2, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams ¶ms, int func, + cudaStream_t stream); + +void zero_confidence( + ftl::cuda::TextureObject<float> &conf, + ftl::cuda::TextureObject<float> &depth, + cudaStream_t stream); + +/*void aggregate_sources( + ftl::cuda::TextureObject<float4> &n1, + ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<float4> &c1, + ftl::cuda::TextureObject<float4> &c2, + ftl::cuda::TextureObject<float> &contribs1, + ftl::cuda::TextureObject<float> &contribs2, + ftl::cuda::TextureObject<short2> &screen, + //const float4x4 &pose1, + //const float4x4 &poseInv2, + const float4x4 &poseInv1, + const float4x4 
&pose2, + cudaStream_t stream);*/ + +void aggregate_sources( + ftl::cuda::TextureObject<float4> &n1, + ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<float4> &c1, + ftl::cuda::TextureObject<float4> &c2, + ftl::cuda::TextureObject<float> &depth1, + //ftl::cuda::TextureObject<float> &depth2, + //ftl::cuda::TextureObject<short2> &screen, + const float4x4 &transform, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, + cudaStream_t stream); + +void best_sources( + ftl::cuda::TextureObject<float4> &normals1, + ftl::cuda::TextureObject<float4> &normals2, + ftl::cuda::TextureObject<uchar4> &support1, + ftl::cuda::TextureObject<uchar4> &suppor2, + ftl::cuda::TextureObject<float> &depth1, + ftl::cuda::TextureObject<float> &depth2, + ftl::cuda::TextureObject<short2> &screen, + const float4x4 &transform, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, + int id1, + int id2, + cudaStream_t stream); + +void vis_best_sources( + ftl::cuda::TextureObject<short2> &screen, + ftl::cuda::TextureObject<uchar4> &colour, + int myid, + int count, + cudaStream_t stream); + +void normalise_aggregations( + ftl::cuda::TextureObject<float4> &norms, + ftl::cuda::TextureObject<float4> ¢s, + ftl::cuda::TextureObject<float> &contribs, + cudaStream_t stream); + +} +} + +#endif diff --git a/components/renderers/cpp/src/reprojection.cu b/components/renderers/cpp/src/reprojection.cu index dc636f359..e6490c47e 100644 --- a/components/renderers/cpp/src/reprojection.cu +++ b/components/renderers/cpp/src/reprojection.cu @@ -66,7 +66,7 @@ __global__ void reprojection_kernel( TextureObject<B> out, // Accumulated output TextureObject<float> contrib, SplatParams params, - Camera camera, float4x4 poseInv) { + Camera camera, float4x4 transform, float3x3 transformR) { const int x = (blockIdx.x*blockDim.x + threadIdx.x); const int y = blockIdx.y*blockDim.y + threadIdx.y; @@ -74,10 +74,10 @@ __global__ void reprojection_kernel( const float d = depth_in.tex2D((int)x, (int)y); if (d < params.camera.minDepth || d > params.camera.maxDepth) return; - const float3 worldPos = params.m_viewMatrixInverse * params.camera.screenToCam(x, y, d); + //const float3 worldPos = params.m_viewMatrixInverse * params.camera.screenToCam(x, y, d); //if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return; - const float3 camPos = poseInv * worldPos; + const float3 camPos = transform * params.camera.screenToCam(x, y, d); if (camPos.z < camera.minDepth) return; if (camPos.z > camera.maxDepth) return; const float2 screenPos = camera.camToScreen<float2>(camPos); @@ -86,7 +86,7 @@ __global__ void reprojection_kernel( if (screenPos.x >= depth_src.width() || screenPos.y >= depth_src.height()) return; // Calculate the dot product of surface normal and camera ray - const float3 n = poseInv.getFloat3x3() * make_float3(normals.tex2D((int)x, (int)y)); + const float3 n = transformR * make_float3(normals.tex2D((int)x, (int)y)); float3 ray = camera.screenToCam(screenPos.x, screenPos.y, 1.0f); ray = ray / length(ray); const float dotproduct = max(dot(ray,n),0.0f); @@ -123,7 +123,7 @@ void ftl::cuda::reproject( TextureObject<B> &out, // Accumulated output TextureObject<float> &contrib, const SplatParams ¶ms, - const Camera &camera, const float4x4 &poseInv, cudaStream_t stream) { + const Camera &camera, const float4x4 &transform, const float3x3 &transformR, cudaStream_t stream) { const dim3 gridSize((out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (out.height() + T_PER_BLOCK - 
1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); @@ -136,7 +136,8 @@ void ftl::cuda::reproject( contrib, params, camera, - poseInv + transform, + transformR ); cudaSafeCall( cudaGetLastError() ); } @@ -150,7 +151,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &contrib, const ftl::render::SplatParams ¶ms, const ftl::rgbd::Camera &camera, - const float4x4 &poseInv, cudaStream_t stream); + const float4x4 &transform, const float3x3 &transformR, cudaStream_t stream); template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &in, // Original colour image @@ -161,7 +162,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &contrib, const ftl::render::SplatParams ¶ms, const ftl::rgbd::Camera &camera, - const float4x4 &poseInv, cudaStream_t stream); + const float4x4 &transform, const float3x3 &transformR, cudaStream_t stream); template void ftl::cuda::reproject( ftl::cuda::TextureObject<float4> &in, // Original colour image @@ -172,7 +173,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &contrib, const ftl::render::SplatParams ¶ms, const ftl::rgbd::Camera &camera, - const float4x4 &poseInv, cudaStream_t stream); + const float4x4 &transform, const float3x3 &transformR, cudaStream_t stream); //============================================================================== // Without normals @@ -197,10 +198,10 @@ __global__ void reprojection_kernel( const float d = depth_in.tex2D((int)x, (int)y); if (d < params.camera.minDepth || d > params.camera.maxDepth) return; - const float3 worldPos = params.m_viewMatrixInverse * params.camera.screenToCam(x, y, d); + //const float3 worldPos = params.m_viewMatrixInverse * params.camera.screenToCam(x, y, d); //if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return; - const float3 camPos = poseInv * worldPos; + const float3 camPos = poseInv * params.camera.screenToCam(x, y, d); if (camPos.z < camera.minDepth) return; if (camPos.z > camera.maxDepth) return; const float2 screenPos = camera.camToScreen<float2>(camPos); diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp index 3918a0eab..c81977cf3 100644 --- a/components/renderers/cpp/src/splatter_cuda.hpp +++ b/components/renderers/cpp/src/splatter_cuda.hpp @@ -80,7 +80,7 @@ namespace cuda { ftl::cuda::TextureObject<float> &contrib, const ftl::render::SplatParams ¶ms, const ftl::rgbd::Camera &camera, - const float4x4 &poseInv, cudaStream_t stream); + const float4x4 &transform, const float3x3 &transformR, cudaStream_t stream); template <typename A, typename B> void reproject( diff --git a/components/renderers/cpp/src/tri_render.cpp b/components/renderers/cpp/src/tri_render.cpp index d42d5a0e2..5912b6480 100644 --- a/components/renderers/cpp/src/tri_render.cpp +++ b/components/renderers/cpp/src/tri_render.cpp @@ -228,7 +228,8 @@ void Triangular::__reprojectChannel(ftl::rgbd::Frame &output, ftl::codecs::Chann cv::cuda::cvtColor(tmp,col, cv::COLOR_BGR2BGRA); } - auto poseInv = MatrixConversion::toCUDA(s->getPose().cast<float>().inverse()); + auto transform = MatrixConversion::toCUDA(s->getPose().cast<float>().inverse()) * params_.m_viewMatrixInverse; + auto transformR = MatrixConversion::toCUDA(s->getPose().cast<float>().inverse()).getFloat3x3(); if (mesh_) { ftl::cuda::reproject( @@ -240,7 +241,7 @@ void Triangular::__reprojectChannel(ftl::rgbd::Frame &output, ftl::codecs::Chann 
temp_.getTexture<float>(Channel::Contribution), params_, s->parameters(), - poseInv, stream + transform, transformR, stream ); } else { // Can't use normals with point cloud version @@ -252,7 +253,7 @@ void Triangular::__reprojectChannel(ftl::rgbd::Frame &output, ftl::codecs::Chann temp_.getTexture<float>(Channel::Contribution), params_, s->parameters(), - poseInv, stream + transform, stream ); } } @@ -580,19 +581,19 @@ bool Triangular::render(ftl::rgbd::VirtualSource *src, ftl::rgbd::Frame &out) { if (aligned_source >= 0 && aligned_source < scene_->frames.size()) { // FIXME: Output may not be same resolution as source! cudaSafeCall(cudaStreamSynchronize(stream_)); - scene_->frames[aligned_source].copyTo(Channel::Depth + Channel::Colour + Channel::Smoothing, out); + scene_->frames[aligned_source].copyTo(Channel::Depth + Channel::Colour + Channel::Smoothing + Channel::Confidence, out); - if (chan == Channel::Normals) { + if (chan == Channel::ColourNormals) { // Convert normal to single float value - temp_.create<GpuMat>(Channel::Colour, Format<uchar4>(out.get<GpuMat>(Channel::Colour).size())).setTo(cv::Scalar(0,0,0,0), cvstream); - ftl::cuda::normal_visualise(scene_->frames[aligned_source].getTexture<float4>(Channel::Normals), temp_.createTexture<uchar4>(Channel::Colour), + out.create<GpuMat>(Channel::ColourNormals, Format<uchar4>(out.get<GpuMat>(Channel::Colour).size())).setTo(cv::Scalar(0,0,0,0), cvstream); + ftl::cuda::normal_visualise(scene_->frames[aligned_source].getTexture<float4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals), light_pos_, light_diffuse_, light_ambient_, stream_); // Put in output as single float - cv::cuda::swap(temp_.get<GpuMat>(Channel::Colour), out.create<GpuMat>(Channel::Normals)); - out.resetTexture(Channel::Normals); + //cv::cuda::swap(temp_.get<GpuMat>(Channel::Colour), out.create<GpuMat>(Channel::Normals)); + //out.resetTexture(Channel::Normals); } return true; diff --git a/components/rgbd-sources/include/ftl/rgbd/frame.hpp b/components/rgbd-sources/include/ftl/rgbd/frame.hpp index d7b8292ff..e7a949600 100644 --- a/components/rgbd-sources/include/ftl/rgbd/frame.hpp +++ b/components/rgbd-sources/include/ftl/rgbd/frame.hpp @@ -233,7 +233,7 @@ ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, const template <typename T> ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, bool interpolated) { - if (!channels_.has(c)) throw ftl::exception("createTexture needs a format if the channel does not exist"); + if (!channels_.has(c)) throw ftl::exception(ftl::Formatter() << "createTexture needs a format if the channel does not exist: " << (int)c); auto &m = _get(c); -- GitLab
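
For reviewers who want the new operator in context: the sketch below mirrors the pipeline1->append calls from this patch's main.cpp hunk and lists the configuration keys that MultiViewMLS::apply() reads, with their defaults as set in mvmls.cpp. It is illustrative only and not part of the patch; the ftl::operators::Graph type name, the buildPreRenderPipeline helper, and the exact header providing Graph are assumptions taken from the surrounding application code rather than from this diff.

#include <ftl/operators/operator.hpp>      // assumed location of ftl::operators::Graph
#include <ftl/operators/segmentation.hpp>  // CrossSupport / VisCrossSupport
#include <ftl/operators/mask.hpp>          // DiscontinuityMask / CullDiscontinuity
#include <ftl/operators/mvmls.hpp>         // MultiViewMLS, added by this patch

// Hypothetical helper mirroring the appends inside run(); construction of
// pipeline1 itself is outside this diff.
static void buildPreRenderPipeline(ftl::operators::Graph *pipeline1) {
	pipeline1->append<ftl::operators::CrossSupport>("cross");                 // supplies Channel::Support1, required by MultiViewMLS
	pipeline1->append<ftl::operators::DiscontinuityMask>("discontinuity");
	pipeline1->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
	pipeline1->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false);
	pipeline1->append<ftl::operators::MultiViewMLS>("mvmls");                 // all-source MLS; replaces the commented-out AggreMLS step
}

// Configuration keys read in MultiViewMLS::apply() and their defaults (from mvmls.cpp):
//   mls_iterations        3      number of correspondence + horizontal/vertical MLS passes
//   mls_threshold         0.4    MLS smoothing threshold
//   mls_colour_smoothing  30.0   colour term used by the MLS aggregation
//   mls_radius            5      aggregation window radius
//   cost_function         1      selects the correspondence weight function (templates 0-5)
//   merge_corresponding   true   run the cross-source correspondence / depth-adjust step
//   merge_mls             false  aggregate normals and centroids across sources
//   cull_no_confidence    false  zero-confidence pixels are pushed to depth 1000
//   search_range          0.05   correspondence search range along the ray
//   colour_smooth 150, spatial_smooth 0.04, cost_ratio 0.2, cost_threshold 1.0,
//   fill_match 50, fill_threshold 0, match_threshold 0.3 populate MvMLSParams

Note that MultiViewMLS::apply() also requires Channel::Normals on every input frame (it logs an error and returns false otherwise), so a normal-estimation operator must run earlier in the pipeline.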