diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index ca11bf526ef0e6e141861a79a15e2bf656106a5c..961728d88fb7101ee2b0e99ae8cbde89dd11f9ba 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -540,7 +540,7 @@ void ftl::gui::Camera::keyMovement(int key, int modifiers) {
 		float scalar = (key == 266) ? -mag : mag;
 		neye_ += rotmat_*Eigen::Vector4d(0.0,scalar,0.0,1.0);
 		return;
-	} else if (key >= '0' && key <= '5') {
+	} else if (key >= '0' && key <= '5' && modifiers == 2) {  // Ctrl+NUMBER
 		int ix = key - (int)('0');
 		transform_ix_ = ix-1;
 		return;
diff --git a/applications/gui/src/src_window.cpp b/applications/gui/src/src_window.cpp
index f86a36d58c0b4acd1a10180561760a832c382b7b..556d1f794419547b41ee18464ff0213a967bdb3a 100644
--- a/applications/gui/src/src_window.cpp
+++ b/applications/gui/src/src_window.cpp
@@ -30,6 +30,7 @@
 #include <ftl/operators/disparity.hpp>
 #include <ftl/operators/detectandtrack.hpp>
 #include <ftl/operators/weighting.hpp>
+#include <ftl/operators/mvmls.hpp>
 
 #include <nlohmann/json.hpp>
 
@@ -224,9 +225,10 @@ void SourceWindow::_checkFrameSets(int id) {
 		p->append<ftl::operators::CrossSupport>("cross");
 		p->append<ftl::operators::PixelWeights>("weights");
 		p->append<ftl::operators::CullWeight>("remove_weights")->value("enabled", false);
-		p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
 		p->append<ftl::operators::DegradeWeight>("degrade");
 		p->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false);
+		p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false);
+		p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
 
 		pre_pipelines_.push_back(p);
 		framesets_.push_back(new ftl::rgbd::FrameSet);
diff --git a/components/common/cpp/include/ftl/cuda_texture.hpp b/components/common/cpp/include/ftl/cuda_texture.hpp
index e103b049750b0a39087fcd4dbd21100ddb7815b5..fa6c41866d20257ccd25faf62e3818cdeb3ce1c1 100644
--- a/components/common/cpp/include/ftl/cuda_texture.hpp
+++ b/components/common/cpp/include/ftl/cuda_texture.hpp
@@ -61,6 +61,8 @@ class TextureObjectBase {
 	__host__ void free();
 
 	inline int cvType() const { return cvType_; }
+
+	__host__ __device__ inline bool isValid() const { return ptr_ != nullptr; }
 	
 	protected:
 	cudaTextureObject_t texobj_;
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index a5e9ea7790c40538f55d3911e4fd1a67193736b6..c2e7b791df51bdf57bedf01e9fdeb312461cdaee 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -19,8 +19,11 @@ set(OPERSRC
 	src/mask.cpp
 	src/antialiasing.cpp
 	src/antialiasing.cu
-	src/mvmls.cpp
-	src/correspondence.cu
+	src/fusion/mvmls.cpp
+	src/fusion/correspondence.cu
+	src/fusion/correspondence_depth.cu
+	src/fusion/correspondence_util.cu
+	src/fusion/mls_aggr.cu
 	src/clipping.cpp
 	src/depth.cpp
 	src/detectandtrack.cpp
diff --git a/components/operators/include/ftl/operators/mask_cuda.hpp b/components/operators/include/ftl/operators/mask_cuda.hpp
index 53747d61fd82c96ec1eafdb4f51d3ad322db1222..f780d5178d80a0be6691648e618d48e4ce2031e5 100644
--- a/components/operators/include/ftl/operators/mask_cuda.hpp
+++ b/components/operators/include/ftl/operators/mask_cuda.hpp
@@ -73,6 +73,7 @@ void cull_mask(
 		ftl::cuda::TextureObject<ftl::cuda::Mask::type> &mask,
 		ftl::cuda::TextureObject<float> &depth,
 		ftl::cuda::Mask::type id,
+		bool invert,
 		unsigned int radius,
 		cudaStream_t stream);
 
diff --git a/components/operators/src/correspondence.cu b/components/operators/src/correspondence.cu
deleted file mode 100644
index 0b5ea46075627051b86ac25e1617258be22335a9..0000000000000000000000000000000000000000
--- a/components/operators/src/correspondence.cu
+++ /dev/null
@@ -1,641 +0,0 @@
-#include "mvmls_cuda.hpp"
-#include <ftl/cuda/weighting.hpp>
-#include <ftl/operators/mask_cuda.hpp>
-#include <ftl/cuda/warp.hpp>
-
-using ftl::cuda::TextureObject;
-using ftl::rgbd::Camera;
-using ftl::cuda::Mask;
-using ftl::cuda::MvMLSParams;
-
-#define T_PER_BLOCK 8
-#define WARP_SIZE 32
-#define INTERVAL 1
-
-template<int FUNCTION>
-__device__ float weightFunction(const ftl::cuda::MvMLSParams &params, float dweight, float cweight);
-
-template <>
-__device__ inline float weightFunction<0>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
-	return (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
-}
-
-template <>
-__device__ inline float weightFunction<1>(const ftl::cuda::MvMLSParams &param, float dweight, float cweight) {
-	return (cweight * cweight * dweight);
-}
-
-template <>
-__device__ inline float weightFunction<2>(const ftl::cuda::MvMLSParams &param, float dweight, float cweight) {
-	return (dweight * dweight * cweight);
-}
-
-template <>
-__device__ inline float weightFunction<3>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
-	return (dweight == 0.0f) ? 0.0f : (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
-}
-
-template <>
-__device__ inline float weightFunction<4>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
-	return cweight;
-}
-
-template <>
-__device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
-	return (cweight > 0.0f) ? dweight : 0.0f;
-}
-
-#ifndef PINF
-#define PINF __int_as_float(0x7f800000)
-#endif
-
-__device__ inline float distance(float4 p1, float4 p2) {
-	return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 10.0f);
-	//return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f);
-}
-
-__device__ inline int halfWarpCensus(float e) {
-	float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16 : 0, WARP_SIZE);
-	int c = (e > e0) ? 1 << (threadIdx.x % 16) : 0;
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
-		const int other = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
-		c |= other;
-	}
-	return c;
-}
-
-__device__ inline float halfWarpBest(float e, float c) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
-		const float o1 = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
-		const float o2 = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
-		e = (o2 > c) ? o1 : e;
-	}
-	return e;
-}
-
-__device__ inline float4 relativeDelta(const float4 &e) {
-	const float e0x = __shfl_sync(FULL_MASK, e.x, 0, WARP_SIZE/2);
-	const float e0y = __shfl_sync(FULL_MASK, e.y, 0, WARP_SIZE/2);
-	const float e0z = __shfl_sync(FULL_MASK, e.z, 0, WARP_SIZE/2);
-	return make_float4(e.x-e0x, e.y-e0y, e.z-e0z, 0.0f);
-}
-
-/**
- * See: Birchfield S. et al. (1998). A pixel dissimilarity measure that is
- * insensitive to image sampling. IEEE Transactions on Pattern Analysis and
- * Machine Intelligence.
- */
-__device__ float dissimilarity(const float4 &l, const float4 &rp, const float4 &rc, const float4 &rn) {
-	const float rpd = distance((rc - rp) * 0.5f + rp, l);
-	const float rnd = distance((rc - rn) * 0.5f + rn, l);
-	const float rcd = distance(rc, l);
-	return min(min(rpd, rnd), rcd);
-}
-
-
-template<int COR_STEPS, int FUNCTION> 
-__global__ void corresponding_point_kernel(
-        TextureObject<float> d1,
-        TextureObject<float> d2,
-        TextureObject<uchar4> c1,
-        TextureObject<uchar4> c2,
-        TextureObject<short2> screenOut,
-		TextureObject<float> conf,
-		TextureObject<uint8_t> mask,
-        float4x4 pose,
-        Camera cam1,
-        Camera cam2, ftl::cuda::MvMLSParams params) {
-	
-	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
-	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
-	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
-
-
-    if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
-        screenOut(x,y) = make_short2(-1,-1);
-    
-        //const float3 world1 = make_float3(p1.tex2D(x, y));
-        const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z;  // Initial starting depth
-        if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return;
-
-        // TODO: Temporary hack to ensure depth1 is present
-        //const float4 temp = vout.tex2D(x,y);
-        //vout(x,y) =  make_float4(depth1, 0.0f, temp.z, temp.w);
-        
-        //const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
-
-        const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f);
-
-        //float bestdepth = 0.0f;
-        short2 bestScreen = make_short2(-1,-1);
-		//float bestdepth = 0.0f;
-		//float bestdepth2 = 0.0f;
-        float bestweight = 0.0f;
-        float bestcolour = 0.0f;
-        //float bestdweight = 0.0f;
-        float totalcolour = 0.0f;
-        //int count = 0;
-        //float contrib = 0.0f;
-		
-		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
-        const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
-        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 10.0f);
-        const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
-        const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
-		const float depthM = 10.0f / (lineDistant.x - lineOrigin.x);
-		const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
-        float2 linePos;
-        linePos.x = lineOrigin.x - ((COR_STEPS/2));
-        linePos.y = lineOrigin.y - (((COR_STEPS/2)) * lineM);
-		//float depthPos = depth1 - (float((COR_STEPS/2)) * depthM);
-		float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2);
-		//const float depthCoef = cam1.baseline*cam1.fx;
-
-		const float depthCoef = (1.0f / cam1.fx) * 10.0f;
-        
-        uint badMask = 0;
-        int bestStep = COR_STEPS/2;
-
-
-        // Project to p2 using cam2
-        // Each thread takes a possible correspondence and calculates a weighting
-        //const int lane = tid % WARP_SIZE;
-        for (int i=0; i<COR_STEPS; ++i) {			
-			//float weight = 1.0f; //(linePos.x >= cam2.width || linePos.y >= cam2.height) ? 0.0f : 1.0f;
-
-			// Generate a colour correspondence value
-            const auto colour2 = c2.tex2D(linePos.x, linePos.y);
-
-            // TODO: Check if other colour dissimilarities are better...
-            const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
-
-            // Generate a depth correspondence value
-            const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
-            
-            // Record which correspondences are invalid
-            badMask |= (
-					depth2 <= cam2.minDepth ||
-					depth2 >= cam2.maxDepth ||
-					linePos.x < 0.5f ||
-					linePos.y < 0.5f ||
-					linePos.x >= d2.width()-0.5f ||
-					linePos.y >= d2.height()-0.5f
-				) ? 1 << i : 0;
-			
-			//if (FUNCTION == 1) {
-			// TODO: Spatial smooth must be disparity discon threshold of largest depth in original camera space.
-			// ie. if depth1 > depth2 then depth1 has the largest error potential and the resolution at that
-			// depth is used to determine the spatial smoothing amount.
-	
-			const float maxdepth = max(depth1, depth2);
-			const float smooth = depthCoef * maxdepth;
-			float weight = ftl::cuda::weighting(fabs(depth2 - depthPos2), cweight*smooth);
-			//weight = ftl::cuda::halfWarpSum(weight);
-			//} else {
-			//	const float dweight = ftl::cuda::weighting(fabs(depth2 - depthPos2), params.spatial_smooth);
-            //	weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-			//}
-            //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range);
-
-            //weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-
-            //++count;
-
-            bestcolour = max(cweight, bestcolour);
-            totalcolour += cweight;
-
-            //bestdepth = (weight > bestweight) ? depthPos : bestdepth;
-            bestStep = (weight > bestweight) ? i : bestStep;
-			
-			bestweight = max(bestweight, weight);
-                
-			
-			//depthPos += depthM;
-			depthPos2 += depthM2;
-            linePos.x += 1.0f;
-            linePos.y += lineM;
-        }
-
-        //const float avgcolour = totalcolour/(float)count;
-        const float confidence = ((bestcolour / totalcolour) - (1.0f / 16.0f)) * (1.0f + (1.0f/16.0f));
-        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
-
-        // Detect matches to boundaries, and discard those
-        uint stepMask = 1 << bestStep;
-		if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestweight = 0.0f;
-		
-		//bestadjust = halfWarpBest(bestadjust, (bestweight > 0.0f) ? confidence : 0.0f);
-
-        //Mask m(mask.tex2D(x,y));
-
-        //if (bestweight > 0.0f) {
-            float old = conf.tex2D(x,y);
-
-            if (bestweight > 0.0f) {
-				d1(x,y) = (0.4f*bestadjust) + depth1;
-				//d2(bestScreen.x, bestScreen.y) = bestdepth2;
-                //screenOut(x,y) = bestScreen;
-				conf(x,y) = max(old,confidence); //bestweight * confidence;
-				//conf(x,y) = max(old,fabs(bestadjust));
-            }
-        //}
-        
-        // If a good enough match is found, mark dodgy depth as solid
-        //if ((m.isFilled() || m.isDiscontinuity()) && (bestweight > params.match_threshold)) mask(x,y) = 0;
-    }
-}
-
-void ftl::cuda::correspondence(
-        TextureObject<float> &d1,
-        TextureObject<float> &d2,
-        TextureObject<uchar4> &c1,
-        TextureObject<uchar4> &c2,
-        TextureObject<short2> &screen,
-		TextureObject<float> &conf,
-		TextureObject<uint8_t> &mask,
-        float4x4 &pose2,
-        const Camera &cam1,
-        const Camera &cam2, const MvMLSParams &params, int func,
-        cudaStream_t stream) {
-
-	//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(WARP_SIZE, 2);
-
-    //printf("COR SIZE %d,%d\n", p1.width(), p1.height());
-
-	/*switch (func) {
-    case 0: corresponding_point_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-	case 1: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-	case 2: corresponding_point_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-	case 3: corresponding_point_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-	case 4: corresponding_point_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-	case 5: corresponding_point_kernel<16,5><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    }*/
-    
-    switch (func) {
-    case 32: corresponding_point_kernel<32,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 16: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 8: corresponding_point_kernel<8,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 4: corresponding_point_kernel<4,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 2: corresponding_point_kernel<2,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    }
-
-    cudaSafeCall( cudaGetLastError() );
-}
-
-// ==== Remove zero-confidence =================================================
-
-__global__ void zero_confidence_kernel(
-		TextureObject<float> conf,
-		TextureObject<float> depth) {
-
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	if (x < depth.width() && y < depth.height()) {
-		const float c = conf.tex2D((int)x,(int)y);
-
-		if (c == 0.0f) {
-			depth(x,y) = 1000.0f;	
-		}
-	}
-}
-
-void ftl::cuda::zero_confidence(TextureObject<float> &conf, TextureObject<float> &depth, cudaStream_t stream) {
-	const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	zero_confidence_kernel<<<gridSize, blockSize, 0, stream>>>(conf, depth);
-	cudaSafeCall( cudaGetLastError() );
-}
-
-
-// ==== MultiViewMLS Aggregate =================================================
-
-__device__ inline short3 getScreenPos(int x, int y, float d, const Camera &cam1, const Camera &cam2, const float4x4 &transform) {
-    const float3 campos = transform * cam1.screenToCam(x,y,d);
-    const int2 screen = cam2.camToScreen<int2>(campos);
-    return make_short3(screen.x, screen.y, campos.z);
-}
-
-__device__ inline short2 packScreen(int x, int y, int id) {
-    return make_short2((id << 12) + x, y);
-}
-
-__device__ inline short2 packScreen(const short3 &p, int id) {
-    return make_short2((id << 12) + p.x, p.y);
-}
-
-__device__ inline int supportSize(uchar4 support) {
-    return (support.x+support.y) * (support.z+support.w);
-}
-
-__device__ inline short2 choosePoint(uchar4 sup1, uchar4 sup2, float dot1, float dot2, short2 screen1, short2 screen2) {
-    //return (float(supportSize(sup2))*dot1 > float(supportSize(sup1))*dot2) ? screen2 : screen1;
-    return (dot1 > dot2) ? screen2 : screen1;
-}
-
-__device__ inline int unpackCameraID(short2 p) {
-    return p.x >> 12;
-}
-
-/**
- * Identify which source has the best support region for a given pixel.
- */
-__global__ void best_sources_kernel(
-        TextureObject<half4> normals1,
-        TextureObject<half4> normals2,
-        TextureObject<uchar4> support1,
-        TextureObject<uchar4> support2,
-        TextureObject<float> depth1,
-        TextureObject<float> depth2,
-        TextureObject<short2> screen,
-        float4x4 transform,
-        //float3x3 transformR,
-        ftl::rgbd::Camera cam1,
-        ftl::rgbd::Camera cam2,
-        int id1,
-        int id2) {
-
-    const int x = (blockIdx.x*blockDim.x + threadIdx.x);
-    const int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (x >= 0 && y >= 0 && x < screen.width() && y < screen.height()) {
-        const float d1 = depth1.tex2D(x,y);
-
-        const short3 scr2 = getScreenPos(x, y, d1, cam1, cam2, transform);
-        short2 bestPoint = packScreen(x,y,0);
-
-        if (scr2.x >= 0 && scr2.y >= 0 && scr2.x < cam2.width && scr2.y < cam2.height) {
-            uchar4 sup1 = support1.tex2D(x,y);
-            uchar4 sup2 = support2.tex2D(scr2.x,scr2.y);
-            const float d2 = depth2.tex2D(scr2.x,scr2.y);
-            float3 n1 = transform.getFloat3x3() * make_float3(normals1.tex2D(x,y));
-            float3 n2 = make_float3(normals2.tex2D(scr2.x,scr2.y));
-
-            float3 camray = cam2.screenToCam(scr2.x,scr2.y,1.0f);
-            camray /= length(camray);
-            const float dot1 = dot(camray, n1);
-            const float dot2 = dot(camray, n2);
-
-            bestPoint = (fabs(scr2.z - d2) < 0.04f) ? choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2)) : packScreen(x,y,6);
-            //bestPoint = choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2));
-			//bestPoint = (d1 < d2) ? packScreen(x,y,id1) : packScreen(x,y,id2);
-			
-			bestPoint = (fabs(scr2.z - d2) < 0.04f) ? packScreen(scr2,id2) : packScreen(scr2,id1);
-        }
-
-        screen(x,y) = bestPoint;
-
-        /*if (s.x >= 0 && s.y >= 0) {
-            auto norm1 = make_float3(n1.tex2D(x,y));
-            const auto norm2 = make_float3(n2.tex2D(s.x,s.y));
-            //n2(s.x,s.y) = norm1;
-
-            float3 cent1 = make_float3(c1.tex2D(x,y));
-            const auto cent2 = make_float3(c2.tex2D(s.x,s.y));
-
-            if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f) {
-                norm1 += poseInv1.getFloat3x3() * (pose2.getFloat3x3() * norm2);
-                n1(x,y) = make_float4(norm1, 0.0f);
-				cent1 +=  poseInv1 * (pose2 * cent2);  // FIXME: Transform between camera spaces
-				cent1 /= 2.0f;
-                c1(x,y) = make_float4(cent1, 0.0f);
-                //c2(s.x,s.y) = cent1;
-
-				//contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f;
-            }
-           // contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f;
-        }*/
-    }
-}
-
-void ftl::cuda::best_sources(
-        ftl::cuda::TextureObject<half4> &normals1,
-        ftl::cuda::TextureObject<half4> &normals2,
-        ftl::cuda::TextureObject<uchar4> &support1,
-        ftl::cuda::TextureObject<uchar4> &support2,
-        ftl::cuda::TextureObject<float> &depth1,
-        ftl::cuda::TextureObject<float> &depth2,
-        ftl::cuda::TextureObject<short2> &screen,
-        const float4x4 &transform,
-        const ftl::rgbd::Camera &cam1,
-        const ftl::rgbd::Camera &cam2,
-        int id1,
-        int id2,
-        cudaStream_t stream) {
-
-    const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-    best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(normals1, normals2, support1, support2, depth1, depth2, screen, transform, cam1, cam2, id1, id2);
-    cudaSafeCall( cudaGetLastError() );
-}
-
-/**
- * Identify which source has the best support region for a given pixel.
- */
- __global__ void aggregate_sources_kernel(
-		TextureObject<half4> n1,
-		TextureObject<half4> n2,
-		TextureObject<float4> c1,
-		TextureObject<float4> c2,
-		TextureObject<float> depth1,
-		//TextureObject<float> depth2,
-		//TextureObject<short2> screen,
-		float4x4 transform,
-		//float3x3 transformR,
-		ftl::rgbd::Camera cam1,
-		ftl::rgbd::Camera cam2) {
-
-	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
-	const int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	if (x >= 0 && y >= 0 && x < n1.width() && y < n1.height()) {
-		const float d1 = depth1.tex2D(x,y);
-
-		if (d1 > cam1.minDepth && d1 < cam1.maxDepth) {
-			//const short3 s = getScreenPos(x, y, d1, cam1, cam2, transform);
-			const float3 camPos = transform * cam1.screenToCam(x, y, d1);
-			const int2 s = cam2.camToScreen<int2>(camPos);
-
-			if (s.x >= 0 && s.y >= 0 && s.x < n2.width() && s.y < n2.height()) {
-				auto norm1 = make_float3(n1.tex2D(x,y));
-				const auto norm2 = make_float3(n2.tex2D(s.x,s.y));
-				//n2(s.x,s.y) = norm1;
-
-				float3 cent1 = make_float3(c1.tex2D(x,y));
-				const auto cent2 = transform.getInverse() * make_float3(c2.tex2D(s.x,s.y));
-
-				//printf("MERGING %f\n", length(cent2-cent1));
-
-				if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f && length(cent2-cent1) < 0.04f) {
-					norm1 += norm2;
-					norm1 /= 2.0f;
-					n1(x,y) = make_half4(norm1, 0.0f);
-					cent1 += cent2;
-					cent1 /= 2.0f;
-					c1(x,y) = make_float4(cent1, 0.0f);
-					//c2(s.x,s.y) = cent1;
-
-					//contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f;
-				}
-			// contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f;
-			}
-		}
-	}
-}
-
-void ftl::cuda::aggregate_sources(
-		ftl::cuda::TextureObject<half4> &n1,
-		ftl::cuda::TextureObject<half4> &n2,
-		ftl::cuda::TextureObject<float4> &c1,
-		ftl::cuda::TextureObject<float4> &c2,
-		ftl::cuda::TextureObject<float> &depth1,
-		//ftl::cuda::TextureObject<float> &depth2,
-		//ftl::cuda::TextureObject<short2> &screen,
-		const float4x4 &transform,
-		const ftl::rgbd::Camera &cam1,
-		const ftl::rgbd::Camera &cam2,
-		cudaStream_t stream) {
-
-	const dim3 gridSize((n1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (n1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, depth1, transform, cam1, cam2);
-	cudaSafeCall( cudaGetLastError() );
-}
-
-__device__ static uchar4 HSVtoRGB(int H, float S, float V) {
-	const float C = S * V;
-	const float X = C * (1 - fabs(fmodf(H / 60.0f, 2) - 1));
-	const float m = V - C;
-	float Rs, Gs, Bs;
-
-	if(H >= 0 && H < 60) {
-		Rs = C;
-		Gs = X;
-		Bs = 0;	
-	}
-	else if(H >= 60 && H < 120) {	
-		Rs = X;
-		Gs = C;
-		Bs = 0;	
-	}
-	else if(H >= 120 && H < 180) {
-		Rs = 0;
-		Gs = C;
-		Bs = X;	
-	}
-	else if(H >= 180 && H < 240) {
-		Rs = 0;
-		Gs = X;
-		Bs = C;	
-	}
-	else if(H >= 240 && H < 300) {
-		Rs = X;
-		Gs = 0;
-		Bs = C;	
-	}
-	else {
-		Rs = C;
-		Gs = 0;
-		Bs = X;	
-	}
-
-	return make_uchar4((Bs + m) * 255, (Gs + m) * 255, (Rs + m) * 255, 0);
-}
-
-/**
- * Render each pixel is a colour corresponding to the source camera with the
- * best support window.
- */
- __global__ void vis_best_sources_kernel(
-        TextureObject<short2> screen,
-        TextureObject<uchar4> colour,
-        int myid,
-        int count) {
-
-    const int x = (blockIdx.x*blockDim.x + threadIdx.x);
-    const int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (x >= 0 && y >= 0 && x < colour.width() && y < colour.height()) {
-        short2 s = screen.tex2D(x,y);
-        int id = unpackCameraID(s);
-
-        uchar4 c = HSVtoRGB((360 / count) * id, 0.6f, 0.85f);
-        if (myid != id) colour(x,y) = c;
-        //colour(x,y) = c;
-    }
-}
-
-void ftl::cuda::vis_best_sources(
-        ftl::cuda::TextureObject<short2> &screen,
-        ftl::cuda::TextureObject<uchar4> &colour,
-        int myid,
-        int count,
-        cudaStream_t stream) {
-
-    const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-    vis_best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(screen, colour, myid, count);
-    cudaSafeCall( cudaGetLastError() );
-}
-
-/*void ftl::cuda::aggregate_sources(
-        ftl::cuda::TextureObject<float4> &n1,
-        ftl::cuda::TextureObject<float4> &n2,
-        ftl::cuda::TextureObject<float4> &c1,
-        ftl::cuda::TextureObject<float4> &c2,
-        ftl::cuda::TextureObject<float> &contribs1,
-        ftl::cuda::TextureObject<float> &contribs2,
-		ftl::cuda::TextureObject<short2> &screen,
-		const float4x4 &poseInv1,
-		const float4x4 &pose2,
-        cudaStream_t stream) {
-
-    const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-    aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, contribs1, contribs2, screen, poseInv1, pose2);
-    cudaSafeCall( cudaGetLastError() );
-}*/
-
-// ==== Normalise aggregations =================================================
-
-__global__ void normalise_aggregations_kernel(
-        TextureObject<half4> norms,
-        TextureObject<float4> cents,
-        TextureObject<float> contribs) {
-
-    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (x < norms.width() && y < norms.height()) {
-        const float contrib = contribs.tex2D((int)x,(int)y);
-
-        const auto a = make_float3(norms.tex2D((int)x,(int)y));
-        const auto b = cents.tex2D(x,y);
-        //const float4 normal = normals.tex2D((int)x,(int)y);
-
-		//out(x,y) = (contrib == 0.0f) ? make<B>(a) : make<B>(a / contrib);
-
-        if (contrib > 0.0f) {
-            norms(x,y) = make_half4(a / (contrib+1.0f), 1.0f);
-            cents(x,y) = b / (contrib+1.0f);
-        }
-    }
-}
-
-void ftl::cuda::normalise_aggregations(TextureObject<half4> &norms, TextureObject<float4> &cents, TextureObject<float> &contribs, cudaStream_t stream) {
-    const dim3 gridSize((norms.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norms.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-    normalise_aggregations_kernel<<<gridSize, blockSize, 0, stream>>>(norms, cents, contribs);
-    cudaSafeCall( cudaGetLastError() );
-}
-
diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu
index a2c60def9e436f7cdb007fb15e4275fdcdb9e620..349920dd2d21f8a330c8f2de7ce6aea5e9672185 100644
--- a/components/operators/src/disparity/disp2depth.cu
+++ b/components/operators/src/disparity/disp2depth.cu
@@ -8,7 +8,7 @@ __global__ void d2d_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<
 	for (STRIDE_Y(v,disp.rows)) {
 	for (STRIDE_X(u,disp.cols)) {
 		float d = disp(v,u);
-		depth(v,u) = (d == 0) ? 50.0f : ((cam.baseline*cam.fx) / (d - cam.doffs));
+		depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / (d - cam.doffs));
 	}
 	}
 }
diff --git a/components/operators/src/fusion/correspondence.cu b/components/operators/src/fusion/correspondence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2df21a24daa6f04366152837367593c34183ead4
--- /dev/null
+++ b/components/operators/src/fusion/correspondence.cu
@@ -0,0 +1,276 @@
+#include "mvmls_cuda.hpp"
+#include <ftl/cuda/weighting.hpp>
+#include <ftl/operators/mask_cuda.hpp>
+#include <ftl/cuda/warp.hpp>
+
+using ftl::cuda::TextureObject;
+using ftl::rgbd::Camera;
+using ftl::cuda::Mask;
+using ftl::cuda::MvMLSParams;
+
+#define T_PER_BLOCK 8
+#define WARP_SIZE 32
+#define INTERVAL 1
+
+#include "correspondence_common.hpp"
+
+template<int FUNCTION>
+__device__ float weightFunction(const ftl::cuda::MvMLSParams &params, float dweight, float cweight);
+
+/*template <>
+__device__ inline float weightFunction<0>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
+	return (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
+}
+
+template <>
+__device__ inline float weightFunction<1>(const ftl::cuda::MvMLSParams &param, float dweight, float cweight) {
+	return (cweight * cweight * dweight);
+}
+
+template <>
+__device__ inline float weightFunction<2>(const ftl::cuda::MvMLSParams &param, float dweight, float cweight) {
+	return (dweight * dweight * cweight);
+}
+
+template <>
+__device__ inline float weightFunction<3>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
+	return (dweight == 0.0f) ? 0.0f : (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
+}
+
+template <>
+__device__ inline float weightFunction<4>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
+	return cweight;
+}
+
+template <>
+__device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params, float dweight, float cweight) {
+	return (cweight > 0.0f) ? dweight : 0.0f;
+}*/
+
+__device__ inline float distance(float4 p1, float4 p2) {
+	return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 10.0f);
+	//return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f);
+}
+
+__device__ inline int halfWarpCensus(float e) {
+	float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16 : 0, WARP_SIZE);
+	int c = (e > e0) ? 1 << (threadIdx.x % 16) : 0;
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const int other = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
+		c |= other;
+	}
+	return c;
+}
+
+__device__ inline float halfWarpBest(float e, float c) {
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const float o1 = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		const float o2 = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
+		e = (o2 > c) ? o1 : e;
+	}
+	return e;
+}
+
+__device__ inline float4 relativeDelta(const float4 &e) {
+	const float e0x = __shfl_sync(FULL_MASK, e.x, 0, WARP_SIZE/2);
+	const float e0y = __shfl_sync(FULL_MASK, e.y, 0, WARP_SIZE/2);
+	const float e0z = __shfl_sync(FULL_MASK, e.z, 0, WARP_SIZE/2);
+	return make_float4(e.x-e0x, e.y-e0y, e.z-e0z, 0.0f);
+}
+
+/**
+ * See: Birchfield S. et al. (1998). A pixel dissimilarity measure that is
+ * insensitive to image sampling. IEEE Transactions on Pattern Analysis and
+ * Machine Intelligence.
+ */
+__device__ float dissimilarity(const float4 &l, const float4 &rp, const float4 &rc, const float4 &rn) {
+	const float rpd = distance((rc - rp) * 0.5f + rp, l);
+	const float rnd = distance((rc - rn) * 0.5f + rn, l);
+	const float rcd = distance(rc, l);
+	return min(min(rpd, rnd), rcd);
+}
+
+
+
+template<int COR_STEPS, int FUNCTION> 
+__global__ void corresponding_point_kernel(
+        TextureObject<float> d1,
+        TextureObject<float> d2,
+        TextureObject<uchar4> c1,
+        TextureObject<uchar4> c2,
+        TextureObject<short2> screenOut,
+		TextureObject<float> conf,
+		TextureObject<uint8_t> mask,
+        float4x4 pose,
+        Camera cam1,
+        Camera cam2, ftl::cuda::MvMLSParams params) {
+	
+	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
+	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
+
+
+    if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
+        screenOut(x,y) = make_short2(-1,-1);
+    
+        //const float3 world1 = make_float3(p1.tex2D(x, y));
+        const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z;  // Initial starting depth
+        if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return;
+
+        // TODO: Temporary hack to ensure depth1 is present
+        //const float4 temp = vout.tex2D(x,y);
+        //vout(x,y) =  make_float4(depth1, 0.0f, temp.z, temp.w);
+        
+        //const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
+
+        const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f);
+
+        //float bestdepth = 0.0f;
+        short2 bestScreen = make_short2(-1,-1);
+		//float bestdepth = 0.0f;
+		//float bestdepth2 = 0.0f;
+        float bestweight = 0.0f;
+        float bestcolour = 0.0f;
+        //float bestdweight = 0.0f;
+        float totalcolour = 0.0f;
+        //int count = 0;
+        //float contrib = 0.0f;
+		
+		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
+        const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
+        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 10.0f);
+        const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
+        const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
+		const float depthM = 10.0f / (lineDistant.x - lineOrigin.x);
+		const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+        float2 linePos;
+        linePos.x = lineOrigin.x - ((COR_STEPS/2));
+        linePos.y = lineOrigin.y - (((COR_STEPS/2)) * lineM);
+		//float depthPos = depth1 - (float((COR_STEPS/2)) * depthM);
+		float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2);
+		//const float depthCoef = cam1.baseline*cam1.fx;
+
+		const float depthCoef = (1.0f / cam1.fx) * 10.0f;
+        
+        uint badMask = 0;
+        int bestStep = COR_STEPS/2;
+
+
+        // Project to p2 using cam2
+        // Each thread takes a possible correspondence and calculates a weighting
+        //const int lane = tid % WARP_SIZE;
+        for (int i=0; i<COR_STEPS; ++i) {			
+			//float weight = 1.0f; //(linePos.x >= cam2.width || linePos.y >= cam2.height) ? 0.0f : 1.0f;
+
+			// Generate a colour correspondence value
+            const auto colour2 = c2.tex2D(linePos.x, linePos.y);
+
+            // TODO: Check if other colour dissimilarities are better...
+            const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
+
+            // Generate a depth correspondence value
+            const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
+            
+            // Record which correspondences are invalid
+            badMask |= (
+					depth2 <= cam2.minDepth ||
+					depth2 >= cam2.maxDepth ||
+					linePos.x < 0.5f ||
+					linePos.y < 0.5f ||
+					linePos.x >= d2.width()-0.5f ||
+					linePos.y >= d2.height()-0.5f
+				) ? 1 << i : 0;
+			
+			//if (FUNCTION == 1) {
+			// TODO: Spatial smooth must be disparity discon threshold of largest depth in original camera space.
+			// ie. if depth1 > depth2 then depth1 has the largest error potential and the resolution at that
+			// depth is used to determine the spatial smoothing amount.
+	
+			const float maxdepth = max(depth1, depth2);
+			const float smooth = depthCoef * maxdepth;
+			float weight = ftl::cuda::weighting(fabs(depth2 - depthPos2), cweight*smooth);
+			//weight = ftl::cuda::halfWarpSum(weight);
+			//} else {
+			//	const float dweight = ftl::cuda::weighting(fabs(depth2 - depthPos2), params.spatial_smooth);
+            //	weight *= weightFunction<FUNCTION>(params, dweight, cweight);
+			//}
+            //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range);
+
+            //weight *= weightFunction<FUNCTION>(params, dweight, cweight);
+
+            //++count;
+
+            bestcolour = max(cweight, bestcolour);
+            totalcolour += cweight;
+
+            //bestdepth = (weight > bestweight) ? depthPos : bestdepth;
+            bestStep = (weight > bestweight) ? i : bestStep;
+			
+			bestweight = max(bestweight, weight);
+                
+			
+			//depthPos += depthM;
+			depthPos2 += depthM2;
+            linePos.x += 1.0f;
+            linePos.y += lineM;
+        }
+
+        //const float avgcolour = totalcolour/(float)count;
+        const float confidence = ((bestcolour / totalcolour) - (1.0f / 16.0f)) * (1.0f + (1.0f/16.0f));
+        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+
+        // Detect matches to boundaries, and discard those
+        uint stepMask = 1 << bestStep;
+		if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestweight = 0.0f;
+		
+		//bestadjust = halfWarpBest(bestadjust, (bestweight > 0.0f) ? confidence : 0.0f);
+
+        //Mask m(mask.tex2D(x,y));
+
+        //if (bestweight > 0.0f) {
+            float old = conf.tex2D(x,y);
+
+            if (bestweight > 0.0f) {
+				d1(x,y) = (0.4f*bestadjust) + depth1;
+				//d2(bestScreen.x, bestScreen.y) = bestdepth2;
+                //screenOut(x,y) = bestScreen;
+				conf(x,y) = max(old,confidence); //bestweight * confidence;
+				//conf(x,y) = max(old,fabs(bestadjust));
+            }
+        //}
+        
+        // If a good enough match is found, mark dodgy depth as solid
+        //if ((m.isFilled() || m.isDiscontinuity()) && (bestweight > params.match_threshold)) mask(x,y) = 0;
+    }
+}
+
+void ftl::cuda::correspondence(
+        TextureObject<float> &d1,
+        TextureObject<float> &d2,
+        TextureObject<uchar4> &c1,
+        TextureObject<uchar4> &c2,
+        TextureObject<short2> &screen,
+		TextureObject<float> &conf,
+		TextureObject<uint8_t> &mask,
+        float4x4 &pose2,
+        const Camera &cam1,
+        const Camera &cam2, const MvMLSParams &params, int func,
+        cudaStream_t stream) {
+
+	//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(WARP_SIZE, 2);
+
+    
+    switch (func) {
+    case 32: corresponding_point_kernel<32,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 16: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 8: corresponding_point_kernel<8,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 4: corresponding_point_kernel<4,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 2: corresponding_point_kernel<2,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    }
+
+    cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/fusion/correspondence_common.hpp b/components/operators/src/fusion/correspondence_common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b9405f474bf7ed00fc38d7e3632b826922c4069
--- /dev/null
+++ b/components/operators/src/fusion/correspondence_common.hpp
@@ -0,0 +1,23 @@
+#ifndef _FTL_OPERATORS_MVMLS_CORCOMMON_HPP_
+#define _FTL_OPERATORS_MVMLS_CORCOMMON_HPP_
+
+#ifndef PINF
+#define PINF __int_as_float(0x7f800000)
+#endif
+
+/**
+ * If the corresponding point has no depth or is off screen then it is bad.
+ */
+__device__ inline bool isBadCor(float depth, const float2 &pos, const ftl::rgbd::Camera &cam) {
+	return
+		depth <= cam.minDepth ||
+		depth >= cam.maxDepth ||
+		pos.x < 0.5f ||
+		pos.y < 0.5f ||
+		pos.x >= cam.width-0.5f ||
+		pos.y >= cam.height-0.5f;
+}
+
+__device__ inline float square(float v) { return v*v; }
+
+#endif
diff --git a/components/operators/src/fusion/correspondence_depth.cu b/components/operators/src/fusion/correspondence_depth.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5331526175861b3280d121da2daf2532faa73aee
--- /dev/null
+++ b/components/operators/src/fusion/correspondence_depth.cu
@@ -0,0 +1,146 @@
+#include "mvmls_cuda.hpp"
+#include <ftl/cuda/weighting.hpp>
+#include <ftl/operators/mask_cuda.hpp>
+#include <ftl/cuda/warp.hpp>
+
+using ftl::cuda::TextureObject;
+using ftl::rgbd::Camera;
+using ftl::cuda::Mask;
+using ftl::cuda::MvMLSParams;
+
+#define T_PER_BLOCK 8
+#define WARP_SIZE 32
+
+#include "correspondence_common.hpp"
+
+template<int COR_STEPS> 
+__global__ void corresponding_depth_kernel(
+        TextureObject<float> d1,
+        TextureObject<float> d2,
+        TextureObject<short2> screenOut,
+		TextureObject<float> conf,
+		TextureObject<uint8_t> mask,
+        float4x4 pose,
+        Camera cam1,
+        Camera cam2, ftl::cuda::MvMLSParams params) {
+	
+	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
+	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
+
+	// FIXME: Don't return like this due to warp operations
+    if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
+		screenOut(x,y) = make_short2(-1,-1);
+		conf(x,y) = 0.0f;
+    
+		const float depth1 = d1.tex2D(x,y);
+		// TODO: If all depths within a half warp here are bad then can return
+		// early.
+		//if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return;
+
+        short2 bestScreen = make_short2(-1,-1);
+        float bestcost = PINF;
+		
+		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
+        const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
+        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 0.4f);
+        const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
+        const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
+		const float depthM = 0.4f / (lineDistant.x - lineOrigin.x);
+		const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+        float2 linePos;
+        linePos.x = lineOrigin.x - float((COR_STEPS/2));
+        linePos.y = lineOrigin.y - (float((COR_STEPS/2)) * lineM);
+		float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2);
+
+		// Pre-calculate a depth resolution coeficient
+		const float depthCoef = (1.0f / (cam1.baseline *cam1.fx)) * params.spatial_smooth;
+        
+        uint badMask = 0;
+        int bestStep = COR_STEPS/2;
+		//float bestSource = 0.0f;
+
+		// Scan from -COR_STEPS/2 to +COR_STEPS/2, where COR_STEPS is the number
+		// of pixels to check in the second camera for the current first camera
+		// pixel.
+		// TODO: Could consider sub pixel accuracy using linear interpolation
+		// between the depths of subsequent pixels to find a better minimum.
+		// This depends on GPU resource limitations for the calculation.
+
+		#pragma unroll
+        for (int i=0; i<COR_STEPS; ++i) {			
+			// Read the actual depth at current correspondence check point
+            const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
+            
+			// Record which correspondences are invalid
+            badMask |= (isBadCor(depth2, linePos, cam2)) ? 1 << i : 0;
+	
+			// Sum of square of normalised depth error from both cameras
+			// Normalised using depth resolution in respective camera, this
+			// should bias in favour of moving the lower res camera point
+			// Effectively this is a squared distance from ideal location.
+			float cost = square(min(1.0f, fabs(depth2-depthPos2) / (depthCoef * depth2 * depth2)));
+			cost += square(min(1.0f, (fabs(float(i-COR_STEPS/2))*depthM) / (depthCoef * depth1 * depth1)));
+			
+			// TODO: Perhaps allow per pixel variations but with additional cost
+			// the greater they are from the warp concensus best cost
+
+			// Sum all squared distance errors in a 4x4 pixel neighborhood
+			cost = ftl::cuda::halfWarpSum(cost);
+
+			// Record the best result
+            bestStep = (cost < bestcost) ? i : bestStep;
+			bestScreen = (cost < bestcost) ? make_short2(linePos.x+0.5f,linePos.y+0.5f) : bestScreen;
+			bestcost = min(bestcost, cost);
+                
+			
+			// Move to next correspondence check point
+			depthPos2 += depthM2;
+            linePos.x += 1.0f;
+            linePos.y += lineM;
+        }
+
+        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+
+        // Detect matches to boundaries, and discard those
+        uint stepMask = 1 << bestStep;
+		if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestcost = PINF;
+
+		// Check that no boundaries were matched in 4x4 block
+		bestcost = ftl::cuda::halfWarpMax(bestcost);
+		
+		// If a match was found then record it
+		if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < PINF) {
+			// Delay making the depth change until later.
+			conf(x,y) = bestadjust;
+			mask(x,y) = mask(x,y) | Mask::kMask_Correspondence;
+			screenOut(x,y) = bestScreen;
+		}
+    }
+}
+
+void ftl::cuda::correspondence(
+		TextureObject<float> &d1,
+		TextureObject<float> &d2,
+		TextureObject<short2> &screen,
+		TextureObject<float> &conf,
+		TextureObject<uint8_t> &mask,
+		float4x4 &pose2,
+		const Camera &cam1,
+		const Camera &cam2, const MvMLSParams &params, int func,
+		cudaStream_t stream) {
+
+	// FIXME: Check the grid size makes sense
+	const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(WARP_SIZE, 2);
+
+	switch (func) {
+	case 32: corresponding_depth_kernel<32><<<gridSize, blockSize, 0, stream>>>(d1, d2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 16: corresponding_depth_kernel<16><<<gridSize, blockSize, 0, stream>>>(d1, d2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 8: corresponding_depth_kernel<8><<<gridSize, blockSize, 0, stream>>>(d1, d2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 4: corresponding_depth_kernel<4><<<gridSize, blockSize, 0, stream>>>(d1, d2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 2: corresponding_depth_kernel<2><<<gridSize, blockSize, 0, stream>>>(d1, d2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	}
+
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/fusion/correspondence_util.cu b/components/operators/src/fusion/correspondence_util.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ca1b8a82a81be469de9b8e87d20cc8260297578e
--- /dev/null
+++ b/components/operators/src/fusion/correspondence_util.cu
@@ -0,0 +1,100 @@
+#include "mvmls_cuda.hpp"
+#include <ftl/cuda/weighting.hpp>
+#include <ftl/operators/mask_cuda.hpp>
+#include <ftl/cuda/warp.hpp>
+
+using ftl::cuda::TextureObject;
+using ftl::rgbd::Camera;
+using ftl::cuda::Mask;
+using ftl::cuda::MvMLSParams;
+
+#define T_PER_BLOCK 8
+#define WARP_SIZE 32
+
+#include "correspondence_common.hpp"
+
+// ==== Remove zero-confidence =================================================
+
+__global__ void zero_confidence_kernel(
+		TextureObject<float> conf,
+		TextureObject<float> depth) {
+
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		const float c = conf.tex2D((int)x,(int)y);
+
+		if (c == 0.0f) {
+			depth(x,y) = 1000.0f;	
+		}
+	}
+}
+
+void ftl::cuda::zero_confidence(TextureObject<float> &conf, TextureObject<float> &depth, cudaStream_t stream) {
+	const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	zero_confidence_kernel<<<gridSize, blockSize, 0, stream>>>(conf, depth);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+// ==== Show correspondence errors =============================================
+
+__global__ void show_cor_error_kernel(
+		TextureObject<uchar4> colour,
+		TextureObject<short2> screen1,
+		TextureObject<short2> screen2) {
+
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < colour.width() && y < colour.height()) {
+		short2 s1 = screen1.tex2D(x,y);
+
+		if (s1.x >= 0 && s1.x < screen2.width() && s1.y < screen2.height()) {
+			short2 s2 = screen2.tex2D(s1.x, s1.y);
+
+			float l = sqrt(square(s2.x-x) + square(s2.y-y));
+			float nl = min(1.0f, l/5.0f);
+			//colour(x,y) = (l < 1.0f) ? make_uchar4(0,255,0,0) : (s2.x < 0) ? make_uchar4(255.0f, 0.0f, 0.0f, 0.0f) : make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f);
+			if (nl < 1.0f && s2.x >= 0) colour(x,y) = make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f);
+		}
+	}
+}
+
+void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<short2> &screen1, TextureObject<short2> &screen2, cudaStream_t stream) {
+	const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	show_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen1, screen2);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+
+// ==== Show depth adjustments =================================================
+
+__global__ void show_depth_adjust_kernel(
+		TextureObject<uchar4> colour,
+		TextureObject<float> adjust) {
+
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < colour.width() && y < colour.height()) {
+		float a = adjust.tex2D(x,y);
+
+		if (a != 0.0f) {
+			float nc = min(1.0f, fabsf(a)/0.04f);
+			colour(x,y) = make_uchar4(0.0f, (1.0f-nc)*255.0f, nc*255.0f, 0.0f);
+		}
+	}
+}
+
+void ftl::cuda::show_depth_adjustment(TextureObject<uchar4> &colour, TextureObject<float> &adjust, cudaStream_t stream) {
+	const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	show_depth_adjust_kernel<<<gridSize, blockSize, 0, stream>>>(colour, adjust);
+	cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/fusion/mls_aggr.cu b/components/operators/src/fusion/mls_aggr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9332117bc11f382f418151c5305f2d694cf3e7e0
--- /dev/null
+++ b/components/operators/src/fusion/mls_aggr.cu
@@ -0,0 +1,341 @@
+#include "mvmls_cuda.hpp"
+#include <ftl/cuda/weighting.hpp>
+#include <ftl/operators/mask_cuda.hpp>
+#include <ftl/cuda/warp.hpp>
+
+using ftl::cuda::TextureObject;
+using ftl::rgbd::Camera;
+using ftl::cuda::Mask;
+using ftl::cuda::MvMLSParams;
+
+#define T_PER_BLOCK 8
+#define WARP_SIZE 32
+
+#include "correspondence_common.hpp"
+
+
+// ==== MultiViewMLS Aggregate =================================================
+
+__device__ inline short3 getScreenPos(int x, int y, float d, const Camera &cam1, const Camera &cam2, const float4x4 &transform) {
+    const float3 campos = transform * cam1.screenToCam(x,y,d);
+    const int2 screen = cam2.camToScreen<int2>(campos);
+    return make_short3(screen.x, screen.y, campos.z);
+}
+
+__device__ inline short2 packScreen(int x, int y, int id) {
+    return make_short2((id << 12) + x, y);
+}
+
+__device__ inline short2 packScreen(const short3 &p, int id) {
+    return make_short2((id << 12) + p.x, p.y);
+}
+
+__device__ inline int supportSize(uchar4 support) {
+    return (support.x+support.y) * (support.z+support.w);
+}
+
+__device__ inline short2 choosePoint(uchar4 sup1, uchar4 sup2, float dot1, float dot2, short2 screen1, short2 screen2) {
+    //return (float(supportSize(sup2))*dot1 > float(supportSize(sup1))*dot2) ? screen2 : screen1;
+    return (dot1 > dot2) ? screen2 : screen1;
+}
+
+__device__ inline int unpackCameraID(short2 p) {
+    return p.x >> 12;
+}
+
+/**
+ * Identify which source has the best support region for a given pixel.
+ */
+__global__ void best_sources_kernel(
+        TextureObject<half4> normals1,
+        TextureObject<half4> normals2,
+        TextureObject<uchar4> support1,
+        TextureObject<uchar4> support2,
+        TextureObject<float> depth1,
+        TextureObject<float> depth2,
+        TextureObject<short2> screen,
+        float4x4 transform,
+        //float3x3 transformR,
+        ftl::rgbd::Camera cam1,
+        ftl::rgbd::Camera cam2,
+        int id1,
+        int id2) {
+
+    const int x = (blockIdx.x*blockDim.x + threadIdx.x);
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x >= 0 && y >= 0 && x < screen.width() && y < screen.height()) {
+        const float d1 = depth1.tex2D(x,y);
+
+        const short3 scr2 = getScreenPos(x, y, d1, cam1, cam2, transform);
+        short2 bestPoint = packScreen(x,y,0);
+
+        if (scr2.x >= 0 && scr2.y >= 0 && scr2.x < cam2.width && scr2.y < cam2.height) {
+            uchar4 sup1 = support1.tex2D(x,y);
+            uchar4 sup2 = support2.tex2D(scr2.x,scr2.y);
+            const float d2 = depth2.tex2D(scr2.x,scr2.y);
+            float3 n1 = transform.getFloat3x3() * make_float3(normals1.tex2D(x,y));
+            float3 n2 = make_float3(normals2.tex2D(scr2.x,scr2.y));
+
+            float3 camray = cam2.screenToCam(scr2.x,scr2.y,1.0f);
+            camray /= length(camray);
+            const float dot1 = dot(camray, n1);
+            const float dot2 = dot(camray, n2);
+
+            bestPoint = (fabs(scr2.z - d2) < 0.04f) ? choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2)) : packScreen(x,y,6);
+            //bestPoint = choosePoint(sup1, sup2, dot1, dot2, packScreen(x,y,id1), packScreen(scr2,id2));
+			//bestPoint = (d1 < d2) ? packScreen(x,y,id1) : packScreen(x,y,id2);
+			
+			bestPoint = (fabs(scr2.z - d2) < 0.04f) ? packScreen(scr2,id2) : packScreen(scr2,id1);
+        }
+
+        screen(x,y) = bestPoint;
+
+        /*if (s.x >= 0 && s.y >= 0) {
+            auto norm1 = make_float3(n1.tex2D(x,y));
+            const auto norm2 = make_float3(n2.tex2D(s.x,s.y));
+            //n2(s.x,s.y) = norm1;
+
+            float3 cent1 = make_float3(c1.tex2D(x,y));
+            const auto cent2 = make_float3(c2.tex2D(s.x,s.y));
+
+            if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f) {
+                norm1 += poseInv1.getFloat3x3() * (pose2.getFloat3x3() * norm2);
+                n1(x,y) = make_float4(norm1, 0.0f);
+				cent1 +=  poseInv1 * (pose2 * cent2);  // FIXME: Transform between camera spaces
+				cent1 /= 2.0f;
+                c1(x,y) = make_float4(cent1, 0.0f);
+                //c2(s.x,s.y) = cent1;
+
+				//contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f;
+            }
+           // contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f;
+        }*/
+    }
+}
+
+void ftl::cuda::best_sources(
+        ftl::cuda::TextureObject<half4> &normals1,
+        ftl::cuda::TextureObject<half4> &normals2,
+        ftl::cuda::TextureObject<uchar4> &support1,
+        ftl::cuda::TextureObject<uchar4> &support2,
+        ftl::cuda::TextureObject<float> &depth1,
+        ftl::cuda::TextureObject<float> &depth2,
+        ftl::cuda::TextureObject<short2> &screen,
+        const float4x4 &transform,
+        const ftl::rgbd::Camera &cam1,
+        const ftl::rgbd::Camera &cam2,
+        int id1,
+        int id2,
+        cudaStream_t stream) {
+
+    const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+    best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(normals1, normals2, support1, support2, depth1, depth2, screen, transform, cam1, cam2, id1, id2);
+    cudaSafeCall( cudaGetLastError() );
+}
+
+/**
+ * Identify which source has the best support region for a given pixel.
+ */
+ __global__ void aggregate_sources_kernel(
+		TextureObject<half4> n1,
+		TextureObject<half4> n2,
+		TextureObject<float4> c1,
+		TextureObject<float4> c2,
+		TextureObject<float> depth1,
+		//TextureObject<float> depth2,
+		//TextureObject<short2> screen,
+		float4x4 transform,
+		//float3x3 transformR,
+		ftl::rgbd::Camera cam1,
+		ftl::rgbd::Camera cam2) {
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x >= 0 && y >= 0 && x < n1.width() && y < n1.height()) {
+		const float d1 = depth1.tex2D(x,y);
+
+		if (d1 > cam1.minDepth && d1 < cam1.maxDepth) {
+			//const short3 s = getScreenPos(x, y, d1, cam1, cam2, transform);
+			const float3 camPos = transform * cam1.screenToCam(x, y, d1);
+			const int2 s = cam2.camToScreen<int2>(camPos);
+
+			if (s.x >= 0 && s.y >= 0 && s.x < n2.width() && s.y < n2.height()) {
+				auto norm1 = make_float3(n1.tex2D(x,y));
+				const auto norm2 = make_float3(n2.tex2D(s.x,s.y));
+				//n2(s.x,s.y) = norm1;
+
+				float3 cent1 = make_float3(c1.tex2D(x,y));
+				const auto cent2 = transform.getInverse() * make_float3(c2.tex2D(s.x,s.y));
+
+				//printf("MERGING %f\n", length(cent2-cent1));
+
+				if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f && length(cent2-cent1) < 0.04f) {
+					norm1 += norm2;
+					norm1 /= 2.0f;
+					n1(x,y) = make_half4(norm1, 0.0f);
+					cent1 += cent2;
+					cent1 /= 2.0f;
+					c1(x,y) = make_float4(cent1, 0.0f);
+					//c2(s.x,s.y) = cent1;
+
+					//contribs1(x,y) = contribs1.tex2D(x,y) + 1.0f;
+				}
+			// contribs2(s.x,s.y) = contribs2.tex2D(s.x,s.y) + 1.0f;
+			}
+		}
+	}
+}
+
+void ftl::cuda::aggregate_sources(
+		ftl::cuda::TextureObject<half4> &n1,
+		ftl::cuda::TextureObject<half4> &n2,
+		ftl::cuda::TextureObject<float4> &c1,
+		ftl::cuda::TextureObject<float4> &c2,
+		ftl::cuda::TextureObject<float> &depth1,
+		//ftl::cuda::TextureObject<float> &depth2,
+		//ftl::cuda::TextureObject<short2> &screen,
+		const float4x4 &transform,
+		const ftl::rgbd::Camera &cam1,
+		const ftl::rgbd::Camera &cam2,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((n1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (n1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, depth1, transform, cam1, cam2);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+__device__ static uchar4 HSVtoRGB(int H, float S, float V) {
+	const float C = S * V;
+	const float X = C * (1 - fabs(fmodf(H / 60.0f, 2) - 1));
+	const float m = V - C;
+	float Rs, Gs, Bs;
+
+	if(H >= 0 && H < 60) {
+		Rs = C;
+		Gs = X;
+		Bs = 0;	
+	}
+	else if(H >= 60 && H < 120) {	
+		Rs = X;
+		Gs = C;
+		Bs = 0;	
+	}
+	else if(H >= 120 && H < 180) {
+		Rs = 0;
+		Gs = C;
+		Bs = X;	
+	}
+	else if(H >= 180 && H < 240) {
+		Rs = 0;
+		Gs = X;
+		Bs = C;	
+	}
+	else if(H >= 240 && H < 300) {
+		Rs = X;
+		Gs = 0;
+		Bs = C;	
+	}
+	else {
+		Rs = C;
+		Gs = 0;
+		Bs = X;	
+	}
+
+	return make_uchar4((Bs + m) * 255, (Gs + m) * 255, (Rs + m) * 255, 0);
+}
+
+/**
+ * Render each pixel is a colour corresponding to the source camera with the
+ * best support window.
+ */
+ __global__ void vis_best_sources_kernel(
+        TextureObject<short2> screen,
+        TextureObject<uchar4> colour,
+        int myid,
+        int count) {
+
+    const int x = (blockIdx.x*blockDim.x + threadIdx.x);
+    const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x >= 0 && y >= 0 && x < colour.width() && y < colour.height()) {
+        short2 s = screen.tex2D(x,y);
+        int id = unpackCameraID(s);
+
+        uchar4 c = HSVtoRGB((360 / count) * id, 0.6f, 0.85f);
+        if (myid != id) colour(x,y) = c;
+        //colour(x,y) = c;
+    }
+}
+
+void ftl::cuda::vis_best_sources(
+        ftl::cuda::TextureObject<short2> &screen,
+        ftl::cuda::TextureObject<uchar4> &colour,
+        int myid,
+        int count,
+        cudaStream_t stream) {
+
+    const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+    vis_best_sources_kernel<<<gridSize, blockSize, 0, stream>>>(screen, colour, myid, count);
+    cudaSafeCall( cudaGetLastError() );
+}
+
+/*void ftl::cuda::aggregate_sources(
+        ftl::cuda::TextureObject<float4> &n1,
+        ftl::cuda::TextureObject<float4> &n2,
+        ftl::cuda::TextureObject<float4> &c1,
+        ftl::cuda::TextureObject<float4> &c2,
+        ftl::cuda::TextureObject<float> &contribs1,
+        ftl::cuda::TextureObject<float> &contribs2,
+		ftl::cuda::TextureObject<short2> &screen,
+		const float4x4 &poseInv1,
+		const float4x4 &pose2,
+        cudaStream_t stream) {
+
+    const dim3 gridSize((screen.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+    aggregate_sources_kernel<<<gridSize, blockSize, 0, stream>>>(n1, n2, c1, c2, contribs1, contribs2, screen, poseInv1, pose2);
+    cudaSafeCall( cudaGetLastError() );
+}*/
+
+// ==== Normalise aggregations =================================================
+
+__global__ void normalise_aggregations_kernel(
+        TextureObject<half4> norms,
+        TextureObject<float4> cents,
+        TextureObject<float> contribs) {
+
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (x < norms.width() && y < norms.height()) {
+        const float contrib = contribs.tex2D((int)x,(int)y);
+
+        const auto a = make_float3(norms.tex2D((int)x,(int)y));
+        const auto b = cents.tex2D(x,y);
+        //const float4 normal = normals.tex2D((int)x,(int)y);
+
+		//out(x,y) = (contrib == 0.0f) ? make<B>(a) : make<B>(a / contrib);
+
+        if (contrib > 0.0f) {
+            norms(x,y) = make_half4(a / (contrib+1.0f), 1.0f);
+            cents(x,y) = b / (contrib+1.0f);
+        }
+    }
+}
+
+void ftl::cuda::normalise_aggregations(TextureObject<half4> &norms, TextureObject<float4> &cents, TextureObject<float> &contribs, cudaStream_t stream) {
+    const dim3 gridSize((norms.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norms.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+    normalise_aggregations_kernel<<<gridSize, blockSize, 0, stream>>>(norms, cents, contribs);
+    cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/fusion/mvmls.cpp b/components/operators/src/fusion/mvmls.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d7ef03d9701ecedd906cd999d01ef3ce02e4231
--- /dev/null
+++ b/components/operators/src/fusion/mvmls.cpp
@@ -0,0 +1,409 @@
+#include <ftl/operators/mvmls.hpp>
+#include "smoothing_cuda.hpp"
+#include <ftl/utility/matrix_conversion.hpp>
+#include "mvmls_cuda.hpp"
+
+#include <opencv2/cudaarithm.hpp>
+
+using ftl::operators::MultiViewMLS;
+using ftl::codecs::Channel;
+using cv::cuda::GpuMat;
+using ftl::rgbd::Format;
+
+MultiViewMLS::MultiViewMLS(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) {
+
+}
+
+MultiViewMLS::~MultiViewMLS() {
+
+}
+
+bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) {
+    cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+
+   // float thresh = config()->value("mls_threshold", 0.4f);
+	float disconPixels = config()->value("discon_pixels", 100.0f);  // Max before definitely not same surface
+	
+	float col_smooth = config()->value("mls_colour_smoothing", 30.0f);
+	int iters = config()->value("mls_iterations", 3);
+	int radius = config()->value("mls_radius",5);
+	//bool aggre = config()->value("aggregation", true);
+    //int win = config()->value("cost_function",1);
+    int win = config()->value("window_size",16);
+    bool do_corr = config()->value("merge_corresponding", true);
+	bool do_aggr = config()->value("merge_mls", false);
+	bool cull_zero = config()->value("cull_no_confidence", false);
+    //bool show_best_source = config()->value("show_pixel_source", false);
+
+    ftl::cuda::MvMLSParams params;
+    params.colour_smooth = config()->value("colour_smooth", 150.0f);
+    params.spatial_smooth = config()->value("spatial_smooth", 1.0f);
+
+	bool show_consistency = config()->value("show_consistency", false);
+	bool show_adjustment = config()->value("show_adjustment", false);
+
+    if (in.frames.size() < 1) return false;
+    auto size = in.frames[0].get<GpuMat>(Channel::Depth).size();
+
+    // Make sure we have enough buffers
+    while (normals_horiz_.size() < in.frames.size()) {
+        normals_horiz_.push_back(new ftl::cuda::TextureObject<half4>(size.height, size.width));
+        centroid_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width));
+        centroid_vert_.push_back(new ftl::cuda::TextureObject<float4>(size.width, size.height));
+        contributions_.push_back(new ftl::cuda::TextureObject<float>(size.width, size.height));
+    }
+
+    // Make sure all buffers are at correct resolution and are allocated
+    for (size_t i=0; i<in.frames.size(); ++i) {
+        auto &f = in.frames[i];
+	    auto size = f.get<GpuMat>(Channel::Depth).size();
+	    centroid_horiz_[i]->create(size.height, size.width);
+	    normals_horiz_[i]->create(size.height, size.width);
+	    centroid_vert_[i]->create(size.width, size.height);
+        contributions_[i]->create(size.width, size.height);
+
+        if (!f.hasChannel(Channel::Normals)) {
+            throw FTL_Error("Required normals channel missing for MLS");
+        }
+        if (!f.hasChannel(Channel::Support2) && !f.hasChannel(Channel::Support1)) {
+            throw FTL_Error("Required cross support channel missing for MLS");
+        }
+
+        // Create required channels
+        f.create<GpuMat>(Channel::Confidence, Format<float>(size));
+        f.createTexture<float>(Channel::Confidence);
+        f.create<GpuMat>(Channel::Screen, Format<short2>(size));
+        f.createTexture<short2>(Channel::Screen);
+
+        f.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
+    }
+
+    //for (int iter=0; iter<iters; ++iter) {
+		// Step 1:
+		// Calculate correspondences and adjust depth values
+		// Step 2:
+        // Find corresponding points and perform aggregation of any correspondences
+        // For each camera combination
+        if (do_corr) {
+            for (size_t i=0; i<in.frames.size(); ++i) {
+                auto &f1 = in.frames[i];
+                //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
+                //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
+
+                Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
+                d1 = f1.getPose() * d1;
+
+                for (size_t j=i+1; j<in.frames.size(); ++j) {
+                    if (i == j) continue;
+
+                    //LOG(INFO) << "Running phase1";
+
+                    auto &f2 = in.frames[j];
+                    //auto s1 = in.sources[i];
+                    //auto s2 = in.sources[j];
+
+                    // Are cameras facing similar enough direction?
+                    Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
+                    d2 = f2.getPose() * d2;
+                    // No, so skip this combination
+                    if (d1.dot(d2) <= 0.0) continue;
+
+                    auto pose = MatrixConversion::toCUDA(f1.getPose().cast<float>().inverse() * f2.getPose().cast<float>());
+                    auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>().inverse() * f1.getPose().cast<float>());
+
+                    //auto transform = pose2 * pose1;
+
+                    //Calculate screen positions of estimated corresponding points
+                    //if (iter == 0) {
+                        ftl::cuda::correspondence(
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<short2>(Channel::Screen),
+                            f1.getTexture<float>(Channel::Confidence),
+                            f1.getTexture<uint8_t>(Channel::Mask),
+                            pose2,
+                            f1.getLeftCamera(),
+                            f2.getLeftCamera(),
+                            params,
+                            win,
+                            stream
+                        );
+                        ftl::cuda::correspondence(
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<short2>(Channel::Screen),
+                            f2.getTexture<float>(Channel::Confidence),
+                            f2.getTexture<uint8_t>(Channel::Mask),
+                            pose,
+                            f2.getLeftCamera(),
+                            f1.getLeftCamera(),
+                            params,
+                            win,
+                            stream
+                        );
+
+						if (show_consistency) {
+							ftl::cuda::show_cor_error(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f2.getTexture<short2>(Channel::Screen), stream);
+							ftl::cuda::show_cor_error(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f1.getTexture<short2>(Channel::Screen), stream);
+						}
+
+						if (show_adjustment) {
+							ftl::cuda::show_depth_adjustment(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<float>(Channel::Confidence), stream);
+							ftl::cuda::show_depth_adjustment(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<float>(Channel::Confidence), stream);
+						}
+
+						// Actually perform the adjustment
+						cv::cuda::add(f1.get<GpuMat>(Channel::Depth), f1.get<GpuMat>(Channel::Confidence), f1.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
+						cv::cuda::add(f2.get<GpuMat>(Channel::Depth), f2.get<GpuMat>(Channel::Confidence), f2.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
+                    //} //else {
+                        /*ftl::cuda::correspondence(
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<uchar4>(Channel::Colour),
+                            f2.getTexture<uchar4>(Channel::Colour),
+                            // TODO: Add normals and other things...
+                            f1.getTexture<short2>(Channel::Screen),
+                            f1.getTexture<float>(Channel::Confidence),
+                            f1.getTexture<uint8_t>(Channel::Mask),
+                            pose2,
+                            f1.getLeftCamera(),
+                            f2.getLeftCamera(),
+                            params,
+                            win,
+                            stream
+                        );
+                        ftl::cuda::correspondence(
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<uchar4>(Channel::Colour),
+                            f1.getTexture<uchar4>(Channel::Colour),
+                            // TODO: Add normals and other things...
+                            f2.getTexture<short2>(Channel::Screen),
+                            f2.getTexture<float>(Channel::Confidence),
+                            f2.getTexture<uint8_t>(Channel::Mask),
+                            pose,
+                            f2.getLeftCamera(),
+                            f1.getLeftCamera(),
+                            params,
+                            win,
+                            stream
+                        );*/
+                    //}
+
+                    // Also calculate best source for each point
+                    /*ftl::cuda::best_sources(
+                        f1.getTexture<uchar4>(Channel::Support1),
+                        f2.getTexture<uchar4>(Channel::Support1),
+                        f1.getTexture<float>(Channel::Depth),
+                        f2.getTexture<float>(Channel::Depth),
+                        f1.getTexture<short2>(Channel::Screen),
+                        transform,
+                        s1->parameters(),
+                        s2->parameters(),
+                        i, j, stream
+                    );*/
+				}
+			}
+
+            // Reduce window size for next iteration
+            //win = max(win>>1, 4);
+		}
+
+	for (int iter=0; iter<iters; ++iter) {	
+
+        // Find best source for every pixel
+        /*for (size_t i=0; i<in.frames.size(); ++i) {
+            auto &f1 = in.frames[i];
+            //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
+            //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
+
+            Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
+            d1 = in.sources[i]->getPose() * d1;
+
+            for (size_t j=0; j<in.frames.size(); ++j) {
+                if (i == j) continue;
+
+                //LOG(INFO) << "Running phase1";
+
+                auto &f2 = in.frames[j];
+                auto s1 = in.sources[i];
+                auto s2 = in.sources[j];
+
+                // Are cameras facing similar enough direction?
+                Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
+                d2 = in.sources[j]->getPose() * d2;
+                // No, so skip this combination
+                if (d1.dot(d2) <= 0.0) continue;
+
+                auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>());
+                auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse());
+                auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse());
+                auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>());
+
+                auto transform = pose2 * pose1;
+
+                // Also calculate best source for each point
+                ftl::cuda::best_sources(
+                    f1.getTexture<float4>(Channel::Normals),
+                    f2.getTexture<float4>(Channel::Normals),
+                    f1.getTexture<uchar4>(Channel::Support1),
+                    f2.getTexture<uchar4>(Channel::Support1),
+                    f1.getTexture<float>(Channel::Depth),
+                    f2.getTexture<float>(Channel::Depth),
+                    f1.getTexture<short2>(Channel::Screen),
+                    transform,
+                    s1->parameters(),
+                    s2->parameters(),
+                    i, j, stream
+                );
+            }
+        }*/
+
+		if (radius > 0) {
+			// Step 2:
+			// Do the horizontal and vertical MLS aggregations for each source
+			// But don't do the final move step.
+			for (size_t i=0; i<in.frames.size(); ++i) {
+				auto &f = in.frames[i];
+				//auto *s = in.sources[i];
+
+				// Clear data
+				//cv::cuda::GpuMat data(contributions_[i]->height(), contributions_[i]->width(), CV_32F, contributions_[i]->devicePtr(), contributions_[i]->pixelPitch());
+				//data.setTo(cv::Scalar(0.0f), cvstream);
+
+				if (cull_zero && iter == iters-1) {
+					ftl::cuda::zero_confidence(
+						f.getTexture<float>(Channel::Confidence),
+						f.getTexture<float>(Channel::Depth),
+						stream
+					);
+				}
+
+				float thresh = (1.0f / f.getLeft().fx) * disconPixels;
+
+				ftl::cuda::mls_aggr_horiz(
+					f.createTexture<uchar4>((f.hasChannel(Channel::Support2)) ? Channel::Support2 : Channel::Support1),
+					f.createTexture<half4>(Channel::Normals),
+					*normals_horiz_[i],
+					f.createTexture<float>(Channel::Depth),
+					*centroid_horiz_[i],
+					f.createTexture<uchar4>(Channel::Colour),
+					thresh,
+					col_smooth,
+					radius,
+					f.getLeftCamera(),
+					stream
+				);
+
+				ftl::cuda::mls_aggr_vert(
+					f.getTexture<uchar4>((f.hasChannel(Channel::Support2)) ? Channel::Support2 : Channel::Support1),
+					*normals_horiz_[i],
+					f.getTexture<half4>(Channel::Normals),
+					*centroid_horiz_[i],
+					*centroid_vert_[i],
+					thresh,
+					col_smooth,
+					radius,
+					f.getLeftCamera(),
+					stream
+				);
+			}
+
+			//return true;
+
+
+			// Step 3:
+			// Find corresponding points and perform aggregation of any correspondences
+			// For each camera combination
+			if (do_aggr) {
+				for (size_t i=0; i<in.frames.size(); ++i) {
+					auto &f1 = in.frames[i];
+					//f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
+					//f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
+
+					Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
+					d1 = f1.getPose() * d1;
+
+					for (size_t j=0; j<in.frames.size(); ++j) {
+						if (i == j) continue;
+
+						//LOG(INFO) << "Running phase1";
+
+						auto &f2 = in.frames[j];
+						//auto s1 = in.sources[i];
+						//auto s2 = in.sources[j];
+
+						// Are cameras facing similar enough direction?
+						Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
+						d2 = f2.getPose() * d2;
+						// No, so skip this combination
+						if (d1.dot(d2) <= 0.0) continue;
+
+						auto pose1 = MatrixConversion::toCUDA(f1.getPose().cast<float>());
+						//auto pose1_inv = MatrixConversion::toCUDA(f1.getPose().cast<float>().inverse());
+						auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>().inverse());
+						//auto pose2_inv = MatrixConversion::toCUDA(f2.getPose().cast<float>());
+
+						auto transform = pose2 * pose1;
+
+						// For the corresponding points, combine normals and centroids
+						ftl::cuda::aggregate_sources(
+							f1.getTexture<half4>(Channel::Normals),
+							f2.getTexture<half4>(Channel::Normals),
+							*centroid_vert_[i],
+							*centroid_vert_[j],
+							f1.getTexture<float>(Channel::Depth),
+							//contributions_[i],
+							//contributions_[j],
+							//f1.getTexture<short2>(Channel::Screen),
+							transform,
+							f1.getLeftCamera(),
+							f2.getLeftCamera(),
+							stream
+						);
+
+						//LOG(INFO) << "Correspondences done... " << i;
+					}
+				}
+			}
+
+			// Step 3:
+			// Normalise aggregations and move the points
+			for (size_t i=0; i<in.frames.size(); ++i) {
+				auto &f = in.frames[i];
+				//auto *s = in.sources[i];
+				auto size = f.get<GpuMat>(Channel::Depth).size();
+
+				/*if (do_corr) {
+					ftl::cuda::normalise_aggregations(
+						f.getTexture<float4>(Channel::Normals),
+						centroid_vert_[i],
+						contributions_[i],
+						stream
+					);
+				}*/
+
+				ftl::cuda::mls_adjust_depth(
+					f.getTexture<half4>(Channel::Normals),
+					*centroid_vert_[i],
+					f.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)),
+					f.getLeftCamera(),
+					stream
+				);
+
+				f.swapChannels(Channel::Depth, Channel::Depth2);
+
+				/*if (show_best_source) {
+					ftl::cuda::vis_best_sources(
+						f.getTexture<short2>(Channel::Screen),
+						f.getTexture<uchar4>(Channel::Colour),
+						i,
+						7, stream
+					);
+				}*/
+			}
+		}
+    }
+
+    return true;
+}
diff --git a/components/operators/src/mvmls_cuda.hpp b/components/operators/src/fusion/mvmls_cuda.hpp
similarity index 78%
rename from components/operators/src/mvmls_cuda.hpp
rename to components/operators/src/fusion/mvmls_cuda.hpp
index 49dd0c1db15e45606f1fe791879c05cf75dfe1f1..4ee589e0d1e5af86c50aab2bd83bee374f3b5048 100644
--- a/components/operators/src/mvmls_cuda.hpp
+++ b/components/operators/src/fusion/mvmls_cuda.hpp
@@ -11,13 +11,6 @@ namespace cuda {
 struct MvMLSParams {
     float spatial_smooth;
     float colour_smooth;
-	float fill_match;
-	float fill_threshold;
-	float match_threshold;
-    float cost_ratio;
-    float cost_threshold;
-	float range;
-    uint flags;
 };
 
 void correspondence(
@@ -33,6 +26,17 @@ void correspondence(
         const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int func,
         cudaStream_t stream);
 
+void correspondence(
+        ftl::cuda::TextureObject<float> &d1,
+        ftl::cuda::TextureObject<float> &d2,
+        ftl::cuda::TextureObject<short2> &screen,
+		ftl::cuda::TextureObject<float> &conf,
+		ftl::cuda::TextureObject<uint8_t> &mask,
+        float4x4 &pose,
+        const ftl::rgbd::Camera &cam1,
+        const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int func,
+        cudaStream_t stream);
+
 void zero_confidence(
 		ftl::cuda::TextureObject<float> &conf,
 		ftl::cuda::TextureObject<float> &depth,
@@ -93,6 +97,17 @@ void normalise_aggregations(
     ftl::cuda::TextureObject<float> &contribs,
     cudaStream_t stream);
 
+void show_cor_error(
+	ftl::cuda::TextureObject<uchar4> &colour,
+	ftl::cuda::TextureObject<short2> &screen1,
+	ftl::cuda::TextureObject<short2> &screen2,
+	cudaStream_t stream);
+
+void show_depth_adjustment(
+	ftl::cuda::TextureObject<uchar4> &colour,
+	ftl::cuda::TextureObject<float> &adjust,
+	cudaStream_t stream);
+
 }
 }
 
diff --git a/components/operators/src/mask.cpp b/components/operators/src/mask.cpp
index 302b1b39555401cc4bf83091a44def76d91fa6ab..beec2bd285f496a2d91f2fbbe6fbbbd39ffce090 100644
--- a/components/operators/src/mask.cpp
+++ b/components/operators/src/mask.cpp
@@ -62,12 +62,14 @@ bool CullDiscontinuity::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaS
 
 	uint8_t maskID = config()->value("mask_id", (unsigned int)ftl::cuda::Mask::kMask_Discontinuity);
 	unsigned int radius = config()->value("radius", 2);
+	bool inverted = config()->value("invert", false);
 	
 	out.clearPackets(Channel::Depth);  // Force reset
 	ftl::cuda::cull_mask(
 		in.createTexture<uint8_t>(Channel::Mask),
 		out.createTexture<float>(Channel::Depth),
 		maskID,
+		inverted,
 		radius,
 		stream
 	);
diff --git a/components/operators/src/mask.cu b/components/operators/src/mask.cu
index 4975bcd069f204d3ec1015d8af871e1ad861569f..7c7ce32ad2775433dcd239bf0024ee36185b4d3a 100644
--- a/components/operators/src/mask.cu
+++ b/components/operators/src/mask.cu
@@ -68,7 +68,7 @@ void ftl::cuda::discontinuity(	ftl::cuda::TextureObject<uint8_t> &mask_out, ftl:
 
 // =============================================================================
 
-template <int RADIUS>
+template <int RADIUS, bool INVERT>
 __global__ void cull_mask_kernel(ftl::cuda::TextureObject<uint8_t> mask, ftl::cuda::TextureObject<float> depth, uint8_t id) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -85,24 +85,36 @@ __global__ void cull_mask_kernel(ftl::cuda::TextureObject<uint8_t> mask, ftl::cu
 		}
 		}
 
-		if (isdiscon) {
+		if ((!INVERT && isdiscon) || (INVERT && !isdiscon)) {
 			depth(x,y) = 0.0f;
 		}
 	}
 }
 
-void ftl::cuda::cull_mask(ftl::cuda::TextureObject<uint8_t> &mask, ftl::cuda::TextureObject<float> &depth, uint8_t id, unsigned int radius, cudaStream_t stream) {
+void ftl::cuda::cull_mask(ftl::cuda::TextureObject<uint8_t> &mask, ftl::cuda::TextureObject<float> &depth, uint8_t id, bool invert, unsigned int radius, cudaStream_t stream) {
 	const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	switch (radius) {
-	case 0	: cull_mask_kernel<0><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	case 1	: cull_mask_kernel<1><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	case 2	: cull_mask_kernel<2><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	case 3	: cull_mask_kernel<3><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	case 4	: cull_mask_kernel<4><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	case 5	: cull_mask_kernel<5><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
-	default: break;
+	if (invert) {
+		switch (radius) {
+		case 0	: cull_mask_kernel<0,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 1	: cull_mask_kernel<1,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 2	: cull_mask_kernel<2,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 3	: cull_mask_kernel<3,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 4	: cull_mask_kernel<4,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 5	: cull_mask_kernel<5,true><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		default: break;
+		}
+	} else {
+		switch (radius) {
+		case 0	: cull_mask_kernel<0,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 1	: cull_mask_kernel<1,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 2	: cull_mask_kernel<2,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 3	: cull_mask_kernel<3,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 4	: cull_mask_kernel<4,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		case 5	: cull_mask_kernel<5,false><<<gridSize, blockSize, 0, stream>>>(mask, depth, id); break;
+		default: break;
+		}
 	}
 	cudaSafeCall( cudaGetLastError() );
 
diff --git a/components/operators/src/mvmls.cpp b/components/operators/src/mvmls.cpp
deleted file mode 100644
index 8a74b7a95f92103e55d5bce371ae38886353c1f2..0000000000000000000000000000000000000000
--- a/components/operators/src/mvmls.cpp
+++ /dev/null
@@ -1,346 +0,0 @@
-#include <ftl/operators/mvmls.hpp>
-#include "smoothing_cuda.hpp"
-#include <ftl/utility/matrix_conversion.hpp>
-#include "mvmls_cuda.hpp"
-
-using ftl::operators::MultiViewMLS;
-using ftl::codecs::Channel;
-using cv::cuda::GpuMat;
-using ftl::rgbd::Format;
-
-MultiViewMLS::MultiViewMLS(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) {
-
-}
-
-MultiViewMLS::~MultiViewMLS() {
-
-}
-
-bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) {
-    cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
-
-   // float thresh = config()->value("mls_threshold", 0.4f);
-	float disconPixels = config()->value("discon_pixels", 100.0f);  // Max before definitely not same surface
-	
-	float col_smooth = config()->value("mls_colour_smoothing", 30.0f);
-	int iters = config()->value("mls_iterations", 3);
-	int radius = config()->value("mls_radius",5);
-	//bool aggre = config()->value("aggregation", true);
-    //int win = config()->value("cost_function",1);
-    int win = config()->value("window_size",16);
-    bool do_corr = config()->value("merge_corresponding", true);
-	bool do_aggr = config()->value("merge_mls", false);
-	bool cull_zero = config()->value("cull_no_confidence", false);
-    //bool show_best_source = config()->value("show_pixel_source", false);
-
-    ftl::cuda::MvMLSParams params;
-    params.range = config()->value("search_range", 0.05f);
-    params.fill_match = config()->value("fill_match", 50.0f);
-    params.fill_threshold = config()->value("fill_threshold", 0.0f);
-	params.match_threshold = config()->value("match_threshold", 0.3f);
-    params.colour_smooth = config()->value("colour_smooth", 150.0f);
-    //params.spatial_smooth = config()->value("spatial_smooth", 0.04f);
-    params.cost_ratio = config()->value("cost_ratio", 0.2f);
-	params.cost_threshold = config()->value("cost_threshold", 1.0f);
-
-    if (in.frames.size() < 1) return false;
-    auto size = in.frames[0].get<GpuMat>(Channel::Depth).size();
-
-    // Make sure we have enough buffers
-    while (normals_horiz_.size() < in.frames.size()) {
-        normals_horiz_.push_back(new ftl::cuda::TextureObject<half4>(size.height, size.width));
-        centroid_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width));
-        centroid_vert_.push_back(new ftl::cuda::TextureObject<float4>(size.width, size.height));
-        contributions_.push_back(new ftl::cuda::TextureObject<float>(size.width, size.height));
-    }
-
-    // Make sure all buffers are at correct resolution and are allocated
-    for (size_t i=0; i<in.frames.size(); ++i) {
-        auto &f = in.frames[i];
-	    auto size = f.get<GpuMat>(Channel::Depth).size();
-	    centroid_horiz_[i]->create(size.height, size.width);
-	    normals_horiz_[i]->create(size.height, size.width);
-	    centroid_vert_[i]->create(size.width, size.height);
-        contributions_[i]->create(size.width, size.height);
-
-        if (!f.hasChannel(Channel::Normals)) {
-            throw FTL_Error("Required normals channel missing for MLS");
-        }
-        if (!f.hasChannel(Channel::Support2)) {
-            throw FTL_Error("Required cross support channel missing for MLS");
-        }
-
-        // Create required channels
-        f.create<GpuMat>(Channel::Confidence, Format<float>(size));
-        f.createTexture<float>(Channel::Confidence);
-        f.create<GpuMat>(Channel::Screen, Format<short2>(size));
-        f.createTexture<short2>(Channel::Screen);
-
-        f.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
-    }
-
-    for (int iter=0; iter<iters; ++iter) {
-		// Step 1:
-		// Calculate correspondences and adjust depth values
-		// Step 2:
-        // Find corresponding points and perform aggregation of any correspondences
-        // For each camera combination
-        if (do_corr) {
-            for (size_t i=0; i<in.frames.size(); ++i) {
-                auto &f1 = in.frames[i];
-                //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
-                //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
-
-                Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
-                d1 = f1.getPose() * d1;
-
-                for (size_t j=0; j<in.frames.size(); ++j) {
-                    if (i == j) continue;
-
-                    //LOG(INFO) << "Running phase1";
-
-                    auto &f2 = in.frames[j];
-                    //auto s1 = in.sources[i];
-                    //auto s2 = in.sources[j];
-
-                    // Are cameras facing similar enough direction?
-                    Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
-                    d2 = f2.getPose() * d2;
-                    // No, so skip this combination
-                    if (d1.dot(d2) <= 0.0) continue;
-
-                    auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>().inverse() * f1.getPose().cast<float>());
-
-                    //auto transform = pose2 * pose1;
-
-                    //Calculate screen positions of estimated corresponding points
-                    ftl::cuda::correspondence(
-                        f1.getTexture<float>(Channel::Depth),
-                        f2.getTexture<float>(Channel::Depth),
-                        f1.getTexture<uchar4>(Channel::Colour),
-                        f2.getTexture<uchar4>(Channel::Colour),
-                        // TODO: Add normals and other things...
-                        f1.getTexture<short2>(Channel::Screen),
-                        f1.getTexture<float>(Channel::Confidence),
-                        f1.getTexture<uint8_t>(Channel::Mask),
-                        pose2,
-                        f1.getLeftCamera(),
-                        f2.getLeftCamera(),
-                        params,
-                        win,
-                        stream
-                    );
-
-                    // Also calculate best source for each point
-                    /*ftl::cuda::best_sources(
-                        f1.getTexture<uchar4>(Channel::Support1),
-                        f2.getTexture<uchar4>(Channel::Support1),
-                        f1.getTexture<float>(Channel::Depth),
-                        f2.getTexture<float>(Channel::Depth),
-                        f1.getTexture<short2>(Channel::Screen),
-                        transform,
-                        s1->parameters(),
-                        s2->parameters(),
-                        i, j, stream
-                    );*/
-				}
-			}
-
-            // Reduce window size for next iteration
-            win = max(win>>1, 4);
-		}
-
-        // Find best source for every pixel
-        /*for (size_t i=0; i<in.frames.size(); ++i) {
-            auto &f1 = in.frames[i];
-            //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
-            //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
-
-            Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
-            d1 = in.sources[i]->getPose() * d1;
-
-            for (size_t j=0; j<in.frames.size(); ++j) {
-                if (i == j) continue;
-
-                //LOG(INFO) << "Running phase1";
-
-                auto &f2 = in.frames[j];
-                auto s1 = in.sources[i];
-                auto s2 = in.sources[j];
-
-                // Are cameras facing similar enough direction?
-                Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
-                d2 = in.sources[j]->getPose() * d2;
-                // No, so skip this combination
-                if (d1.dot(d2) <= 0.0) continue;
-
-                auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>());
-                auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse());
-                auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse());
-                auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>());
-
-                auto transform = pose2 * pose1;
-
-                // Also calculate best source for each point
-                ftl::cuda::best_sources(
-                    f1.getTexture<float4>(Channel::Normals),
-                    f2.getTexture<float4>(Channel::Normals),
-                    f1.getTexture<uchar4>(Channel::Support1),
-                    f2.getTexture<uchar4>(Channel::Support1),
-                    f1.getTexture<float>(Channel::Depth),
-                    f2.getTexture<float>(Channel::Depth),
-                    f1.getTexture<short2>(Channel::Screen),
-                    transform,
-                    s1->parameters(),
-                    s2->parameters(),
-                    i, j, stream
-                );
-            }
-        }*/
-
-        // Step 2:
-        // Do the horizontal and vertical MLS aggregations for each source
-        // But don't do the final move step.
-        for (size_t i=0; i<in.frames.size(); ++i) {
-            auto &f = in.frames[i];
-            //auto *s = in.sources[i];
-
-            // Clear data
-            //cv::cuda::GpuMat data(contributions_[i]->height(), contributions_[i]->width(), CV_32F, contributions_[i]->devicePtr(), contributions_[i]->pixelPitch());
-            //data.setTo(cv::Scalar(0.0f), cvstream);
-
-			if (cull_zero && iter == iters-1) {
-				ftl::cuda::zero_confidence(
-					f.getTexture<float>(Channel::Confidence),
-					f.getTexture<float>(Channel::Depth),
-					stream
-				);
-			}
-
-			float thresh = (1.0f / f.getLeft().fx) * disconPixels;
-
-            ftl::cuda::mls_aggr_horiz(
-                f.createTexture<uchar4>(Channel::Support2),
-                f.createTexture<half4>(Channel::Normals),
-                *normals_horiz_[i],
-                f.createTexture<float>(Channel::Depth),
-                *centroid_horiz_[i],
-                f.createTexture<uchar4>(Channel::Colour),
-                thresh,
-                col_smooth,
-                radius,
-                f.getLeftCamera(),
-                stream
-            );
-
-            ftl::cuda::mls_aggr_vert(
-                f.getTexture<uchar4>(Channel::Support2),
-                *normals_horiz_[i],
-                f.getTexture<half4>(Channel::Normals),
-                *centroid_horiz_[i],
-                *centroid_vert_[i],
-                thresh,
-                col_smooth,
-                radius,
-                f.getLeftCamera(),
-                stream
-            );
-        }
-
-        //return true;
-
-
-        // Step 3:
-        // Find corresponding points and perform aggregation of any correspondences
-        // For each camera combination
-        if (do_aggr) {
-            for (size_t i=0; i<in.frames.size(); ++i) {
-                auto &f1 = in.frames[i];
-                //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
-                //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
-
-                Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
-                d1 = f1.getPose() * d1;
-
-                for (size_t j=0; j<in.frames.size(); ++j) {
-                    if (i == j) continue;
-
-                    //LOG(INFO) << "Running phase1";
-
-                    auto &f2 = in.frames[j];
-                    //auto s1 = in.sources[i];
-                    //auto s2 = in.sources[j];
-
-                    // Are cameras facing similar enough direction?
-                    Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0);
-                    d2 = f2.getPose() * d2;
-                    // No, so skip this combination
-                    if (d1.dot(d2) <= 0.0) continue;
-
-                    auto pose1 = MatrixConversion::toCUDA(f1.getPose().cast<float>());
-					//auto pose1_inv = MatrixConversion::toCUDA(f1.getPose().cast<float>().inverse());
-					auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>().inverse());
-					//auto pose2_inv = MatrixConversion::toCUDA(f2.getPose().cast<float>());
-
-					auto transform = pose2 * pose1;
-
-                    // For the corresponding points, combine normals and centroids
-                    ftl::cuda::aggregate_sources(
-                        f1.getTexture<half4>(Channel::Normals),
-                        f2.getTexture<half4>(Channel::Normals),
-                        *centroid_vert_[i],
-                        *centroid_vert_[j],
-						f1.getTexture<float>(Channel::Depth),
-                        //contributions_[i],
-                        //contributions_[j],
-                        //f1.getTexture<short2>(Channel::Screen),
-						transform,
-						f1.getLeftCamera(),
-						f2.getLeftCamera(),
-                        stream
-                    );
-
-                    //LOG(INFO) << "Correspondences done... " << i;
-                }
-            }
-        }
-
-        // Step 3:
-        // Normalise aggregations and move the points
-        for (size_t i=0; i<in.frames.size(); ++i) {
-            auto &f = in.frames[i];
-            //auto *s = in.sources[i];
-            auto size = f.get<GpuMat>(Channel::Depth).size();
-
-            /*if (do_corr) {
-                ftl::cuda::normalise_aggregations(
-                    f.getTexture<float4>(Channel::Normals),
-                    centroid_vert_[i],
-                    contributions_[i],
-                    stream
-                );
-            }*/
-
-            ftl::cuda::mls_adjust_depth(
-                f.getTexture<half4>(Channel::Normals),
-                *centroid_vert_[i],
-                f.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)),
-                f.getLeftCamera(),
-                stream
-            );
-
-            f.swapChannels(Channel::Depth, Channel::Depth2);
-
-            /*if (show_best_source) {
-                ftl::cuda::vis_best_sources(
-                    f.getTexture<short2>(Channel::Screen),
-                    f.getTexture<uchar4>(Channel::Colour),
-                    i,
-                    7, stream
-                );
-            }*/
-        }
-    }
-
-    return true;
-}
diff --git a/components/operators/src/weighting.cpp b/components/operators/src/weighting.cpp
index 520f136caa9d21e6821f8596f1b7e6d57ce3c9f0..928043d6f5d1f58506447f02ccd7cb2be10c54a2 100644
--- a/components/operators/src/weighting.cpp
+++ b/components/operators/src/weighting.cpp
@@ -31,18 +31,32 @@ bool PixelWeights::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream
 	params.colour = config()->value("use_colour", false);
 	params.noise = config()->value("use_noise", true);
 	params.normals = config()->value("use_normals", true);
+	bool output_normals = config()->value("output_normals", params.normals);
 
 	if (!in.hasChannel(Channel::Depth) || !in.hasChannel(Channel::Support1)) return false;
 
-	ftl::cuda::pixel_weighting(
-		out.createTexture<short>(Channel::Weights, ftl::rgbd::Format<short>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
-		out.createTexture<uint8_t>(Channel::Mask, ftl::rgbd::Format<uint8_t>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
-		in.createTexture<uchar4>(Channel::Support1),
-		in.createTexture<float>(Channel::Depth),
-		in.getLeftCamera(),
-		in.get<cv::cuda::GpuMat>(Channel::Depth).size(),
-		params, stream
-	);
+	if (output_normals) {
+		ftl::cuda::pixel_weighting(
+			out.createTexture<short>(Channel::Weights, ftl::rgbd::Format<short>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			out.createTexture<uint8_t>(Channel::Mask, ftl::rgbd::Format<uint8_t>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			out.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			in.createTexture<uchar4>(Channel::Support1),
+			in.createTexture<float>(Channel::Depth),
+			in.getLeftCamera(),
+			in.get<cv::cuda::GpuMat>(Channel::Depth).size(),
+			params, stream
+		);
+	} else {
+		ftl::cuda::pixel_weighting(
+			out.createTexture<short>(Channel::Weights, ftl::rgbd::Format<short>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			out.createTexture<uint8_t>(Channel::Mask, ftl::rgbd::Format<uint8_t>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			in.createTexture<uchar4>(Channel::Support1),
+			in.createTexture<float>(Channel::Depth),
+			in.getLeftCamera(),
+			in.get<cv::cuda::GpuMat>(Channel::Depth).size(),
+			params, stream
+		);
+	}
 
 	return true;
 }
diff --git a/components/operators/src/weighting.cu b/components/operators/src/weighting.cu
index 286207f391a6f90259b813150e4a26286f245ec8..ccb5d7f4e31389edd17a22a139b9391d40b16068 100644
--- a/components/operators/src/weighting.cu
+++ b/components/operators/src/weighting.cu
@@ -14,6 +14,7 @@ __device__ inline float square(float v) { return v*v; }
 __global__ void pixel_weight_kernel(
 		ftl::cuda::TextureObject<short> weight_out,
 		ftl::cuda::TextureObject<uint8_t> mask_out,
+		ftl::cuda::TextureObject<half4> normals_out,
 		ftl::cuda::TextureObject<uchar4> support,
 		ftl::cuda::TextureObject<float> depth, 
 		ftl::rgbd::Camera camera,
@@ -63,6 +64,7 @@ __global__ void pixel_weight_kernel(
 
 					if(l > 0.0f) {
 						n = n / -l;
+						if (normals_out.isValid()) normals_out(x,y) = make_half4(n, 0.0f);
 						float3 ray = camera.screenToCam(x, y, d);
 						ray = ray / length(ray);
 						weight *= min(1.0f, 1.4*dot(ray,n));
@@ -115,7 +117,28 @@ void ftl::cuda::pixel_weighting(
 	const dim3 gridSize((size.width + T_PER_BLOCK - 1)/T_PER_BLOCK, (size.height + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	pixel_weight_kernel<<<gridSize, blockSize, 0, stream>>>(weight_out, mask_out, support, depth, camera, size, params);
+	pixel_weight_kernel<<<gridSize, blockSize, 0, stream>>>(weight_out, mask_out, ftl::cuda::TextureObject<half4>(), support, depth, camera, size, params);
+	cudaSafeCall( cudaGetLastError() );
+
+	#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	#endif
+}
+
+void ftl::cuda::pixel_weighting(
+		ftl::cuda::TextureObject<short> &weight_out,
+		ftl::cuda::TextureObject<ftl::cuda::Mask::type> &mask_out,
+		ftl::cuda::TextureObject<half4> &normals_out,
+		ftl::cuda::TextureObject<uchar4> &support,
+		ftl::cuda::TextureObject<float> &depth,
+		const ftl::rgbd::Camera &camera,
+		const cv::Size size, const ftl::cuda::PixelWeightingParameters &params,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((size.width + T_PER_BLOCK - 1)/T_PER_BLOCK, (size.height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	pixel_weight_kernel<<<gridSize, blockSize, 0, stream>>>(weight_out, mask_out, normals_out, support, depth, camera, size, params);
 	cudaSafeCall( cudaGetLastError() );
 
 	#ifdef _DEBUG
diff --git a/components/operators/src/weighting_cuda.hpp b/components/operators/src/weighting_cuda.hpp
index 7cd544012db66c59c578c808e3abc3351d9c54b1..782aae3df36b214f2962093890c0b8a72b3f1ef9 100644
--- a/components/operators/src/weighting_cuda.hpp
+++ b/components/operators/src/weighting_cuda.hpp
@@ -26,6 +26,16 @@ void pixel_weighting(
 		const cv::Size size, const ftl::cuda::PixelWeightingParameters &params,
 		cudaStream_t stream);
 
+void pixel_weighting(
+		ftl::cuda::TextureObject<short> &weight_out,
+		ftl::cuda::TextureObject<ftl::cuda::Mask::type> &mask_out,
+		ftl::cuda::TextureObject<half4> &normals_out,
+		ftl::cuda::TextureObject<uchar4> &support,
+		ftl::cuda::TextureObject<float> &depth,
+		const ftl::rgbd::Camera &camera,
+		const cv::Size size, const ftl::cuda::PixelWeightingParameters &params,
+		cudaStream_t stream);
+
 void cull_weight(
 		ftl::cuda::TextureObject<short> &weights,
 		ftl::cuda::TextureObject<float> &depth,
diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp
index c346b1809182aa0815e7140a54efbda6397c385b..799b990f26d28acc533f38b0739b30b2260deaca 100644
--- a/components/renderers/cpp/src/CUDARender.cpp
+++ b/components/renderers/cpp/src/CUDARender.cpp
@@ -519,9 +519,7 @@ void CUDARender::_updateParameters(ftl::rgbd::Frame &out) {
 }
 
 void CUDARender::_preprocessColours() {
-	bool show_discon = value("show_discontinuity_mask", false);
-	bool show_noise = value("show_noise_mask", false);
-	bool show_fill = value("show_filled", false);
+	uint8_t show_mask = value("show_mask", 0);
 	bool colour_sources = value("colour_sources", false);
 
 	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream_);
@@ -536,14 +534,8 @@ void CUDARender::_preprocessColours() {
 		}
 
 		if (f.hasChannel(Channel::Mask)) {
-			if (show_noise) {
-				ftl::cuda::show_mask(f.getTexture<uchar4>(Channel::Colour), f.getTexture<uint8_t>(Channel::Mask), Mask::kMask_Noise, make_uchar4(0,255,0,255), stream_);
-			}
-			if (show_discon) {
-				ftl::cuda::show_mask(f.getTexture<uchar4>(Channel::Colour), f.getTexture<uint8_t>(Channel::Mask), Mask::kMask_Discontinuity, make_uchar4(0,0,255,255), stream_);
-			}
-			if (show_fill) {
-				ftl::cuda::show_mask(f.getTexture<uchar4>(Channel::Colour), f.getTexture<uint8_t>(Channel::Mask), Mask::kMask_Filled, make_uchar4(0,255,0,255), stream_);
+			if (show_mask > 0) {
+				ftl::cuda::show_mask(f.getTexture<uchar4>(Channel::Colour), f.getTexture<uint8_t>(Channel::Mask), show_mask, make_uchar4(255,0,255,255), stream_);
 			}
 		}