From e98725a4c2784a165835f1cd559cfaa7dd6939ee Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Thu, 20 Feb 2020 13:29:05 +0200
Subject: [PATCH] Improvements to depth alignment

---
 applications/gui/src/camera.cpp               |   6 +-
 applications/gui/src/src_window.cpp           |   2 +-
 .../common/cpp/include/ftl/cuda_half.hpp      |   5 +
 .../common/cpp/include/ftl/cuda_texture.hpp   |   9 +
 components/common/cpp/include/ftl/traits.hpp  |   1 +
 .../operators/include/ftl/operators/mvmls.hpp |   2 +
 .../operators/src/fusion/correspondence.cu    | 545 +++++++++++++-----
 .../src/fusion/correspondence_common.hpp      |  16 +-
 .../src/fusion/correspondence_depth.cu        |  41 +-
 .../src/fusion/correspondence_util.cu         | 108 +++-
 components/operators/src/fusion/mvmls.cpp     | 155 ++++-
 .../operators/src/fusion/mvmls_cuda.hpp       |  48 ++
 12 files changed, 763 insertions(+), 175 deletions(-)

diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index 961728d88..15e9ce265 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -347,10 +347,14 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) {
 
 			// FIXME: Should perhaps remain locked until after end is called?
 			// Definitely: causes flashing if not.
-			UNIQUE_LOCK(fs->mtx,lk);
+			//UNIQUE_LOCK(fs->mtx,lk);
+			fs->mtx.lock();
 			renderer_->submit(fs, Channels<0>(channel_), transforms_[fs->id]);
 		}
 		renderer_->end();
+		for (auto *fs : fss) {
+			fs->mtx.unlock();
+		}
 	}
 
 	if (!post_pipe_) {
diff --git a/applications/gui/src/src_window.cpp b/applications/gui/src/src_window.cpp
index 1fb77074c..bf3712995 100644
--- a/applications/gui/src/src_window.cpp
+++ b/applications/gui/src/src_window.cpp
@@ -237,8 +237,8 @@ void SourceWindow::_checkFrameSets(int id) {
 		p->append<ftl::operators::CullWeight>("remove_weights")->value("enabled", false);
 		p->append<ftl::operators::DegradeWeight>("degrade");
 		p->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false);
-		p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false);
 		p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity");
+		p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false);
 
 		pre_pipelines_.push_back(p);
 		framesets_.push_back(new ftl::rgbd::FrameSet);
diff --git a/components/common/cpp/include/ftl/cuda_half.hpp b/components/common/cpp/include/ftl/cuda_half.hpp
index f02b2d5ba..d35be793b 100644
--- a/components/common/cpp/include/ftl/cuda_half.hpp
+++ b/components/common/cpp/include/ftl/cuda_half.hpp
@@ -15,6 +15,11 @@ __host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half4>() {
 	return cudaCreateChannelDesc<short4>();
 }
 
+template <>
+__host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half>() {
+	return cudaCreateChannelDesc<short>();
+}
+
 #ifdef __CUDACC__
 
 // half4 functions
diff --git a/components/common/cpp/include/ftl/cuda_texture.hpp b/components/common/cpp/include/ftl/cuda_texture.hpp
index fa6c41866..c795cd4f7 100644
--- a/components/common/cpp/include/ftl/cuda_texture.hpp
+++ b/components/common/cpp/include/ftl/cuda_texture.hpp
@@ -15,6 +15,7 @@ template <> struct Float<uchar4> { typedef float4 type; };
 template <> struct Float<uint8_t> { typedef float type; };
 template <> struct Float<short2> { typedef float2 type; };
 template <> struct Float<short> { typedef float type; };
+template <> struct Float<half> { typedef float type; };
 
 template <typename T>
 struct ScaleValue;
@@ -25,6 +26,7 @@ template <> struct ScaleValue<float> { static constexpr float value = 1.0f; };
 template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; };
 template <> struct ScaleValue<half4> { static constexpr float value = 1.0f; };
 template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; };
+template <> struct ScaleValue<half> { static constexpr float value = 1.0f; };
 
 /**
  * Represent a CUDA texture object. Instances of this class can be used on both
@@ -110,12 +112,19 @@ class TextureObject : public TextureObjectBase {
 
 	#ifdef __CUDACC__
 	__device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); }
+	__device__ inline T tex2D(const int2 &p) const { return ::tex2D<T>(texobj_, p.x, p.y); }
+	__device__ inline T tex2D(const short2 &p) const { return ::tex2D<T>(texobj_, p.x, p.y); }
 	__device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); }
 	__device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; }
+	__device__ inline typename Float<T>::type tex2D(const float2 &p) const { return ::tex2D<typename Float<T>::type>(texobj_, p.x, p.y) * ScaleValue<T>::value; }
 	#endif
 
 	__host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+	__host__ __device__ inline const T &operator()(const int2 &p) const { return reinterpret_cast<T*>(ptr_)[p.x+p.y*pitch2_]; }
+	__host__ __device__ inline const T &operator()(const float2 &p) const { return reinterpret_cast<T*>(ptr_)[int(p.x+0.5f)+int(p.y+0.5f)*pitch2_]; }
 	__host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+	__host__ __device__ inline T &operator()(const int2 &p) { return reinterpret_cast<T*>(ptr_)[p.x+p.y*pitch2_]; }
+	__host__ __device__ inline T &operator()(const float2 &p) { return reinterpret_cast<T*>(ptr_)[int(p.x+0.5f)+int(p.y+0.5f)*pitch2_]; }
 
 	/**
 	 * Cast a base texture object to this type of texture object. If the
diff --git a/components/common/cpp/include/ftl/traits.hpp b/components/common/cpp/include/ftl/traits.hpp
index 8abf07fc0..df44e36c7 100644
--- a/components/common/cpp/include/ftl/traits.hpp
+++ b/components/common/cpp/include/ftl/traits.hpp
@@ -39,6 +39,7 @@ template <> struct OpenCVType<float2> { static constexpr int value = CV_32FC2; }
 template <> struct OpenCVType<float3> { static constexpr int value = CV_32FC3; };
 template <> struct OpenCVType<float4> { static constexpr int value = CV_32FC4; };
 template <> struct OpenCVType<half4> { static constexpr int value = CV_16FC4; };
+template <> struct OpenCVType<half> { static constexpr int value = CV_16F; };
 
 }
 }
diff --git a/components/operators/include/ftl/operators/mvmls.hpp b/components/operators/include/ftl/operators/mvmls.hpp
index 824d88e3d..3e5c476d6 100644
--- a/components/operators/include/ftl/operators/mvmls.hpp
+++ b/components/operators/include/ftl/operators/mvmls.hpp
@@ -20,6 +20,8 @@ class MultiViewMLS : public ftl::operators::Operator {
 	std::vector<ftl::cuda::TextureObject<float4>*> centroid_vert_;
 	std::vector<ftl::cuda::TextureObject<half4>*> normals_horiz_;
     std::vector<ftl::cuda::TextureObject<float>*> contributions_;
+
+	ftl::cuda::TextureObject<half> costs_[2];
 };
 
 }
diff --git a/components/operators/src/fusion/correspondence.cu b/components/operators/src/fusion/correspondence.cu
index 2df21a24d..b19a84af9 100644
--- a/components/operators/src/fusion/correspondence.cu
+++ b/components/operators/src/fusion/correspondence.cu
@@ -48,12 +48,12 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params,
 }*/
 
 __device__ inline float distance(float4 p1, float4 p2) {
-	return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 10.0f);
+	return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 255.0f);
 	//return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f);
 }
 
 __device__ inline int halfWarpCensus(float e) {
-	float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16 : 0, WARP_SIZE);
+	float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16+6 : 0+6, WARP_SIZE);
 	int c = (e > e0) ? 1 << (threadIdx.x % 16) : 0;
 	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
 		const int other = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
@@ -90,172 +90,443 @@ __device__ float dissimilarity(const float4 &l, const float4 &rp, const float4 &
 	return min(min(rpd, rnd), rcd);
 }
 
+/**
+ * One thread from each half warp writes a cost value to the correct cost
+ * memory location.
+ */
+__device__ inline void writeCost(TextureObject<half> &costs, int x, int y, int i, float cost) {
+	if ((threadIdx.x & 0xF) == 0) costs(x>>2, y>>2 + (costs.height()/16)*i) = __float2half(cost);
+}
+
+__device__ inline float readCost(TextureObject<half> &costs, int x, int y, int i) {
+	return __half2float(costs(x>>2, y>>2 + (costs.height()/16)*i));
+}
 
+template <int SIZE>
+__device__ inline void writeCost(float (&costs)[SIZE][WARP_SIZE*2], int i, float cost) {
+	costs[i][threadIdx.x + threadIdx.y*WARP_SIZE] = cost;
+}
+
+template <int SIZE>
+__device__ inline float readCost(const float (&costs)[SIZE][WARP_SIZE*2], int i) {
+	return costs[i][threadIdx.x + threadIdx.y*WARP_SIZE];
+}
+
+__device__ inline void writeCost(float (&costs)[WARP_SIZE*2], float cost) {
+	costs[threadIdx.x + threadIdx.y*WARP_SIZE] = cost;
+}
 
-template<int COR_STEPS, int FUNCTION> 
-__global__ void corresponding_point_kernel(
+__device__ inline float readCost(const float (&costs)[WARP_SIZE*2]) {
+	return costs[threadIdx.x + threadIdx.y*WARP_SIZE];
+}
+
+template<int COR_STEPS> 
+__global__ void corresponding_colour_epi_kernel(
+		TextureObject<uchar4> colour1,
+		TextureObject<uchar4> colour2,
         TextureObject<float> d1,
         TextureObject<float> d2,
-        TextureObject<uchar4> c1,
-        TextureObject<uchar4> c2,
         TextureObject<short2> screenOut,
 		TextureObject<float> conf,
-		TextureObject<uint8_t> mask,
+		//TextureObject<uint8_t> mask,
+		//TextureObject<half> costs,
         float4x4 pose,
         Camera cam1,
         Camera cam2, ftl::cuda::MvMLSParams params) {
+
+	__shared__ float match_costs[COR_STEPS][WARP_SIZE*2];
+	__shared__ float aggr_costs_f[COR_STEPS][WARP_SIZE*2];
+	__shared__ float aggr_costs_b[COR_STEPS][WARP_SIZE*2];
+	__shared__ float min_costs[WARP_SIZE*2];
+	
+	const int2 pt = block4x4<2>();
+
+	// FIXME: Don't return like this due to warp operations
+    if (pt.x >= 0 && pt.y >=0 && pt.x < colour1.width() && pt.y < colour1.height()) {
+		screenOut(pt) = make_short2(-1,-1);
+		conf(pt) = PINF;
+    
+		const float depth1 = d1.tex2D(pt);
+		// TODO: If all depths within a half warp here are bad then can return
+		// early.
+		//if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return;
+
+        short2 bestScreen = make_short2(-1,-1);
+		float bestcost = PINF;
+		float bestcost2 = PINF;
+		
+		const float3 camPosOrigin = pose * cam1.screenToCam(pt.x,pt.y,depth1);
+        const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
+        const float3 camPosDistant = pose * cam1.screenToCam(pt.x,pt.y,depth1 + 0.4f);
+        const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
+		const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
+		const float depthM = 0.4f / (lineDistant.x - lineOrigin.x);
+		//const float sub_pixel = depthM / (params.sub_pixel * depth1);
+		const float sub_pixel = params.sub_pixel; //depthM / (((depth1*depth1) / (cam1.fx * cam1.baseline)) / float(COR_STEPS/2));
+        float2 linePos;
+        linePos.x = lineOrigin.x - float((COR_STEPS/2)) * sub_pixel;
+        linePos.y = lineOrigin.y - (float((COR_STEPS/2)) * lineM * sub_pixel);
+
+		// generate census for camera1
+		/*float4 c1_px_cy = colour1.tex2D(float(x)-0.5f,float(y));
+		float4 c1_nx_cy = colour1.tex2D(float(x)+0.5f,float(y));
+		float4 c1_cx_py = colour1.tex2D(float(x),float(y)-0.5f);
+		float4 c1_cx_ny = colour1.tex2D(float(x),float(y)+0.5f);*/
+		float4 c1 = colour1.tex2D(float(pt.x)+0.5f,float(pt.y)+0.5f);
+
+		int bestStep = COR_STEPS/2;
+
+		float avgcost = 0.0f;
+		
+		//writeCost(min_costs, PINF);
+
+		// Generate matching costs
+		#pragma unroll
+        for (int i=0; i<COR_STEPS; ++i) {	
+			//writeCost(aggr_costs_f, i, 0.0f);
+			//writeCost(aggr_costs_b, i, 0.0f);
+
+			//    generate census at that location+0.5f
+			float4 c2 = colour2.tex2D(linePos);
+			/*float d = min(
+				distance(c1_px_cy-c1_nx_cy, c2-c1_nx_cy),
+				distance(c1_cx_py-c1_cx_ny, c2-c1_cx_ny)
+			);*/
+			float d = distance(c1,c2);
+			//d *= d;
+			//d = (ftl::cuda::halfWarpSum(d*d) / 16.0f);
+
+			if (linePos.x < 0 || linePos.x >= cam2.width || linePos.y < 0 || linePos.y >= cam2.height) d = PINF;
+			//if (ftl::cuda::halfWarpMax(d) == PINF) d = PINF;
+
+			writeCost(match_costs, i, d);
+			bestcost = min(bestcost, d);
+			avgcost += d;
+
+			// Move to next correspondence check point
+            linePos.x += sub_pixel*1.0f;
+            linePos.y += sub_pixel*lineM;
+		}
+
+		writeCost(min_costs, bestcost);
+		bestcost = PINF;
+		//avgcost = avgcost / float(COR_STEPS);
+
+		__syncwarp();
+
+		float confidence = 0.0f;
+
+		// Forward aggregate matching costs
+        for (int i=0; i<COR_STEPS; ++i) {
+			float cost = readCost(match_costs,i);
+			float dcost = cost - ((avgcost-cost)/float(COR_STEPS-1));
+			float variance = (dcost < 0.0f) ? square(dcost) : 1.0f;
+			cost /= sqrt(variance);
+			cost = (ftl::cuda::halfWarpSum(cost*cost) / 16.0f);
+
+			cost = ftl::cuda::halfWarpSum(variance) / 16.0f;
+
+			//float cost2 = (i>0) ? readCost(match_costs,i-1) + params.P1 : PINF;
+			//float cost3 = (i<COR_STEPS-1) ? readCost(match_costs,i+1) + params.P1 : PINF;
+			//float cost4 = readCost(min_costs) + params.P2;
+
+			// Instead of the above...
+			// Add the min costs of neighbours +1, 0 and -1 steps
+
+			//float mincost = min(cost,min(cost2,cost3));
+
+			//cost += (ftl::cuda::halfWarpSum(mincost) - mincost) / 15.0f; // - readCost(min_costs); // / 15.0f;
+			//cost /= 2.0f;
+			//writeCost(aggr_costs_f, i, cost);
+			//writeCost(min_costs, min(readCost(min_costs),cost));
+
+			bestStep = (cost < bestcost) ? i : bestStep;
+			bestcost = min(cost, bestcost);
+			//avgcost += cost;
+		}
+
+		confidence = sqrt(confidence / float(COR_STEPS));
+		//confidence = max(0.0f, (fabsf(avgcost-bestcost) / confidence) - 1.0f); // / (3.0f * confidence);
+		bestcost = sqrt(bestcost);
+
+		// Backward aggregate costs
+		//for (int i=COR_STEPS-1; i>=0; --i) {
+			/*float cost = ftl::cuda::halfWarpSum(readCost(match_costs,i));
+			if (i<COR_STEPS-1) cost += ftl::cuda::halfWarpSum(readCost(aggr_costs_b,i+1)+15.0f*10.0f) - readCost(aggr_costs_b,i+1);
+			if (i<COR_STEPS-2) cost += ftl::cuda::halfWarpSum(readCost(aggr_costs_b,i+2)+15.0f*30.0f) - readCost(aggr_costs_b,i+2);
+			float N = (i<COR_STEPS-2) ? 46.0f : (i<COR_STEPS-1) ? 31.0f : 16.0f;
+			writeCost(aggr_costs_b, i, cost / N);*/
+		//}
+
+		//__syncwarp();
+
+		// Find best cost and use that
+		/*for (int i=0; i<COR_STEPS; ++i) {
+			float cost = readCost(aggr_costs_f,i) + readCost(aggr_costs_b,i);
+			bestStep = (cost < bestcost) ? i : bestStep;
+			bestcost = min(cost, bestcost);
+		}*/
+
+		float bestadjust = float(bestStep-(COR_STEPS/2))*depthM*sub_pixel;
+		bestScreen = make_short2(lineOrigin.x + float(bestStep-(COR_STEPS/2))*sub_pixel + 0.5f,
+			lineOrigin.y + float(bestStep-(COR_STEPS/2))*lineM*sub_pixel + 0.5f);
+
+        // Detect matches to boundaries, and discard those
+        //uint stepMask = 1 << bestStep;
+		//if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestcost = PINF;
+
+		// TODO: Validate depth?
+
+		// Check that no boundaries were matched in 4x4 block
+		//bestcost = ftl::cuda::halfWarpMax(bestcost);
+
+		//float confidence = (avgcost == PINF) ? 1.0f : (bestcost / (avgcost/float(COR_STEPS)));
+		
+		// If a match was found then record it
+		if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth) {
+			// Delay making the depth change until later.
+
+			//if (y == 200) printf("Cost: %d\n", bestScreen.x);
+
+			/*if (x % 2 == 0 && y % 2 == 0) {
+				const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+				if (fabs(d2.tex2D(bestScreen.x, bestScreen.y) - (camPosOrigin.z + float(bestStep-(COR_STEPS/2))*depthM2*sub_pixel)) < 0.1f*camPosOrigin.z) {
+					float4 c2 = colour2.tex2D(float(bestScreen.x)+0.5f, float(bestScreen.y)+0.5f);
+					colour1(x,y) = make_uchar4(c2.x, c2.y, c2.z, 0.0f);
+				}
+			}*/
+			
+			const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+			if (fabsf(d2.tex2D(bestScreen) - (camPosOrigin.z + float(bestStep-(COR_STEPS/2))*depthM2*sub_pixel)) < 0.1f*camPosOrigin.z) {
+				if (bestcost < PINF && bestStep != 0 && bestStep != COR_STEPS-1) {
+					conf(pt) = bestcost;
+					//colour1(x,y) = make_uchar4(bestc2.x, bestc2.y, bestc2.z, 0.0f);
+					//mask(x,y) = mask(x,y) | Mask::kMask_Correspondence;
+					screenOut(pt) = bestScreen;
+				}
+			}
+		}
+
+		// Doesn't work
+		//if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost > 0.75f) {
+		//	mask(x,y) = mask(x,y) | Mask::kMask_Bad;
+		//}
+    }
+}
+
+template <int COR_STEPS>
+__global__ void aggregate_colour_costs_kernel(
+		TextureObject<float> d1,
+		TextureObject<float> d2,
+		TextureObject<half> costs1,
+		TextureObject<half> costs2,
+		TextureObject<short2> screenOut,
+		TextureObject<float> conf,
+		float4x4 pose,
+		Camera cam1,
+		Camera cam2,
+		ftl::cuda::MvMLSParams params
+	) {
 	
 	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
 	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
 	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
 
-
-    if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
-        screenOut(x,y) = make_short2(-1,-1);
+	// FIXME: Don't return like this due to warp operations
+    if (x >= 0 && y >=0 && x < d1.width() && y < d1.height()) {
+		screenOut(x,y) = make_short2(-1,-1);
+		conf(x,y) = 0.0f;
     
-        //const float3 world1 = make_float3(p1.tex2D(x, y));
-        const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z;  // Initial starting depth
-        if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return;
+		const float depth1 = d1.tex2D(x,y);
+		// TODO: If all depths within a half warp here are bad then can return
+		// early.
+		if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return;
 
-        // TODO: Temporary hack to ensure depth1 is present
-        //const float4 temp = vout.tex2D(x,y);
-        //vout(x,y) =  make_float4(depth1, 0.0f, temp.z, temp.w);
-        
-        //const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
-
-        const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f);
-
-        //float bestdepth = 0.0f;
         short2 bestScreen = make_short2(-1,-1);
-		//float bestdepth = 0.0f;
-		//float bestdepth2 = 0.0f;
-        float bestweight = 0.0f;
-        float bestcolour = 0.0f;
-        //float bestdweight = 0.0f;
-        float totalcolour = 0.0f;
-        //int count = 0;
-        //float contrib = 0.0f;
+		float bestcost = PINF;
+		float bestcost2 = PINF;
 		
 		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
         const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
-        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 10.0f);
+        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 0.4f);
         const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
-        const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
-		const float depthM = 10.0f / (lineDistant.x - lineOrigin.x);
+		const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
+		const float depthM = 0.4f / (lineDistant.x - lineOrigin.x);
 		const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+		const float sub_pixel = depthM / (((depth1*depth1) / (cam1.fx * cam1.baseline)) / float(COR_STEPS/2));
+		float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2 * sub_pixel);
         float2 linePos;
-        linePos.x = lineOrigin.x - ((COR_STEPS/2));
-        linePos.y = lineOrigin.y - (((COR_STEPS/2)) * lineM);
-		//float depthPos = depth1 - (float((COR_STEPS/2)) * depthM);
-		float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2);
-		//const float depthCoef = cam1.baseline*cam1.fx;
-
-		const float depthCoef = (1.0f / cam1.fx) * 10.0f;
-        
+        linePos.x = lineOrigin.x - float((COR_STEPS/2)) * sub_pixel;
+        linePos.y = lineOrigin.y - (float((COR_STEPS/2)) * lineM * sub_pixel);
+
         uint badMask = 0;
         int bestStep = COR_STEPS/2;
+		//float bestSource = 0.0f;
 
+		// Scan from -COR_STEPS/2 to +COR_STEPS/2, where COR_STEPS is the number
+		// of pixels to check in the second camera for the current first camera
+		// pixel.
 
-        // Project to p2 using cam2
-        // Each thread takes a possible correspondence and calculates a weighting
-        //const int lane = tid % WARP_SIZE;
+		#pragma unroll
         for (int i=0; i<COR_STEPS; ++i) {			
-			//float weight = 1.0f; //(linePos.x >= cam2.width || linePos.y >= cam2.height) ? 0.0f : 1.0f;
-
-			// Generate a colour correspondence value
-            const auto colour2 = c2.tex2D(linePos.x, linePos.y);
-
-            // TODO: Check if other colour dissimilarities are better...
-            const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
-
-            // Generate a depth correspondence value
-            const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
-            
-            // Record which correspondences are invalid
-            badMask |= (
-					depth2 <= cam2.minDepth ||
-					depth2 >= cam2.maxDepth ||
-					linePos.x < 0.5f ||
-					linePos.y < 0.5f ||
-					linePos.x >= d2.width()-0.5f ||
-					linePos.y >= d2.height()-0.5f
-				) ? 1 << i : 0;
-			
-			//if (FUNCTION == 1) {
-			// TODO: Spatial smooth must be disparity discon threshold of largest depth in original camera space.
-			// ie. if depth1 > depth2 then depth1 has the largest error potential and the resolution at that
-			// depth is used to determine the spatial smoothing amount.
-	
-			const float maxdepth = max(depth1, depth2);
-			const float smooth = depthCoef * maxdepth;
-			float weight = ftl::cuda::weighting(fabs(depth2 - depthPos2), cweight*smooth);
-			//weight = ftl::cuda::halfWarpSum(weight);
-			//} else {
-			//	const float dweight = ftl::cuda::weighting(fabs(depth2 - depthPos2), params.spatial_smooth);
-            //	weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-			//}
-            //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range);
-
-            //weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-
-            //++count;
-
-            bestcolour = max(cweight, bestcolour);
-            totalcolour += cweight;
-
-            //bestdepth = (weight > bestweight) ? depthPos : bestdepth;
-            bestStep = (weight > bestweight) ? i : bestStep;
-			
-			bestweight = max(bestweight, weight);
+			//    generate census at that location+0.5f
+			float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
+			const float sub_pixel2 = (((depth2*depth2) / (cam2.fx * cam2.baseline)) / float(COR_STEPS/2));
+			int index = int((depthPos2-depth2) / sub_pixel2) + COR_STEPS/2;
+
+			float c1 = readCost(costs1, x, y, i);
+			float c2 = (depthPos2 > cam2.minDepth && depthPos2 < cam2.maxDepth && index >= 0 && index < COR_STEPS && linePos.x >= 0.0f && linePos.y >= 0.0f && linePos.x < d2.width()-1 && linePos.y < d2.height()-1) ? readCost(costs2, int(linePos.x+0.5f), int(linePos.y+0.5f), index) : PINF;
+			//writeCost(costs1, x, y, i, c1+c2);
+
+			//if (index >= 0 && index < COR_STEPS && y == 200) printf("Index = %f\n", sub_pixel);
+			float cost = c1+c2;
+
+			// Record the best result
+            bestStep = (cost < bestcost) ? i : bestStep;
+			bestScreen = (cost < bestcost) ? make_short2(linePos.x+0.5f,linePos.y+0.5f) : bestScreen;
+			//bestc2 = (cost < bestcost) ? c2 : bestc2;
+			bestcost = min(bestcost, cost);
                 
 			
-			//depthPos += depthM;
-			depthPos2 += depthM2;
-            linePos.x += 1.0f;
-            linePos.y += lineM;
+			// Move to next correspondence check point
+			depthPos2 += sub_pixel*depthM2;
+            linePos.x += sub_pixel*1.0f;
+            linePos.y += sub_pixel*lineM;
         }
 
-        //const float avgcolour = totalcolour/(float)count;
-        const float confidence = ((bestcolour / totalcolour) - (1.0f / 16.0f)) * (1.0f + (1.0f/16.0f));
-        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM*sub_pixel;
 
         // Detect matches to boundaries, and discard those
-        uint stepMask = 1 << bestStep;
-		if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestweight = 0.0f;
+        //uint stepMask = 1 << bestStep;
+		//if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestcost = PINF;
+
+		// TODO: Validate depth?
+
+		// Check that no boundaries were matched in 4x4 block
+		//bestcost = ftl::cuda::halfWarpMax(bestcost);
+
+		//float confidence = 1.0f - (bestcost2 / bestcost);
 		
-		//bestadjust = halfWarpBest(bestadjust, (bestweight > 0.0f) ? confidence : 0.0f);
-
-        //Mask m(mask.tex2D(x,y));
-
-        //if (bestweight > 0.0f) {
-            float old = conf.tex2D(x,y);
-
-            if (bestweight > 0.0f) {
-				d1(x,y) = (0.4f*bestadjust) + depth1;
-				//d2(bestScreen.x, bestScreen.y) = bestdepth2;
-                //screenOut(x,y) = bestScreen;
-				conf(x,y) = max(old,confidence); //bestweight * confidence;
-				//conf(x,y) = max(old,fabs(bestadjust));
-            }
-        //}
-        
-        // If a good enough match is found, mark dodgy depth as solid
-        //if ((m.isFilled() || m.isDiscontinuity()) && (bestweight > params.match_threshold)) mask(x,y) = 0;
+		// If a match was found then record it
+		if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < PINF) {
+			// Delay making the depth change until later.
+			
+			conf(x,y) = bestadjust;
+			//colour1(x,y) = make_uchar4(bestc2.x, bestc2.y, bestc2.z, 0.0f);
+			//mask(x,y) = mask(x,y) | Mask::kMask_Correspondence;
+			screenOut(x,y) = bestScreen;
+		}
+
+		// Doesn't work
+		//if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost > 0.75f) {
+		//	mask(x,y) = mask(x,y) | Mask::kMask_Bad;
+		//}
     }
 }
 
+/**
+ * Non-epipolar census search.
+ */
+template <int RADIUS>
+__global__ void corresponding_colour_kernel(
+		TextureObject<uchar4> colour1,
+		TextureObject<uchar4> colour2,
+		TextureObject<short2> screen,
+		float4x4 pose,
+		Camera cam1,
+		Camera cam2, ftl::cuda::MvMLSParams params) {
+
+	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16));
+	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
+
+	if (x >= 0 && y >=0 && x < screen.width() && y < screen.height()) {
+		// generate census for camera1
+		float4 c = colour1.tex2D(float(x+0.5f),float(y+0.5f));
+		int3 cen1;
+		cen1.x = halfWarpCensus(c.x);
+		cen1.y = halfWarpCensus(c.y);
+		cen1.z = halfWarpCensus(c.z);
+
+		short2 corPos = screen.tex2D(x,y);
+		if (ftl::cuda::halfWarpMin(int(corPos.x)) < 0) return;
+
+		short2 bestScreen = corPos;
+		float bestcost = PINF;
+
+		// For a perturbation around current correspondence..
+		for (int v=-RADIUS; v<=RADIUS; ++v) {
+		for (int u=-RADIUS; u<=RADIUS; ++u) {
+			//    generate census at that location
+			float4 c = colour2.tex2D(float(corPos.x+u), float(corPos.y+v));
+			int3 cen2;
+			cen2.x = halfWarpCensus(c.x);
+			cen2.y = halfWarpCensus(c.y);
+			cen2.z = halfWarpCensus(c.z);
+
+			//    count bit differences
+			int count = __popc(cen1.x ^ cen2.x) + __popc(cen1.y ^ cen2.y) + __popc(cen1.z ^ cen2.z);
+
+			//    generate cost from bit diffs and amount of movement
+			float cost = float(count) / 48.0f;
+			cost += 1.0f - min(1.0f, sqrt(float(v*v) + float(u*u)) / float(3*RADIUS));
+			bestScreen = (cost < bestcost) ? make_short2(corPos.x+u, corPos.y+v) : bestScreen;
+			bestcost = min(cost, bestcost);
+		}
+		}
+
+		// Adjust correspondence to min cost location
+		screen(x,y) = bestScreen;
+	}
+}
+
+
 void ftl::cuda::correspondence(
-        TextureObject<float> &d1,
+		TextureObject<uchar4> &c1,
+		TextureObject<uchar4> &c2,
+		TextureObject<float> &d1,
         TextureObject<float> &d2,
-        TextureObject<uchar4> &c1,
-        TextureObject<uchar4> &c2,
-        TextureObject<short2> &screen,
+		TextureObject<short2> &screen,
 		TextureObject<float> &conf,
-		TextureObject<uint8_t> &mask,
-        float4x4 &pose2,
-        const Camera &cam1,
-        const Camera &cam2, const MvMLSParams &params, int func,
-        cudaStream_t stream) {
+		//TextureObject<uint8_t> &mask,
+		//TextureObject<half> &costs,
+		float4x4 &pose2,
+		const Camera &cam1,
+		const Camera &cam2, const MvMLSParams &params,
+		int radius,
+		cudaStream_t stream) {
+
+	//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	const dim3 gridSize((c1.width() + 8 - 1)/8, (c1.height() + 8 - 1)/8);
+	const dim3 blockSize(WARP_SIZE, 2);
+
+
+	switch (radius) {
+	case 32: corresponding_colour_epi_kernel<32><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	case 16: corresponding_colour_epi_kernel<16><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	case 8: corresponding_colour_epi_kernel<8><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	case 4: corresponding_colour_epi_kernel<4><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	case 3: corresponding_colour_epi_kernel<3><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	case 2: corresponding_colour_epi_kernel<2><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break;
+	//case 1: corresponding_colour_epi_kernel<1><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, costs, pose2, cam1, cam2, params); break;
+	}
+
+	cudaSafeCall( cudaGetLastError() );
+}
+
+void ftl::cuda::aggregate_colour_costs(
+		TextureObject<float> &d1,
+		TextureObject<float> &d2,
+		TextureObject<half> &costs1,
+		TextureObject<half> &costs2,
+		TextureObject<short2> &screen,
+		TextureObject<float> &conf,
+		float4x4 &pose2,
+		const Camera &cam1,
+		const Camera &cam2, const MvMLSParams &params,
+		int radius,
+		cudaStream_t stream) {
 
 	//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
@@ -263,14 +534,16 @@ void ftl::cuda::correspondence(
 	const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(WARP_SIZE, 2);
 
-    
-    switch (func) {
-    case 32: corresponding_point_kernel<32,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 16: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 8: corresponding_point_kernel<8,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 4: corresponding_point_kernel<4,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    case 2: corresponding_point_kernel<2,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
-    }
 
-    cudaSafeCall( cudaGetLastError() );
+	switch (radius) {
+	case 32: aggregate_colour_costs_kernel<32><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	case 16: aggregate_colour_costs_kernel<16><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	case 8: aggregate_colour_costs_kernel<8><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	case 4: aggregate_colour_costs_kernel<4><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	case 3: aggregate_colour_costs_kernel<3><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	case 2: aggregate_colour_costs_kernel<2><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	//case 1: aggregate_colour_costs_kernel<1><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break;
+	}
+
+	cudaSafeCall( cudaGetLastError() );
 }
diff --git a/components/operators/src/fusion/correspondence_common.hpp b/components/operators/src/fusion/correspondence_common.hpp
index 6b9405f47..9f19ef711 100644
--- a/components/operators/src/fusion/correspondence_common.hpp
+++ b/components/operators/src/fusion/correspondence_common.hpp
@@ -14,10 +14,22 @@ __device__ inline bool isBadCor(float depth, const float2 &pos, const ftl::rgbd:
 		depth >= cam.maxDepth ||
 		pos.x < 0.5f ||
 		pos.y < 0.5f ||
-		pos.x >= cam.width-0.5f ||
-		pos.y >= cam.height-0.5f;
+		pos.x >= float(cam.width)-0.5f ||
+		pos.y >= float(cam.height)-0.5f;
 }
 
 __device__ inline float square(float v) { return v*v; }
 
+__device__ inline float length2(const float4 v) {
+	return v.x*v.x + v.y*v.y + v.z*v.z;
+}
+
+template <int BLOCKS>
+__device__ inline int2 block4x4() {
+	return make_int2(
+		(blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)),
+		blockIdx.y*4*BLOCKS + (threadIdx.x%16)/4 + 4*threadIdx.y
+	);
+}
+
 #endif
diff --git a/components/operators/src/fusion/correspondence_depth.cu b/components/operators/src/fusion/correspondence_depth.cu
index 533152617..9f83fa79f 100644
--- a/components/operators/src/fusion/correspondence_depth.cu
+++ b/components/operators/src/fusion/correspondence_depth.cu
@@ -23,17 +23,16 @@ __global__ void corresponding_depth_kernel(
         float4x4 pose,
         Camera cam1,
         Camera cam2, ftl::cuda::MvMLSParams params) {
-	
-	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
-	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
-	const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y;
+
+	const int2 pt = block4x4<2>();
+
 
 	// FIXME: Don't return like this due to warp operations
-    if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
-		screenOut(x,y) = make_short2(-1,-1);
-		conf(x,y) = 0.0f;
+    if (pt.x >= 0 && pt.y >=0 && pt.x < screenOut.width() && pt.y < screenOut.height()) {
+		screenOut(pt) = make_short2(-1,-1);
+		conf(pt) = 0.0f;
     
-		const float depth1 = d1.tex2D(x,y);
+		const float depth1 = d1.tex2D(pt);
 		// TODO: If all depths within a half warp here are bad then can return
 		// early.
 		//if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return;
@@ -41,9 +40,9 @@ __global__ void corresponding_depth_kernel(
         short2 bestScreen = make_short2(-1,-1);
         float bestcost = PINF;
 		
-		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
+		const float3 camPosOrigin = pose * cam1.screenToCam(pt.x,pt.y,depth1);
         const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
-        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 0.4f);
+        const float3 camPosDistant = pose * cam1.screenToCam(pt.x,pt.y,depth1 + 0.4f);
         const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
         const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
 		const float depthM = 0.4f / (lineDistant.x - lineOrigin.x);
@@ -86,7 +85,8 @@ __global__ void corresponding_depth_kernel(
 			// the greater they are from the warp concensus best cost
 
 			// Sum all squared distance errors in a 4x4 pixel neighborhood
-			cost = ftl::cuda::halfWarpSum(cost);
+			float wcost = ftl::cuda::halfWarpSum(cost) / 16.0f;
+			cost = (wcost < 1.0f) ? (wcost + cost) / 2.0f : 1.0f;  // Enables per pixel adjustments
 
 			// Record the best result
             bestStep = (cost < bestcost) ? i : bestStep;
@@ -100,7 +100,11 @@ __global__ void corresponding_depth_kernel(
             linePos.y += lineM;
         }
 
-        float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+		float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+		
+		//if (bestStep == 0 ||bestStep == COR_STEPS-1) {
+		//	printf("Bad step: %f\n", bestcost);
+		//}
 
         // Detect matches to boundaries, and discard those
         uint stepMask = 1 << bestStep;
@@ -110,11 +114,11 @@ __global__ void corresponding_depth_kernel(
 		bestcost = ftl::cuda::halfWarpMax(bestcost);
 		
 		// If a match was found then record it
-		if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < PINF) {
+		if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < 1.0f) {
 			// Delay making the depth change until later.
-			conf(x,y) = bestadjust;
-			mask(x,y) = mask(x,y) | Mask::kMask_Correspondence;
-			screenOut(x,y) = bestScreen;
+			conf(pt) = bestadjust;
+			mask(pt) = mask(pt) | Mask::kMask_Correspondence;
+			screenOut(pt) = bestScreen;
 		}
     }
 }
@@ -131,7 +135,10 @@ void ftl::cuda::correspondence(
 		cudaStream_t stream) {
 
 	// FIXME: Check the grid size makes sense
-	const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	//const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	//const dim3 blockSize(WARP_SIZE, 2);
+
+	const dim3 gridSize((d1.width() + 8 - 1)/8, (d1.height() + 8 - 1)/8);
 	const dim3 blockSize(WARP_SIZE, 2);
 
 	switch (func) {
diff --git a/components/operators/src/fusion/correspondence_util.cu b/components/operators/src/fusion/correspondence_util.cu
index ca1b8a82a..de91eabe6 100644
--- a/components/operators/src/fusion/correspondence_util.cu
+++ b/components/operators/src/fusion/correspondence_util.cu
@@ -44,7 +44,8 @@ void ftl::cuda::zero_confidence(TextureObject<float> &conf, TextureObject<float>
 __global__ void show_cor_error_kernel(
 		TextureObject<uchar4> colour,
 		TextureObject<short2> screen1,
-		TextureObject<short2> screen2) {
+		TextureObject<short2> screen2,
+		float thresh) {
 
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -56,18 +57,51 @@ __global__ void show_cor_error_kernel(
 			short2 s2 = screen2.tex2D(s1.x, s1.y);
 
 			float l = sqrt(square(s2.x-x) + square(s2.y-y));
-			float nl = min(1.0f, l/5.0f);
+			float nl = min(1.0f, l/thresh);
 			//colour(x,y) = (l < 1.0f) ? make_uchar4(0,255,0,0) : (s2.x < 0) ? make_uchar4(255.0f, 0.0f, 0.0f, 0.0f) : make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f);
 			if (nl < 1.0f && s2.x >= 0) colour(x,y) = make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f);
 		}
 	}
 }
 
-void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<short2> &screen1, TextureObject<short2> &screen2, cudaStream_t stream) {
+void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<short2> &screen1, TextureObject<short2> &screen2, float thresh, cudaStream_t stream) {
 	const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	show_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen1, screen2);
+	show_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen1, screen2, thresh);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+// ==== Remove correspondence errors ===========================================
+
+__global__ void remove_cor_error_kernel(
+		TextureObject<float> adjust,
+		TextureObject<short2> screen1,
+		TextureObject<short2> screen2, float thresh) {
+
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < adjust.width() && y < adjust.height()) {
+		short2 s1 = screen1.tex2D(x,y);
+
+		if (s1.x >= 0 && s1.x < screen2.width() && s1.y < screen2.height()) {
+			short2 s2 = screen2.tex2D(s1.x, s1.y);
+
+			float l = sqrt(square(s2.x-x) + square(s2.y-y));
+			if (l >= thresh || s2.x < 0) {
+				adjust(x,y) = PINF;
+				screen1(x,y) = make_short2(-1,-1);
+			}
+		}
+	}
+}
+
+void ftl::cuda::remove_cor_error(TextureObject<float> &adjust, TextureObject<short2> &screen1, TextureObject<short2> &screen2, float thresh, cudaStream_t stream) {
+	const dim3 gridSize((adjust.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (adjust.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	remove_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(adjust, screen1, screen2, thresh);
 	cudaSafeCall( cudaGetLastError() );
 }
 
@@ -76,25 +110,79 @@ void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<shor
 
 __global__ void show_depth_adjust_kernel(
 		TextureObject<uchar4> colour,
-		TextureObject<float> adjust) {
+		TextureObject<short2> screen,
+		TextureObject<float> adjust, float scale) {
 
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
 	if (x < colour.width() && y < colour.height()) {
 		float a = adjust.tex2D(x,y);
+		short2 s = screen.tex2D(x,y);
 
-		if (a != 0.0f) {
-			float nc = min(1.0f, fabsf(a)/0.04f);
-			colour(x,y) = make_uchar4(0.0f, (1.0f-nc)*255.0f, nc*255.0f, 0.0f);
+		if (s.x >= 0) {
+			float ncG = min(1.0f, fabsf(a)/scale);
+			float ncB = -max(-1.0f, min(0.0f, a/scale));
+			float ncR = max(0.0f, min(1.0f, a/scale));
+			colour(x,y) = make_uchar4(ncB*255.0f, (1.0f-ncG)*255.0f, ncR*255.0f, 0.0f);
 		}
 	}
 }
 
-void ftl::cuda::show_depth_adjustment(TextureObject<uchar4> &colour, TextureObject<float> &adjust, cudaStream_t stream) {
+void ftl::cuda::show_depth_adjustment(TextureObject<uchar4> &colour, TextureObject<short2> &screen, TextureObject<float> &adjust, float scale, cudaStream_t stream) {
 	const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	show_depth_adjust_kernel<<<gridSize, blockSize, 0, stream>>>(colour, adjust);
+	show_depth_adjust_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen, adjust, scale);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+// ==== Vis reprojection =======================================================
+
+__global__ void viz_reprojection_kernel(
+		TextureObject<uchar4> colour_out,
+		TextureObject<uchar4> colour_in,
+		TextureObject<float> depth_out,
+		TextureObject<float> depth_in,
+		float4x4 pose,
+		Camera cam1,
+		Camera cam2) {
+
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < colour_in.width() && y < colour_in.height()) {
+		float d = depth_in.tex2D(x,y);
+		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,d);
+        const uint2 p = cam2.camToScreen<uint2>(camPosOrigin);
+
+		if (p.x < colour_out.width() && p.y < colour_out.height()) {
+			float dout = depth_out(p.x, p.y);
+			uchar4 cin = colour_in(x,y);
+			uchar4 cout = colour_out(p.x, p.y);
+
+			if (fabs(dout-camPosOrigin.z) < 0.1f) {
+				colour_out(p.x, p.y) = make_uchar4(
+					(int(cin.x)+int(cout.x)) / 2,
+					(int(cin.y)+int(cout.y)) / 2,
+					(int(cin.z)+int(cout.z)) / 2,
+					0);
+			}
+		}
+	}
+}
+
+void ftl::cuda::viz_reprojection(
+		TextureObject<uchar4> &colour_out,
+		TextureObject<uchar4> &colour_in,
+		TextureObject<float> &depth_out,
+		TextureObject<float> &depth_in,
+		const float4x4 &pose,
+		const Camera &cam1,
+		const Camera &cam2, cudaStream_t stream) {
+	const dim3 gridSize((colour_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	viz_reprojection_kernel<<<gridSize, blockSize, 0, stream>>>(colour_out, colour_in, depth_out, depth_in, pose, cam1, cam2);
 	cudaSafeCall( cudaGetLastError() );
 }
diff --git a/components/operators/src/fusion/mvmls.cpp b/components/operators/src/fusion/mvmls.cpp
index 7d7ef03d9..55b992957 100644
--- a/components/operators/src/fusion/mvmls.cpp
+++ b/components/operators/src/fusion/mvmls.cpp
@@ -2,6 +2,7 @@
 #include "smoothing_cuda.hpp"
 #include <ftl/utility/matrix_conversion.hpp>
 #include "mvmls_cuda.hpp"
+#include <ftl/cuda/normals.hpp>
 
 #include <opencv2/cudaarithm.hpp>
 
@@ -32,12 +33,16 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
     int win = config()->value("window_size",16);
     bool do_corr = config()->value("merge_corresponding", true);
 	bool do_aggr = config()->value("merge_mls", false);
+    bool do_colour_adjust = config()->value("apply_colour_adjust", false);
 	bool cull_zero = config()->value("cull_no_confidence", false);
     //bool show_best_source = config()->value("show_pixel_source", false);
 
     ftl::cuda::MvMLSParams params;
     params.colour_smooth = config()->value("colour_smooth", 150.0f);
     params.spatial_smooth = config()->value("spatial_smooth", 1.0f);
+	params.sub_pixel = config()->value("sub_pixel", 0.5f);
+	params.P1 = config()->value("P1", 0.1f);
+	params.P2 = config()->value("P2", 0.2f);
 
 	bool show_consistency = config()->value("show_consistency", false);
 	bool show_adjustment = config()->value("show_adjustment", false);
@@ -142,19 +147,143 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
                             stream
                         );
 
-						if (show_consistency) {
-							ftl::cuda::show_cor_error(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f2.getTexture<short2>(Channel::Screen), stream);
-							ftl::cuda::show_cor_error(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f1.getTexture<short2>(Channel::Screen), stream);
-						}
+						/*ftl::cuda::remove_cor_error(
+							f1.getTexture<float>(Channel::Confidence),
+							f1.getTexture<short2>(Channel::Screen),
+							f2.getTexture<short2>(Channel::Screen),
+							2.0f,
+							stream);
 
-						if (show_adjustment) {
-							ftl::cuda::show_depth_adjustment(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<float>(Channel::Confidence), stream);
-							ftl::cuda::show_depth_adjustment(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<float>(Channel::Confidence), stream);
-						}
+						ftl::cuda::remove_cor_error(
+							f2.getTexture<float>(Channel::Confidence),
+							f2.getTexture<short2>(Channel::Screen),
+							f1.getTexture<short2>(Channel::Screen),
+							2.0f,
+							stream);*/
 
 						// Actually perform the adjustment
 						cv::cuda::add(f1.get<GpuMat>(Channel::Depth), f1.get<GpuMat>(Channel::Confidence), f1.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
 						cv::cuda::add(f2.get<GpuMat>(Channel::Depth), f2.get<GpuMat>(Channel::Confidence), f2.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
+
+						/*ftl::cuda::viz_reprojection(
+                            f1.getTexture<uchar4>(Channel::Colour),
+                            f2.getTexture<uchar4>(Channel::Colour),
+							f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<float>(Channel::Depth),
+                            pose,
+                            f2.getLeftCamera(),
+                            f1.getLeftCamera(),
+                            stream
+                        );
+						ftl::cuda::viz_reprojection(
+                            f2.getTexture<uchar4>(Channel::Colour),
+                            f1.getTexture<uchar4>(Channel::Colour),
+							f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<float>(Channel::Depth),
+                            pose2,
+                            f1.getLeftCamera(),
+                            f2.getLeftCamera(),
+                            stream
+                        );*/
+						//continue;
+
+                        //auto &g1 = f1.get<GpuMat>(Channel::Depth);
+                        //costs_[0].create(g1.cols/4, (g1.rows/4) * 16);
+                        //auto &g2 = f2.get<GpuMat>(Channel::Depth);
+                        //costs_[1].create(g2.cols/4, (g2.rows/4) * 16);
+
+						/*ftl::cuda::correspondence(
+                            f1.getTexture<uchar4>(Channel::Colour),
+                            f2.getTexture<uchar4>(Channel::Colour),
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<short2>(Channel::Screen),
+                            f1.getTexture<float>(Channel::Confidence),
+                            //f1.getTexture<uint8_t>(Channel::Mask),
+                            //costs_[0],
+                            pose2,
+                            f1.getLeftCamera(),
+                            f2.getLeftCamera(),
+                            params,
+                            16,
+                            stream
+                        );
+                        ftl::cuda::correspondence(
+                            f2.getTexture<uchar4>(Channel::Colour),
+                            f1.getTexture<uchar4>(Channel::Colour),
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<short2>(Channel::Screen),
+                            f2.getTexture<float>(Channel::Confidence),
+                            //f2.getTexture<uint8_t>(Channel::Mask),
+                            //costs_[1],
+                            pose,
+                            f2.getLeftCamera(),
+                            f1.getLeftCamera(),
+                            params,
+                            16,
+                            stream
+                        );*/
+
+                        /*ftl::cuda::aggregate_colour_costs(
+                            f1.getTexture<float>(Channel::Depth),
+                            f2.getTexture<float>(Channel::Depth),
+                            costs_[0],
+                            costs_[1],
+                            f1.getTexture<short2>(Channel::Screen),
+                            f1.getTexture<float>(Channel::Confidence),
+                            pose2,
+                            f1.getLeftCamera(),
+                            f2.getLeftCamera(),
+                            params,
+                            16,
+                            stream
+                        );
+
+                        ftl::cuda::aggregate_colour_costs(
+                            f2.getTexture<float>(Channel::Depth),
+                            f1.getTexture<float>(Channel::Depth),
+                            costs_[1],
+                            costs_[0],
+                            f2.getTexture<short2>(Channel::Screen),
+                            f2.getTexture<float>(Channel::Confidence),
+                            pose,
+                            f2.getLeftCamera(),
+                            f1.getLeftCamera(),
+                            params,
+                            16,
+                            stream
+                        );*/
+
+						if (show_consistency) {
+							ftl::cuda::show_cor_error(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f2.getTexture<short2>(Channel::Screen), 5.0f, stream);
+							ftl::cuda::show_cor_error(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f1.getTexture<short2>(Channel::Screen), 5.0f, stream);
+						}
+
+						/*ftl::cuda::remove_cor_error(
+							f1.getTexture<float>(Channel::Confidence),
+							f1.getTexture<short2>(Channel::Screen),
+							f2.getTexture<short2>(Channel::Screen),
+							5.0f,
+							stream);
+
+						ftl::cuda::remove_cor_error(
+							f2.getTexture<float>(Channel::Confidence),
+							f2.getTexture<short2>(Channel::Screen),
+							f1.getTexture<short2>(Channel::Screen),
+							5.0f,
+							stream);*/
+
+						// Actually perform the colour adjustment
+                        //if (do_colour_adjust) {
+						//    cv::cuda::add(f1.get<GpuMat>(Channel::Depth), f1.get<GpuMat>(Channel::Confidence), f1.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
+						//    cv::cuda::add(f2.get<GpuMat>(Channel::Depth), f2.get<GpuMat>(Channel::Confidence), f2.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream);
+                        //}
+
+						if (show_adjustment) {
+							ftl::cuda::show_depth_adjustment(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f1.getTexture<float>(Channel::Confidence), 0.04f, stream);
+							ftl::cuda::show_depth_adjustment(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f2.getTexture<float>(Channel::Confidence), 0.04f, stream);
+						}
                     //} //else {
                         /*ftl::cuda::correspondence(
                             f1.getTexture<float>(Channel::Depth),
@@ -207,6 +336,16 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 
             // Reduce window size for next iteration
             //win = max(win>>1, 4);
+
+            // Redo normals
+            for (size_t i=0; i<in.frames.size(); ++i) {
+                auto &f = in.frames[i];
+                ftl::cuda::normals(
+                    f.getTexture<half4>(Channel::Normals),
+                    f.getTexture<float>(Channel::Depth),
+                    f.getLeftCamera(), stream
+                );
+            }
 		}
 
 	for (int iter=0; iter<iters; ++iter) {	
diff --git a/components/operators/src/fusion/mvmls_cuda.hpp b/components/operators/src/fusion/mvmls_cuda.hpp
index 4ee589e0d..4571e2df4 100644
--- a/components/operators/src/fusion/mvmls_cuda.hpp
+++ b/components/operators/src/fusion/mvmls_cuda.hpp
@@ -11,6 +11,9 @@ namespace cuda {
 struct MvMLSParams {
     float spatial_smooth;
     float colour_smooth;
+	float sub_pixel;
+	float P1;
+	float P2;
 };
 
 void correspondence(
@@ -26,6 +29,20 @@ void correspondence(
         const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int func,
         cudaStream_t stream);
 
+void correspondence(
+        ftl::cuda::TextureObject<uchar4> &c1,
+        ftl::cuda::TextureObject<uchar4> &c2,
+        ftl::cuda::TextureObject<float> &d1,
+        ftl::cuda::TextureObject<float> &d2,
+        ftl::cuda::TextureObject<short2> &screen,
+		ftl::cuda::TextureObject<float> &conf,
+		//ftl::cuda::TextureObject<uint8_t> &mask,
+        //ftl::cuda::TextureObject<half> &costs,
+        float4x4 &pose,
+        const ftl::rgbd::Camera &cam1,
+        const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int radius,
+        cudaStream_t stream);
+
 void correspondence(
         ftl::cuda::TextureObject<float> &d1,
         ftl::cuda::TextureObject<float> &d2,
@@ -37,6 +54,18 @@ void correspondence(
         const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int func,
         cudaStream_t stream);
 
+void aggregate_colour_costs(
+        ftl::cuda::TextureObject<float> &d1,
+        ftl::cuda::TextureObject<float> &d2,
+        ftl::cuda::TextureObject<half> &costs1,
+        ftl::cuda::TextureObject<half> &costs2,
+        ftl::cuda::TextureObject<short2> &screen,
+		ftl::cuda::TextureObject<float> &conf,
+        float4x4 &pose,
+        const ftl::rgbd::Camera &cam1,
+        const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int radius,
+        cudaStream_t stream);
+
 void zero_confidence(
 		ftl::cuda::TextureObject<float> &conf,
 		ftl::cuda::TextureObject<float> &depth,
@@ -101,13 +130,32 @@ void show_cor_error(
 	ftl::cuda::TextureObject<uchar4> &colour,
 	ftl::cuda::TextureObject<short2> &screen1,
 	ftl::cuda::TextureObject<short2> &screen2,
+	float thresh,
+	cudaStream_t stream);
+
+void remove_cor_error(
+	ftl::cuda::TextureObject<float> &adjust,
+	ftl::cuda::TextureObject<short2> &screen1,
+	ftl::cuda::TextureObject<short2> &screen2,
+	float thresh,
 	cudaStream_t stream);
 
 void show_depth_adjustment(
 	ftl::cuda::TextureObject<uchar4> &colour,
+    ftl::cuda::TextureObject<short2> &screen,
 	ftl::cuda::TextureObject<float> &adjust,
+	float scale,
 	cudaStream_t stream);
 
+void viz_reprojection(
+	ftl::cuda::TextureObject<uchar4> &colour_out,
+	ftl::cuda::TextureObject<uchar4> &colour_in,
+	ftl::cuda::TextureObject<float> &depth_out,
+	ftl::cuda::TextureObject<float> &depth_in,
+	const float4x4 &pose,
+	const ftl::rgbd::Camera &cam1,
+	const ftl::rgbd::Camera &cam2, cudaStream_t stream);
+
 }
 }
 
-- 
GitLab