diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index 9d8dac421d0189c978dbc0312302f2b34349945c..fb5777740397e855f291258826655125eb620ca3 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -339,10 +339,10 @@ static void visualizeEnergy(	const cv::Mat &depth, cv::Mat &out,
 	depth.convertTo(out, CV_8U, 255.0f / max_depth);
 	//out = 255 - out;
-	cv::Mat mask = (depth >= 39.0f); // TODO (mask for invalid pixels)
+	//cv::Mat mask = (depth >= 39.0f); // TODO (mask for invalid pixels)
 	applyColorMap(out, out, cv::COLORMAP_JET);
-	out.setTo(cv::Scalar(255, 255, 255), mask);
+	//out.setTo(cv::Scalar(255, 255, 255), mask);
 static void drawEdges(	const cv::Mat &in, cv::Mat &out,
@@ -467,7 +467,7 @@ const GLTexture &ftl::gui::Camera::captureFrame() {
 			case Channel::Smoothing:
 			case Channel::Confidence:
 				if (im2_.rows == 0) { break; }
-				visualizeEnergy(im2_, tmp, 1.0);
+				visualizeEnergy(im2_, tmp, screen_->root()->value("float_image_max", 1.0f));
diff --git a/applications/player/src/main.cpp b/applications/player/src/main.cpp
index 7eeb6e6bf5f96fd8f32360949c566c93860518db..33e146880668fb29224aaffc8942cb4255ebf0c1 100644
--- a/applications/player/src/main.cpp
+++ b/applications/player/src/main.cpp
@@ -7,6 +7,7 @@
 #include <ftl/timer.hpp>
 #include <fstream>
+#include <bitset>
 #include <Eigen/Eigen>
@@ -57,8 +58,23 @@ int main(int argc, char **argv) {
     int current_stream = 0;
     int current_channel = 0;
-	ftl::timer::add(ftl::timer::kTimerMain, [&current_stream,&current_channel,&r](int64_t ts) {
-		bool res = r.read(ts, [&current_stream,&current_channel,&r](const ftl::codecs::StreamPacket &spkt, const ftl::codecs::Packet &pkt) {
+	int stream_mask = 0;
+	std::vector<std::bitset<128>> channel_mask;
+	ftl::timer::add(ftl::timer::kTimerMain, [&current_stream,&current_channel,&r,&stream_mask,&channel_mask](int64_t ts) {
+		bool res = r.read(ts, [&current_stream,&current_channel,&r,&stream_mask,&channel_mask](const ftl::codecs::StreamPacket &spkt, const ftl::codecs::Packet &pkt) {
+			// Log each stream the first time one of its packets is seen.
+			// NOTE(review): stream_mask is a plain int, so only small stream IDs can be tracked by this bit test - confirm the ID range.
+			if (!(stream_mask & (1 << spkt.streamID))) {
+				stream_mask |= 1 << spkt.streamID;
+				LOG(INFO) << " - Stream found (" << (int)spkt.streamID << ")";
+			}
+			// The per-stream channel table is indexed by streamID, so size it by ID.
+			// Appending one entry per newly seen stream only matches the index when
+			// IDs first appear as 0,1,2,...; any other order would read out of bounds.
+			if (channel_mask.size() <= static_cast<size_t>(spkt.streamID)) {
+				channel_mask.resize(static_cast<size_t>(spkt.streamID) + 1);
+			}
+			// Log each channel the first time it is seen on a given stream.
+			if (!(channel_mask[spkt.streamID][(int)spkt.channel])) {
+				channel_mask[spkt.streamID].set((int)spkt.channel);
+				LOG(INFO) << " - Channel " << (int)spkt.channel << " found (" << (int)spkt.streamID << ")";
+			}
 			if (spkt.streamID == current_stream) {
 				if (pkt.codec == codec_t::POSE) {
diff --git a/components/operators/src/correspondence.cu b/components/operators/src/correspondence.cu
index 36e91d9f483d62164b284f43b82443641eadd664..1611a8483f8faf2bcb6b56640c3f7a07edc3fb22 100644
--- a/components/operators/src/correspondence.cu
+++ b/components/operators/src/correspondence.cu
@@ -1,6 +1,7 @@
 #include "mvmls_cuda.hpp"
 #include <ftl/cuda/weighting.hpp>
 #include <ftl/cuda/mask.hpp>
+#include <ftl/cuda/warp.hpp>
 using ftl::cuda::TextureObject;
 using ftl::rgbd::Camera;
@@ -8,6 +9,8 @@ using ftl::cuda::Mask;
 using ftl::cuda::MvMLSParams;
 #define T_PER_BLOCK 8
+#define WARP_SIZE 32
+#define INTERVAL 1
 template<int FUNCTION>
 __device__ float weightFunction(const ftl::cuda::MvMLSParams &params, float dweight, float cweight);
@@ -42,6 +45,45 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params,
 	return (cweight > 0.0f) ? dweight : 0.0f;
+#ifndef PINF
+#define PINF __int_as_float(0x7f800000)
+__device__ inline float distance(float4 p1, float4 p2) {
+	return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 10.0f);
+	//return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f);
+__device__ inline int halfWarpCensus(float e) {
+	float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16 : 0, WARP_SIZE);
+	int c = (e > e0) ? 1 << (threadIdx.x % 16) : 0;
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const int other = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE);
+		c |= other;
+	}
+	return c;
+__device__ inline float4 relativeDelta(const float4 &e) {
+	const float e0x = __shfl_sync(FULL_MASK, e.x, 0, WARP_SIZE/2);
+	const float e0y = __shfl_sync(FULL_MASK, e.y, 0, WARP_SIZE/2);
+	const float e0z = __shfl_sync(FULL_MASK, e.z, 0, WARP_SIZE/2);
+	return make_float4(e.x-e0x, e.y-e0y, e.z-e0z, 0.0f);
+ * See: Birchfield S. et al. (1998). A pixel dissimilarity measure that is
+ * insensitive to image sampling. IEEE Transactions on Pattern Analysis and
+ * Machine Intelligence.
+ */
+__device__ float dissimilarity(const float4 &l, const float4 &rp, const float4 &rc, const float4 &rn) {
+	const float rpd = distance((rc - rp) * 0.5f + rp, l);
+	const float rnd = distance((rc - rn) * 0.5f + rn, l);
+	const float rcd = distance(rc, l);
+	return min(min(rpd, rnd), rcd);
 template<int COR_STEPS, int FUNCTION> 
 __global__ void corresponding_point_kernel(
         TextureObject<float> d1,
@@ -51,16 +93,14 @@ __global__ void corresponding_point_kernel(
         TextureObject<short2> screenOut,
 		TextureObject<float> conf,
 		TextureObject<int> mask,
-        float4x4 pose1,
-        float4x4 pose1_inv,
-        float4x4 pose2,  // Inverse
+        float4x4 pose,
         Camera cam1,
         Camera cam2, ftl::cuda::MvMLSParams params) {
+	//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	// Map the (32,2) thread block onto an 8x8 pixel tile made of four 4x4 quads:
+	// threadIdx.x/16 selects the left/right quad, threadIdx.y the top/bottom quad,
+	// and the low 4 bits of threadIdx.x give the position inside the quad.
+	// The row inside the quad must be (threadIdx.x%16)/4; using threadIdx.x/4
+	// moved lanes 16..31 four rows down, outside the block's own tile.
+	const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE;
+	const int y = blockIdx.y*8 + (threadIdx.x%16)/4 + 4*threadIdx.y;
-    // Each warp picks point in p1
-    //const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
-	const int x = (blockIdx.x*blockDim.x + threadIdx.x); // / WARP_SIZE;
-    const int y = blockIdx.y*blockDim.y + threadIdx.y;
     if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) {
         screenOut(x,y) = make_short2(-1,-1);
@@ -73,83 +113,98 @@ __global__ void corresponding_point_kernel(
         //const float4 temp = vout.tex2D(x,y);
         //vout(x,y) =  make_float4(depth1, 0.0f, temp.z, temp.w);
-        const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
+        //const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
         const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f);
         //float bestdepth = 0.0f;
         short2 bestScreen = make_short2(-1,-1);
-		float bestdepth = 0.0f;
-		float bestdepth2 = 0.0f;
+		//float bestdepth = 0.0f;
+		//float bestdepth2 = 0.0f;
         float bestweight = 0.0f;
         float bestcolour = 0.0f;
-        float bestdweight = 0.0f;
+        //float bestdweight = 0.0f;
         float totalcolour = 0.0f;
-        int count = 0;
-        float contrib = 0.0f;
+        //int count = 0;
+        //float contrib = 0.0f;
+		const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1);
+        const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin);
+        const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 10.0f);
+        const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant);
+        const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x);
+		const float depthM = 10.0f / (lineDistant.x - lineOrigin.x);
+		const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x);
+        float2 linePos;
+        linePos.x = lineOrigin.x - ((COR_STEPS/2));
+        linePos.y = lineOrigin.y - (((COR_STEPS/2)) * lineM);
+		//float depthPos = depth1 - (float((COR_STEPS/2)) * depthM);
+        float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2);
-        const float step_interval = params.range / (COR_STEPS / 2);
-        const float3 rayStep_world = pose1.getFloat3x3() * cam1.screenToCam(x,y,step_interval);
-        const float3 rayStart_2 = pose2 * world1;
-        const float3 rayStep_2 = pose2.getFloat3x3() * rayStep_world;
+        uint badMask = 0;
+        int bestStep = 0;
         // Project to p2 using cam2
         // Each thread takes a possible correspondence and calculates a weighting
         //const int lane = tid % WARP_SIZE;
-        for (int i=0; i<COR_STEPS; ++i) {
-            const int j = i - (COR_STEPS/2);
-            const float depth_adjust = (float)j * step_interval;
-            // Calculate adjusted depth 3D point in camera 2 space
-            const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust));
-            const float3 camPos = rayStart_2 + j * rayStep_2; //pose2 * worldPos;
-			const float2 screen = cam2.camToScreen<float2>(camPos);
-			float weight = (screen.x >= cam2.width || screen.y >= cam2.height) ? 0.0f : 1.0f;
+        for (int i=0; i<COR_STEPS; ++i) {			
+			float weight = (linePos.x >= cam2.width || linePos.y >= cam2.height) ? 0.0f : 1.0f;
 			// Generate a colour correspondence value
-            const auto colour2 = c2.tex2D(screen.x, screen.y);
+            const auto colour2 = c2.tex2D(linePos.x, linePos.y);
+            // TODO: Check if other colour dissimilarities are better...
             const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
             // Generate a depth correspondence value
-			const float depth2 = d2.tex2D(int(screen.x+0.5f), int(screen.y+0.5f));
+            const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f));
+            // Record which correspondences are invalid (one bit per search step).
+            // Unsigned literals match badMask's type and keep step 31 of a 32-step
+            // search from shifting into the sign bit of a signed int.
+            badMask |= (depth2 <= cam2.minDepth || depth2 >= cam2.maxDepth) ? 1u << i : 0u;
-			if (FUNCTION == 1) {
-				weight *= ftl::cuda::weighting(fabs(depth2 - camPos.z), cweight*params.spatial_smooth);
-			} else {
-				const float dweight = ftl::cuda::weighting(fabs(depth2 - camPos.z), params.spatial_smooth);
-            	weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-			}
+			//if (FUNCTION == 1) {
+				weight *= ftl::cuda::weighting(fabs(depth2 - depthPos2), cweight*params.spatial_smooth);
+			//} else {
+			//	const float dweight = ftl::cuda::weighting(fabs(depth2 - depthPos2), params.spatial_smooth);
+            //	weight *= weightFunction<FUNCTION>(params, dweight, cweight);
+			//}
             //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range);
             //weight *= weightFunction<FUNCTION>(params, dweight, cweight);
-            ++count;
-            contrib += weight;
+            //++count;
             bestcolour = max(cweight, bestcolour);
-            //bestdweight = max(dweight, bestdweight);
             totalcolour += cweight;
-			bestdepth = (weight > bestweight) ? depth_adjust : bestdepth;
-			//bestdepth2 = (weight > bestweight) ? camPos.z : bestdepth2;
-			//bestScreen = (weight > bestweight) ? make_short2(screen.x+0.5f, screen.y+0.5f) : bestScreen;
+            //bestdepth = (weight > bestweight) ? depthPos : bestdepth;
+            bestStep = (weight > bestweight) ? i : bestStep;
 			bestweight = max(bestweight, weight);
-                //bestweight = weight;
-                //bestdepth = depth_adjust;
-                //bestScreen = make_short2(screen.x+0.5f, screen.y+0.5f);
-            //}
+			//depthPos += depthM;
+			depthPos2 += depthM2;
+            linePos.x += 1.0f;
+            linePos.y += lineM;
-        const float avgcolour = totalcolour/(float)count;
+        //const float avgcolour = totalcolour/(float)count;
         const float confidence = bestcolour / totalcolour; //bestcolour - avgcolour;
+        const float bestadjust = float(bestStep-(COR_STEPS/2))*depthM;
+        // Detect matches to boundaries, and discard those: a best step whose
+        // immediate neighbour step had an invalid depth gets zero weight.
+        // Unsigned shift so that bestStep == 31 stays within the uint mask.
+        uint stepMask = 1u << bestStep;
+        if ((stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestweight = 0.0f;
         //Mask m(mask.tex2D(x,y));
         //if (bestweight > 0.0f) {
             float old = conf.tex2D(x,y);
             if (bestweight * confidence > old) {
-				d1(x,y) = 0.4f*bestdepth + depth1;
+				d1(x,y) = (0.4f*bestadjust) + depth1;
 				//d2(bestScreen.x, bestScreen.y) = bestdepth2;
                 //screenOut(x,y) = bestScreen;
                 conf(x,y) = bestweight * confidence;
@@ -169,26 +224,35 @@ void ftl::cuda::correspondence(
         TextureObject<short2> &screen,
 		TextureObject<float> &conf,
 		TextureObject<int> &mask,
-        float4x4 &pose1,
-        float4x4 &pose1_inv,
         float4x4 &pose2,
         const Camera &cam1,
         const Camera &cam2, const MvMLSParams &params, int func,
         cudaStream_t stream) {
-	const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	// Each (WARP_SIZE, 2) block handles a tile T_PER_BLOCK pixels wide (the kernel
+	// computes x as blockIdx.x*8 + 0..7), so the grid needs ceil(width / T_PER_BLOCK)
+	// blocks in x. Launching width + 1 blocks only added blocks that fail the
+	// kernel's bounds check.
+	const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(WARP_SIZE, 2);
     //printf("COR SIZE %d,%d\n", p1.width(), p1.height());
-	switch (func) {
-    case 0: corresponding_point_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	case 1: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	case 2: corresponding_point_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	case 3: corresponding_point_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	case 4: corresponding_point_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	case 5: corresponding_point_kernel<16,5><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
-	}
+	/*switch (func) {
+    case 0: corresponding_point_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 1: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 2: corresponding_point_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 3: corresponding_point_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 4: corresponding_point_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+	case 5: corresponding_point_kernel<16,5><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    }*/
+    switch (func) {
+    case 32: corresponding_point_kernel<32,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 16: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 8: corresponding_point_kernel<8,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 4: corresponding_point_kernel<4,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    case 2: corresponding_point_kernel<2,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break;
+    }
     cudaSafeCall( cudaGetLastError() );
diff --git a/components/operators/src/mvmls.cpp b/components/operators/src/mvmls.cpp
index e85f8271149537c920b838e9885c3a406bbb5b5b..4a61e63223bc14e07b9dd90695a6600b4be99b7e 100644
--- a/components/operators/src/mvmls.cpp
+++ b/components/operators/src/mvmls.cpp
@@ -24,7 +24,8 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 	int iters = config()->value("mls_iterations", 3);
 	int radius = config()->value("mls_radius",5);
 	//bool aggre = config()->value("aggregation", true);
-    int win = config()->value("cost_function",1);
+    //int win = config()->value("cost_function",1);
+    int win = config()->value("window_size",16);
     bool do_corr = config()->value("merge_corresponding", true);
 	bool do_aggr = config()->value("merge_mls", false);
 	bool cull_zero = config()->value("cull_no_confidence", false);
@@ -103,12 +104,9 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
                     // No, so skip this combination
                     if (d1.dot(d2) <= 0.0) continue;
-                    auto pose1 = MatrixConversion::toCUDA(s1->getPose().cast<float>());
-                    auto pose1_inv = MatrixConversion::toCUDA(s1->getPose().cast<float>().inverse());
-                    auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse());
-					auto pose2_inv = MatrixConversion::toCUDA(s2->getPose().cast<float>());
+                    auto pose2 = MatrixConversion::toCUDA(s2->getPose().cast<float>().inverse() * s1->getPose().cast<float>());
-                    auto transform = pose2 * pose1;
+                    //auto transform = pose2 * pose1;
                     //Calculate screen positions of estimated corresponding points
@@ -120,8 +118,6 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
-                        pose1,
-                        pose1_inv,
@@ -144,6 +140,9 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
+            // Reduce window size for next iteration
+            win = max(win>>1, 4);
         // Find best source for every pixel
@@ -250,7 +249,7 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
             for (size_t i=0; i<in.frames.size(); ++i) {
                 auto &f1 = in.frames[i];
                 //f1.get<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0.0f), cvstream);
-                f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
+                //f1.get<GpuMat>(Channel::Confidence).setTo(cv::Scalar(0.0f), cvstream);
                 Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0);
                 d1 = in.sources[i]->getPose() * d1;
diff --git a/components/operators/src/mvmls_cuda.hpp b/components/operators/src/mvmls_cuda.hpp
index 93b1e8d882848490aec96a291d58805c2d2dbf03..5faeb47533a16e7c3e13bfc5e23e826aa8b836e3 100644
--- a/components/operators/src/mvmls_cuda.hpp
+++ b/components/operators/src/mvmls_cuda.hpp
@@ -28,9 +28,7 @@ void correspondence(
         ftl::cuda::TextureObject<short2> &screen,
 		ftl::cuda::TextureObject<float> &conf,
 		ftl::cuda::TextureObject<int> &mask,
-        float4x4 &pose1,
-        float4x4 &pose1_inv,
-        float4x4 &pose2,
+        float4x4 &pose,
         const ftl::rgbd::Camera &cam1,
         const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams &params, int func,
         cudaStream_t stream);
diff --git a/components/renderers/cpp/include/ftl/cuda/warp.hpp b/components/renderers/cpp/include/ftl/cuda/warp.hpp
index 9164b0eeeb8b3ef606aef4930f55b38a1afacdc4..8c0fdef9802c1ffb86e226c803d806ea56ea003e 100644
--- a/components/renderers/cpp/include/ftl/cuda/warp.hpp
+++ b/components/renderers/cpp/include/ftl/cuda/warp.hpp
@@ -42,6 +42,41 @@ __device__ inline int warpSum(int e) {
 	return e;
+// Half warp
+__device__ inline float halfWarpMin(float e) {
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const float other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e = min(e, other);
+	}
+	return e;
+__device__ inline float halfWarpMax(float e) {
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const float other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e = max(e, other);
+	}
+	return e;
+__device__ inline float halfWarpSum(float e) {
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		const float other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e += other;
+	}
+	return e;
+/**
+ * Integer sum across a half warp.
+ *
+ * Butterfly reduction using XOR shuffles with offsets WARP_SIZE/4 down to 1
+ * (8, 4, 2, 1 when WARP_SIZE is 32), so every lane ends up holding the sum
+ * of the values in its own 16-lane half. All lanes named by FULL_MASK must
+ * call this together.
+ *
+ * @param e This lane's contribution.
+ * @return Sum of e over the lanes in the caller's half warp.
+ */
+__device__ inline int halfWarpSum(int e) {
+	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+		// Keep the exchanged value as int: storing it in a float made the
+		// addition round-trip through float and lose precision for large sums.
+		const int other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e += other;
+	}
+	return e;
diff --git a/components/renderers/cpp/src/reprojection.cu b/components/renderers/cpp/src/reprojection.cu
index e6490c47eaf9033c2077cdd06457b3ff4f59103e..b23ad81e9c462f9d4aeb2b6a83d53331487b1db5 100644
--- a/components/renderers/cpp/src/reprojection.cu
+++ b/components/renderers/cpp/src/reprojection.cu
@@ -89,7 +89,9 @@ __global__ void reprojection_kernel(
 	const float3 n = transformR * make_float3(normals.tex2D((int)x, (int)y));
 	float3 ray = camera.screenToCam(screenPos.x, screenPos.y, 1.0f);
 	ray = ray / length(ray);
-	const float dotproduct = max(dot(ray,n),0.0f);
+	// Allow slightly beyond 90 degrees due to normal estimation errors:
+	// clamp the cosine at -0.1, then shift and rescale so the result lies in [0, 1]
+	// for unit-length inputs. All literals carry the f suffix so the expression
+	// is evaluated in single precision rather than being promoted to double.
+	const float dotproduct = (max(dot(ray,n),-0.1f)+0.1f) / 1.1f;
 	const float d2 = depth_src.tex2D(int(screenPos.x+0.5f), int(screenPos.y+0.5f));
 	const auto input = in.tex2D(screenPos.x, screenPos.y); //generateInput(in.tex2D((int)screenPos.x, (int)screenPos.y), params, worldPos);
diff --git a/components/renderers/cpp/src/splatter.cu b/components/renderers/cpp/src/splatter.cu
index 2986234bb3bbc24f762ff5ba0103ba173f4a0093..55706b0856750738134a3417dfdb017205ab2bdf 100644
--- a/components/renderers/cpp/src/splatter.cu
+++ b/components/renderers/cpp/src/splatter.cu
@@ -131,7 +131,7 @@ using ftl::cuda::warpSum;
 	const uint2 screenPos = params.camera.camToScreen<uint2>(camPos);
 	const unsigned int cx = screenPos.x;
 	const unsigned int cy = screenPos.y;
-	if (d > params.camera.minDepth && d < params.camera.maxDepth && cx < depth.width() && cy < depth.height()) {
+	if (d > params.camera.minDepth && d < params.camera.maxDepth && cx < depth_out.width() && cy < depth_out.height()) {
 		// Transform estimated point to virtual cam space and output z
 		atomicMin(&depth_out(cx,cy), d * 100000.0f);
@@ -155,7 +155,7 @@ void ftl::cuda::dibr_merge(TextureObject<float4> &points, TextureObject<int> &de
 void ftl::cuda::dibr_merge(TextureObject<float> &depth, TextureObject<int> &depth_out, const float4x4 &transform, const ftl::rgbd::Camera &cam, SplatParams params, cudaStream_t stream) {
-    const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 	dibr_merge_kernel<<<gridSize, blockSize, 0, stream>>>(depth, depth_out, transform, cam, params);
diff --git a/components/renderers/cpp/src/tri_render.cpp b/components/renderers/cpp/src/tri_render.cpp
index 567a9d139e45678c4d64534e481a8d5c107e3ad5..06d4fe2626f01e989645e19e795ef4df925c96c2 100644
--- a/components/renderers/cpp/src/tri_render.cpp
+++ b/components/renderers/cpp/src/tri_render.cpp
@@ -621,7 +621,7 @@ bool Triangular::render(ftl::rgbd::VirtualSource *src, ftl::rgbd::Frame &out, co
-	if (value("show_bad_colour", false)) {
+	if (value("show_bad_colour", true)) {