diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt
index ae06aab6d142ba8a2ad3a1e0970905e3a70468e9..1e55c671c87487b995b0dd772495ff8a612a8c78 100644
--- a/applications/reconstruct/CMakeLists.txt
+++ b/applications/reconstruct/CMakeLists.txt
@@ -10,7 +10,7 @@ set(REPSRC
 	src/garbage.cu
 	src/integrators.cu
 	#src/ray_cast_sdf.cu
-	src/splat_render.cu
+	src/voxel_render.cu
 	src/camera_util.cu
 	src/voxel_hash.cu
 	src/voxel_hash.cpp
@@ -19,6 +19,7 @@ set(REPSRC
 	#src/virtual_source.cpp
 	src/splat_render.cpp
 	src/dibr.cu
+	src/mls.cu
 	src/depth_camera.cu
 	src/depth_camera.cpp
 )
diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp
index b7a3b44426bf6fcf490af67f38461ca849ade428..ae85cfbd785fb72f79225ae28278bf4862b680db 100644
--- a/applications/reconstruct/include/ftl/cuda_operators.hpp
+++ b/applications/reconstruct/include/ftl/cuda_operators.hpp
@@ -406,6 +406,12 @@ inline __host__ __device__ float length(float3 v)
     return sqrtf(dot(v, v));
 }
 
+// length squared
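+// (Avoids the sqrtf in length(); e.g. length2(a - b) <= r*r tests distance without a square root.)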
+inline __host__ __device__ float length2(const float3 &v)
+{
+    return dot(v, v);
+}
+
 // normalize
 inline __host__ __device__ float3 normalize(float3 v)
 {
diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu
index 9a36ea452225c1a174d0ba6ced0baf98471688eb..7a322eaf733230772e72c50722e849335bf7e891 100644
--- a/applications/reconstruct/src/depth_camera.cu
+++ b/applications/reconstruct/src/depth_camera.cu
@@ -47,6 +47,21 @@ void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStre
 	clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_to_zero_kernel(ftl::cuda::TextureObject<float> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = 0.0f; //PINF;
+	}
+}
+
+void ftl::cuda::clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_to_zero_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 __global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -79,6 +94,21 @@ void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cuda
 	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_colour_kernel(ftl::cuda::TextureObject<float4> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = make_float4(0.0f);
+	}
+}
+
+void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 // ===== Type convert =====
 
 template <typename A, typename B>
@@ -103,331 +133,6 @@ void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda:
 	convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale);
 }
 
-/// ===== MLS Smooth
-
-// TODO:(Nick) Put this in a common location (used in integrators.cu)
-extern __device__ float spatialWeighting(float r);
-extern __device__ float spatialWeighting(float r, float h);
-
-/*
- * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
- * Real-time foreground-background segmentation using codebook model.
- * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
- */
- __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
-	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
-	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
-	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
-	const float p_2 = xv_2 / v_2;
-	return sqrt(x_2 - p_2);
-}
-
-__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
-	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
-	return length(delta);
-}
-
-/*
- * Colour weighting as suggested in:
- * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
- * c = colour distance
- */
- __device__ float colourWeighting(float c) {
-	const float h = c_hashParams.m_colourSmoothing;
-	if (c >= h) return 0.0f;
-	float ch = c / h;
-	ch = 1.0f - ch*ch;
-	return ch*ch*ch*ch;
-}
-
-#define WINDOW_RADIUS 5
-
-__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > 0.0f) {
-						wpos += weight* (camera.pose * camPos);
-						weights += weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on creen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-
-				// TODO:(Nick) dot product of normals < 0 means the point
-				// should be ignored with a weight of 0 since it is facing the wrong direction
-				// May be good to simply weight using the dot product to give
-				// a stronger weight to those whose normals are closer
-
-				float weight = spatialWeighting(length(pf - camPos), h);
-
-				if (weight > 0.0f) {
-					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
-					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
-
-						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-
-						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
-							pos += weight*camPos; // (camera.pose * camPos);
-							weights += weight;
-						}
-					}
-				}			
-			//}
-		}
-	}
-
-	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
-
-	return weights;
-}
-
-__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > weights) {
-						pos = weight* (camera.pose * camPos);
-						weights = weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	wpos += pos;
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-
-	//float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y);
-	const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y));
-	if (worldPos.z == MINF) return 0.0f;
-
-	float weight = spatialWeighting(length(mPos - worldPos));
-
-	if (weight > 0.0f) {
-		wpos += weight* (worldPos);
-		weights += weight;
-	}
-
-	return weights;
-}
-
-__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = output.width();
-	const int height = output.height();
-
-	const DepthCameraCUDA &mainCamera = c_cameras[cam];
-
-	if (x < width && y < height) {
-
-		const float depth = tex2D<float>(mainCamera.depth, x, y);
-		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
-		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
-		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
-
-		float3 wpos = make_float3(0.0f);
-		float3 wnorm = make_float3(0.0f);
-		float weights = 0.0f;
-
-		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
-			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
-
-			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
-					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
-					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
-
-				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
-					for (uint cam2=0; cam2<numcams; ++cam2) {
-						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
-						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
-
-						// Previous approach
-						//if (cam2 == cam) continue;
-						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
-					}
-					wpos /= weights;
-				} else {
-					weights = 1000.0f;
-					wpos = mPos;
-				} 
-
-				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-
-				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
-
-				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
-				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
-				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-				//}
-			}
-		}
-	}
-}
-
-void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
-	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-#define RESAMPLE_RADIUS 7
-
-__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = depthin.width();
-	const int height = depthin.height();
-
-	if (x < width && y < height) {
-
-		//const int depth = depthin.tex2D((int)x, (int)y);
-		//if (depth != 0x7FFFFFFF) {
-		//	depthout(x,y) = (float)depth / 1000.0f;
-		//	return;
-		//}
-
-		struct map_t {
-			int d;
-			int quad;
-		};
-
-		map_t mappings[5];
-		int mapidx = 0;
-
-		for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) {
-			for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) {
-
-				const int depth = depthin.tex2D((int)x+u, (int)y+v);
-				const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v);
-				
-				if (depth != 0x7FFFFFFF) {
-					int i=0;
-					for (i=0; i<mapidx; ++i) {
-						if (abs(mappings[i].d - depth) < 100) {
-							if (u < 0 && v < 0) mappings[i].quad |= 0x1;
-							if (u > 0 && v < 0) mappings[i].quad |= 0x2;
-							if (u > 0 && v > 0) mappings[i].quad |= 0x4;
-							if (u < 0 && v > 0) mappings[i].quad |= 0x8;
-							break;
-						}
-					}
-					if (i == mapidx && i < 5) {
-						mappings[mapidx].d = depth;
-						mappings[mapidx].quad = 0;
-						if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1;
-						if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2;
-						if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4;
-						if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8;
-						++mapidx;
-					} else {
-						//printf("EXCEEDED\n");
-					}
-				}
-			}
-		}
-
-		int bestdepth = 1000000;
-		//int count = 0;
-		for (int i=0; i<mapidx; ++i) {
-			if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15) count ++;
-		}
-
-		//depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f;
-
-		if (bestdepth < 1000000) {
-			depthout(x,y) = (float)bestdepth / 1000.0f;
-		}
-	}
-}
-
-void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams &params, cudaStream_t stream) {
-	const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-
 /// ===== Median Filter ======
 
 #define WINDOW_SIZE 3
diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp
index 552c2e59d44206c089fee68124ca814d3e752ad6..26bfcad73f5ae72f20cfe2dd29d0e91c62fca62b 100644
--- a/applications/reconstruct/src/depth_camera_cuda.hpp
+++ b/applications/reconstruct/src/depth_camera_cuda.hpp
@@ -10,8 +10,10 @@ namespace cuda {
 
 void clear_depth(const TextureObject<float> &depth, cudaStream_t stream);
 void clear_depth(const TextureObject<int> &depth, cudaStream_t stream);
+void clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream);
 void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream);
+void clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 
 void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream);
 
@@ -21,7 +23,7 @@ void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureO
 
 void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream);
 
-void mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+void mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const ftl::render::SplatParams &params, int numcams, cudaStream_t stream);
 
 void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams &params, cudaStream_t stream);
 
diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
index 9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f..acb86216a7eecd6e10f3ebf3a3b999964b56cf09 100644
--- a/applications/reconstruct/src/dibr.cu
+++ b/applications/reconstruct/src/dibr.cu
@@ -1,12 +1,20 @@
 #include "splat_render_cuda.hpp"
+#include "depth_camera_cuda.hpp"
 #include <cuda_runtime.h>
 
 #include <ftl/cuda_matrix_util.hpp>
 
 #include "splat_params.hpp"
+#include "mls_cuda.hpp"
 #include <ftl/depth_camera.hpp>
 
 #define T_PER_BLOCK 8
+#define UPSAMPLE_FACTOR 1.8f
+#define WARP_SIZE 32
+#define DEPTH_THRESHOLD 0.05f
+#define UPSAMPLE_MAX 60
+#define MAX_ITERATIONS 10
+#define SPATIAL_SMOOTHING 0.01f
 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
@@ -23,55 +31,343 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) {
 	}
 }
 
+__device__ inline bool isStable(const float3 &previous, const float3 &estimate, const SplatParams &params, float d) {
+    const float psize = 2.0f * d / params.camera.fx;
+    //printf("PSIZE %f\n", psize);
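+    // (psize is roughly twice the world-space footprint of one pixel at depth d, i.e. 2*d/fx,
+    //  so the estimate counts as stable once it moves by less than that along each axis.)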
+    return fabs(previous.x - estimate.x) <= psize &&
+        fabs(previous.y - estimate.y) <= psize &&
+        fabs(previous.z - estimate.z) <= psize;
+}
 
-__global__ void dibr_depthmap_kernel(
-    TextureObject<int> depth, int numcams, SplatParams params) {
+/*
+ * Pass 1: Determine a depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering, so they should
+ * at least be plane- or sphere-fitted, if not MLS-smoothed onto the actual surface.
+ */
+__global__ void dibr_visibility_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
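+    // (r approximates the world-space footprint of one source pixel: camera-space depth over focal length.)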
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((5.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+            
+    // TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-    const int y = by + threadIdx.y;
-    
-    for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+	// Each thread in the warp takes an upsample point and updates the corresponding depth buffer entry.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
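+		// (i walks an upsample x upsample window centred on screenPos; u and v are the pixel offsets.)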
+
+        // Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
+        float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use current points z as estimate
+		//float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,camPos.z);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+        float ld = nearest.z;
+
+		// TODO: (Nick) Estimate depth using the point's plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+			float3 output;
+
+            // Use MLS of camera neighbor points to get a more exact estimate.
+            // Iterate until the pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth is already close then this point has been processed and can be skipped.
+                if (ftl::cuda::mls_point_surface<1>(camera.points, make_int2(x,y), params.m_viewMatrixInverse * nearest, output, SPATIAL_SMOOTHING) <= 0.0f) {
+                    /*const unsigned int cx = screenPos.x;
+                    const unsigned int cy = screenPos.y;
+                    if (cx < depth.width() && cy < depth.height()) {
+                        atomicMax(&depth(cx,cy), 10000.0f);
+                    }*/
+                    break;
+                }
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+				output = params.m_viewMatrix * output;
+
+                // This is essentially the SDF function f(x), except the normal should also be estimated from the weights.
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+				const float d = nearest.z + copysignf(0.5f*length(output - nearest), output.z - nearest.z);
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);
+
+                const float2 sp = params.camera.cameraToKinectScreenFloat(output);
+
+                //if (isStable(nearest, output, params, d)) {
+                //if (fabs(sp.x - float(screenPos.x+u)) < 2.0f && fabs(sp.y - float(screenPos.y+v)) < 2.0f) {
+				if (length(output - nearest) <= 2.0f * params.camera.fx / camPos.z) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
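+                        // (atomicMin has no float overload, so depth is stored as integer millimetres.)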
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+                    }
+                    break;
+                }
+
+                /*if (k >= MAX_ITERATIONS-1 && length(output - nearest) <= SPATIAL_SMOOTHING) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+						//atomicMin(&depth(cx,cy), d * 1000.0f);
+						printf("ERR = %f, %f\n", fabs(sp.x - float(screenPos.x+u)), fabs(sp.y - float(screenPos.y+v)));
+                    }
+                }*/
+
+                //nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);  // ld + (d - ld)*0.8f
+                ld = d;
+			}
+		//}
+	}
+}
+
+// ------ Alternative for pass 1: principal surfaces ---------------------------
+
+#define NEIGHBOR_RADIUS 1
+#define MAX_NEIGHBORS ((NEIGHBOR_RADIUS*2+1)*(NEIGHBOR_RADIUS*2+1))
+
+/*
+ * Pass 1: Determine a depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering, so they should
+ * at least be plane- or sphere-fitted, if not MLS-smoothed onto the actual surface.
+ */
+ __global__ void dibr_visibility_principal_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	__shared__ float3 neighborhood_cache[2*T_PER_BLOCK][MAX_NEIGHBORS];
+	__shared__ int minimum[2*T_PER_BLOCK];
+	__shared__ int maximum[2*T_PER_BLOCK];
+
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+            
+	// TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
-        
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
+	// TODO:(Nick) Preload point neighbors and transform to eye
+	const int lane = threadIdx.x % WARP_SIZE;
+	if (lane == 0) {
+		minimum[warp] = 100000000;
+		maximum[warp] = -100000000;
+	}
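+	// (Depth bounds are kept as integer millimetres so warp threads can use atomicMin/atomicMax.)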
+
+	__syncwarp();
+
+	for (int i=lane; i<MAX_NEIGHBORS; i+=WARP_SIZE) {
+		const int u = (i % (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const int v = (i / (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const float3 point = params.m_viewMatrix * make_float3(tex2D<float4>(camera.points, x+u, y+v));
+		neighborhood_cache[warp][i] = point;
+
+		if (length(point - camPos) <= 0.04f) {
+			atomicMin(&minimum[warp], point.z*1000.0f);
+			atomicMax(&maximum[warp], point.z*1000.0f);
+		}
+	}
+
+	__syncwarp();
+	
+	const float interval = (float(maximum[warp])/1000.0f - float(minimum[warp]) / 1000.0f) / float(MAX_ITERATIONS);
+	//if (y == 200) printf("interval: %f\n", interval);
+
+	// TODO:(Nick) Find min and max depths of neighbors to estimate z bounds
+
+	// Each thread in the warp takes an upsample point and updates the corresponding depth buffer entry.
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
+        //float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use current points z as estimate
+		// TODO: Use min point as estimate
+		float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,float(minimum[warp])/1000.0f);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+		// TODO: (Nick) Estimate depth using the point's plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+
+            // Use MLS of camera neighbor points to get a more exact estimate.
+            // Iterate until the pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth is already close then this point has been processed and can be skipped.
+				const float energy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS>(neighborhood_cache[warp], nearest, SPATIAL_SMOOTHING);
+				
+				if (energy <= 0.0f) break;
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+                // This is essentially the SDF function f(x), except the normal should also be estimated from the weights.
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+				const float d = nearest.z;
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d+interval);
+				
+				if (energy >= 0.1f) {
+					const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+					if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+					}
+					break;
+				}
+			}
+		//}
+	}
+}
+
+__device__ inline float4 make_float4(const uchar4 &c) {
+    return make_float4(c.x,c.y,c.z,c.w);
+}
+
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+__global__ void dibr_attribute_contrib_kernel(
+        TextureObject<int> depth_in,
+        TextureObject<float4> colour_out,
+        TextureObject<float4> normal_out,
+        TextureObject<float> contrib_out, int cam,
+        SplatParams params) {
         
-        const float3 worldPos = make_float3(d);
-        const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
 
+    const int upsample = min(UPSAMPLE_MAX, int((10.0f*r) * params.camera.fx / camPos.z));
 
-        if (cx < depth.width() && cy < depth.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-           		//colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-            //}
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth_in.width() || screenPos.y - upsample >= depth_in.height()) return;
             
-            atomicMin(&depth(cx,cy), camPos.z * 1000.0f);
+    // Is this point near the actual surface and therefore a contributor?
+    const float d = ((float)depth_in.tex2D((int)screenPos.x, (int)screenPos.y)/1000.0f);
+    if (abs(d - camPos.z) > DEPTH_THRESHOLD) return;
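+    // (DEPTH_THRESHOLD is 5cm; points further than this from the merged depth surface are treated as occluded.)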
+
+    // TODO:(Nick) Should just one thread load these to shared mem?
+    const float4 colour = make_float4(tex2D<uchar4>(camera.colour, x, y));
+    const float4 normal = tex2D<float4>(camera.normal, x, y);
+
+	// Each thread in the warp takes an upsample point and accumulates its attribute contributions.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Use the depth buffer to determine this pixel's 3D position in camera space
+        const float d = ((float)depth_in.tex2D(screenPos.x+u, screenPos.y+v)/1000.0f);
+		const float3 nearest = params.camera.kinectDepthToSkeleton((int)(screenPos.x+u),(int)(screenPos.y+v),d);
+
+        // What is the contribution of our current point at this pixel?
+        const float weight = ftl::cuda::spatialWeighting(length(nearest - camPos), SPATIAL_SMOOTHING);
+        if (screenPos.x+u < colour_out.width() && screenPos.y+v < colour_out.height() && weight > 0.0f) {  // TODO: Use confidence threshold here
+            const float4 wcolour = colour * weight;
+            const float4 wnormal = normal * weight;
+
+            // Add this point's contribution to the pixel buffer
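+            // (CUDA lacks a float4 atomicAdd, so each channel is accumulated separately via pointer offsets.)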
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v), wcolour.x);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+1, wcolour.y);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+2, wcolour.z);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+3, wcolour.w);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v), wnormal.x);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+1, wnormal.y);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+2, wnormal.z);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+3, wnormal.w);
+            atomicAdd(&contrib_out(screenPos.x+u, screenPos.y+v), weight);
         }
-    }
+	}
 }
 
-
-__global__ void dibr_kernel(
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+/*__global__ void dibr_attribute_contrib_kernel(
     TextureObject<int> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
+	TextureObject<uchar4> colour_out,
+	TextureObject<float4> normal_out, int numcams, SplatParams params) {
 
     const int i = threadIdx.y*blockDim.y + threadIdx.x;
     const int bx = blockIdx.x*blockDim.x;
@@ -80,119 +376,89 @@ __global__ void dibr_kernel(
     const int y = by + threadIdx.y;
     
     for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+        const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[j];
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
+        float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+        float r = (camera.poseInverse * worldPos).z;
+        //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), worldPos, 0.02f) < 0.001f) continue;
+        if (worldPos.x == MINF) continue;
         
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
-        
-        const float3 worldPos = make_float3(d);
         const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
-
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+        // Estimate upsample factor using ratio of source depth and output depth
 
+		const int upsample = min(15, (int)(UPSAMPLE_FACTOR * (r / camPos.z))+1);
+		const float upfactor = 2.0f / (float)(upsample);
 
-        if (cx < colour_out.width() && cy < colour_out.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-
-            if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) {
-				   colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-				   //colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255);
+        for (int v=0; v<upsample; ++v) {
+            for (int u=0; u<upsample; ++u) {
+                float3 point;
+                const ftl::cuda::fragment nearest = ftl::cuda::upsampled_point(camera.points, camera.normal, camera.colour,
+                    make_float2((float)x-1.0f+u*upfactor,(float)y-1.0f+v*upfactor));
+                //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), nearest, point, 0.02f) < 0.001f) continue;
+                ftl::cuda::render_fragment(depth_in, normal_out, colour_out, params, nearest);
             }
-                   
-
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
         }
     }
-}
-
-__device__ inline float4 make_float4(const uchar4 &c) {
-    return make_float4(c.x,c.y,c.z,c.w);
-}
-
-__global__ void dibr_kernel_rev(
-    TextureObject<float> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
-
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-	
-	float camd = depth_in.tex2D((int)x,(int)y);
-	if (camd < 0.01f)	return;
-	if (camd >= params.camera.m_sensorDepthWorldMax) return;
-	
-	const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd);
-    float mindiff = 1000.0f;
-    float4 col = make_float4(0.0f,0.0f,0.0f,0.0f);
-    int count = 0;
+}*/
 
-    for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
-		
-		const float3 camPos = camera.poseInverse * worldPos;
-	    const float2 screenPosf = camera.params.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
-		
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-		const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
 
-        if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) {
-			float d = tex2D<float>(camera.depth, (int)cx, (int)cy);
-            float camdiff = fabs(camPos.z-d);
-            
-            if (camdiff < mindiff) {
-                mindiff = camdiff;
-                col += make_float4(tex2D<uchar4>(camera.colour,cx,cy));
-                ++count;
-            }
+__global__ void dibr_normalise_kernel(
+        TextureObject<float4> colour_in,
+        TextureObject<uchar4> colour_out,
+        TextureObject<float4> normals,
+        TextureObject<float> contribs) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-            //if (camdiff < 0.1f) {
-            //	colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy);
-            //} else {
-				//colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
-		}
-    }
+	if (x < colour_in.width() && y < colour_in.height()) {
+        const float4 colour = colour_in.tex2D((int)x,(int)y);
+        const float4 normal = normals.tex2D((int)x,(int)y);
+        const float contrib = contribs.tex2D((int)x,(int)y);
 
-    if (count > 0) {
-        col = col / (float)count;
-        colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255);
-    } else {
-        colour_out(x,y) = make_uchar4(76,76,76,255);
-    }
+        if (contrib > 0.0f) {
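+            // Divide the accumulated attributes by the summed weights to recover the weighted average.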
+            colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
+            normals(x,y) = normal / contrib;
+        }
+	}
 }
 
 void ftl::cuda::dibr(const TextureObject<int> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
-
+        const TextureObject<uchar4> &colour_out,
+        const TextureObject<float4> &normal_out,
+        const TextureObject<float> &confidence_out,
+        const TextureObject<float4> &tmp_colour,
+        int numcams,
+        const SplatParams &params,
+        cudaStream_t stream) {
+
+	const dim3 sgridSize((depth_out.width() + 2 - 1)/2, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 sblockSize(2*WARP_SIZE, T_PER_BLOCK);
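+	// (Blocks for the warp-based kernels are two warps wide; each whole warp cooperates on a
+	// single source pixel, matching the global-x / WARP_SIZE indexing in the kernels above.)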
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    ftl::cuda::clear_to_zero(confidence_out, stream);
+    ftl::cuda::clear_colour(tmp_colour, stream);
+    ftl::cuda::clear_colour(normal_out, stream);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
-    dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+    // Pass 1, merge a depth map from each camera.
+    for (int i=0; i<numcams; ++i)
+        dibr_visibility_principal_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, i, params);
+
+    // Pass 2, accumulate all point contributions to pixels
+    for (int i=0; i<numcams; ++i)
+        dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, tmp_colour, normal_out, confidence_out, i, params);
+
+    // Pass 3, normalise contributions
+    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(tmp_colour, colour_out, normal_out, confidence_out);
+
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
@@ -200,22 +466,21 @@ void ftl::cuda::dibr(const TextureObject<int> &depth_out,
 #endif
 }
 
-void ftl::cuda::dibr(const TextureObject<float> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
+void ftl::cuda::dibr_raw(const TextureObject<int> &depth_out,
+    	int numcams, const SplatParams &params, cudaStream_t stream) {
 
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+	//dibr_depthmap_direct_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 }
+
diff --git a/applications/reconstruct/src/mls.cu b/applications/reconstruct/src/mls.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71201a017cf1ebab6e06257e9425403df50427d3
--- /dev/null
+++ b/applications/reconstruct/src/mls.cu
@@ -0,0 +1,273 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_util.hpp>
+#include <ftl/depth_camera.hpp>
+#include "depth_camera_cuda.hpp"
+
+#include "mls_cuda.hpp"
+
+#define T_PER_BLOCK 16
+#define MINF __int_as_float(0xff800000)
+
+using ftl::voxhash::DepthCameraCUDA;
+using ftl::voxhash::HashData;
+using ftl::voxhash::HashParams;
+using ftl::cuda::TextureObject;
+using ftl::render::SplatParams;
+
+extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS];
+extern __constant__ HashParams c_hashParams;
+
+/// ===== MLS Smooth
+
+/*
+ * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
+ * Real-time foreground-background segmentation using codebook model.
+ * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
+ */
+ __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
+	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
+	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
+	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
+	const float p_2 = xv_2 / v_2;
+	return sqrt(x_2 - p_2);
+}
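+// (Geometrically, this is the distance from pb to the line through the origin along pa,
+// i.e. a brightness-invariant colour distortion measure.)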
+
+__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
+	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
+	return length(delta);
+}
+
+/*
+ * Colour weighting as suggested in:
+ * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
+ * c = colour distance
+ */
+ __device__ float colourWeighting(float c) {
+	const float h = c_hashParams.m_colourSmoothing;
+	if (c >= h) return 0.0f;
+	float ch = c / h;
+	ch = 1.0f - ch*ch;
+	return ch*ch*ch*ch;
+}
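+// (A quartic falloff, (1 - (c/h)^2)^4, reaching zero at c = h.)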
+
+#define WINDOW_RADIUS 5
+
+__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), c_hashParams.m_spatialSmoothing);
+
+				if (weight > 0.0f) {
+					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+					weight *= colourWeighting(colordiffFloat2(c1,c2));
+
+					if (weight > 0.0f) {
+						wpos += weight* (camera.pose * camPos);
+						weights += weight;
+					}
+				}			
+			//}
+		}
+	}
+
+	//wpos += (camera.pose * pos);
+
+	return weights;
+}
+
+__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+
+				// TODO:(Nick) A dot product of normals < 0 means the point is facing
+				// the wrong direction and should be ignored with a weight of 0.
+				// It may be better to weight by the dot product itself, giving
+				// stronger weight to points whose normals are closer.
+
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), h);
+
+				if (weight > 0.0f) {
+					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
+					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
+
+						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+
+						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
+							pos += weight*camPos; // (camera.pose * camPos);
+							weights += weight;
+						}
+					}
+				}			
+			//}
+		}
+	}
+
+	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
+
+	return weights;
+}
+
+
+__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	const DepthCameraCUDA &mainCamera = c_cameras[cam];
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
+				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				//}
+			}
+		}
+	}
+}
+
+void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
+
+
+// ===== Render Depth using MLS ================================================
+
+#define MAX_UPSAMPLE 5
+#define SAMPLE_BUFFER ((2*MAX_UPSAMPLE+1)*(2*MAX_UPSAMPLE+1))
+#define WARP_SIZE 32
+#define BLOCK_WIDTH 4
+#define MLS_RADIUS 5
+#define MLS_WIDTH (2*MLS_RADIUS+1)
+#define MLS_SAMPLES (MLS_WIDTH*MLS_WIDTH)
+
+__global__ void mls_render_depth_kernel(const TextureObject<int> input, TextureObject<int> output, SplatParams params, int numcams) {
+	/*const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				//if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				if (screenPos.x < output.width() && screenPos.y < output.height()) {
+					output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				}
+			}
+		}
+	}*/
+}
+
+
+void ftl::cuda::mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const SplatParams &params, int numcams, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_render_depth_kernel<<<gridSize, blockSize, 0, stream>>>(input, output, params, numcams);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
diff --git a/applications/reconstruct/src/mls_cuda.hpp b/applications/reconstruct/src/mls_cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..420b3beb0a5603abbc1a34e5cfeba5b99912408d
--- /dev/null
+++ b/applications/reconstruct/src/mls_cuda.hpp
@@ -0,0 +1,377 @@
+#ifndef _FTL_MLS_CUDA_HPP_
+#define _FTL_MLS_CUDA_HPP_
+
+#include <ftl/cuda_util.hpp>
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+#include "splat_params.hpp"
+
+__device__ inline float3 make_float3(const uchar4 &c) {
+	return make_float3((float)c.x,(float)c.y,(float)c.z);
+}
+
+__device__ inline uchar4 make_uchar4(const float3 &c) {
+	return make_uchar4(c.x,c.y,c.z,255);
+}
+
+namespace ftl {
+namespace cuda {
+
+/*
+ * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACM Transactions on Graphics Vol. 26, No. 3, Article No. 23, 2007.
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras
+ *     r = distance between points
+ *     h = smoothing parameter in meters (default 4cm)
+ */
+__device__ inline float spatialWeighting(float r, float h) {
+	if (r >= h) return 0.0f;
+	float rh = r / h;
+	rh = 1.0f - rh*rh;
+	return rh*rh*rh*rh;
+}
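+
+// Sanity check of the falloff: with the default h = 0.04 (4cm) and r = 0.02,
+// rh = 0.5, so the weight is (1 - 0.25)^4 ≈ 0.316; the weight is 1 at r = 0
+// and falls smoothly to 0 at r = h.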
+
+__device__ float colourWeighting(float c);
+
+struct fragment {
+	float3 point;
+	float3 normal;
+	uchar4 colour;
+};
+
+__device__ inline float3 upsampled_point(cudaTextureObject_t pointset, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+
+	// R is the centroid of surrounding points.
+	R /= (D1+D2+D3+D4);
+
+	// FIXME: Should not use the centroid but instead sample the surface at
+	// this point. Use a plane estimate at the point to get a "centroid" and
+	// then do the spatially weighted sample? Note also that 1 - |uv - corner|
+	// is negative for corners more than one pixel away (up to sqrt(2) for the
+	// far diagonal), so that sample receives a negative weight.
+	return R;
+}
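+
+// Hypothetical usage: sample the point set at a sub-pixel screen location
+// (uv need not be pixel aligned):
+//
+//   const float3 p = upsampled_point(points_tex, make_float2(x + 0.5f, y + 0.5f));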
+
+__device__ inline fragment upsampled_point(cudaTextureObject_t pointset,
+		cudaTextureObject_t normals, cudaTextureObject_t colours, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+	float3 C = make_float3(0.0f, 0.0f, 0.0f);
+
+	// TODO:(Nick) Don't upsample points if distance is too great
+
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+	N += D1 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y)));
+	C += D1 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y)));
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+	N += D2 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y+1.0f)));
+	C += D2 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y+1.0f)));
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+	N += D3 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y)));
+	C += D3 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y)));
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+	N += D4 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y+1.0f)));
+	C += D4 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y+1.0f)));
+
+	return {R / (D1+D2+D3+D4), N / (D1+D2+D3+D4), make_uchar4(C / (D1+D2+D3+D4))};
+}
+
+__device__ inline void render_depth(ftl::cuda::TextureObject<int> &depth, ftl::render::SplatParams &params, const float3 &worldPos) {
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth.width() && cy < depth.height()) {
+		// Depth is quantised to integer millimetres so that atomicMin keeps
+		// the nearest fragment; render_fragment() compares against the same
+		// quantisation.
+		atomicMin(&depth(cx,cy), (int)(d * 1000.0f));
+	}
+}
+
+__device__ inline void render_fragment(
+		ftl::cuda::TextureObject<int> &depth_in,
+		ftl::cuda::TextureObject<float4> &normal_out,
+		ftl::cuda::TextureObject<uchar4> &colour_out,
+		ftl::render::SplatParams &params, const fragment &frag) {
+	const float3 camPos = params.m_viewMatrix * frag.point;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth_in.width() && cy < depth_in.height()) {
+		if (depth_in(cx,cy) == (int)(d * 1000.0f)) {
+			colour_out(cx,cy) = frag.colour;
+			normal_out(cx,cy) = make_float4(frag.normal, 0.0f);
+		}
+	}
+}
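+
+// The two helpers above form a two-pass depth test: pass 1 calls render_depth()
+// for every fragment so that atomicMin keeps the nearest quantised depth per
+// pixel; pass 2 calls render_fragment(), which only writes colour and normal
+// where this fragment's depth still matches the buffer, i.e. where it won the
+// depth test. A minimal sketch (hypothetical kernels, illustrative only):
+//
+//   __global__ void depth_pass(ftl::cuda::TextureObject<int> depth,
+//           ftl::render::SplatParams params, fragment frag) {
+//       render_depth(depth, params, frag.point);
+//   }
+//   __global__ void attribute_pass(ftl::cuda::TextureObject<int> depth,
+//           ftl::cuda::TextureObject<float4> normals,
+//           ftl::cuda::TextureObject<uchar4> colours,
+//           ftl::render::SplatParams params, fragment frag) {
+//       render_fragment(depth, normals, colours, params, frag);
+//   }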
+
+/**
+ * Compute the camera-space weighted centroid of the point set around a given
+ * screen position, weighting each sample by its screen-space distance.
+ */
+template <int R>
+__device__ float3 screen_centroid(
+		cudaTextureObject_t pointset,
+		const float2 &suv,
+		const int2 &uv,
+		const ftl::render::SplatParams &params,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = params.m_viewMatrix * make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(suv - params.camera.cameraToKinectScreenFloat(samplePoint)), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) pos = pos / weights;
+	return pos;
+}
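+
+// Hypothetical usage: estimate the surface for output pixel (x, y), searching
+// a (2R+1)^2 window with the smoothing parameter given in pixels:
+//
+//   const float3 est = screen_centroid<2>(points_tex,
+//       make_float2((float)x, (float)y), make_int2(x, y), params, 2.0f);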
+
+/**
+ * Estimate a point set surface point from an existing point in the set.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
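+
+// Hypothetical usage: project the sample at pixel uv onto its local MLS
+// surface, using the returned weight sum as a confidence measure:
+//
+//   float3 surfacePoint;
+//   const float confidence = mls_point_surface<2>(points_tex, uv, surfacePoint, 0.04f);
+//   if (confidence >= hashParams.m_confidenceThresh) { /* accept surfacePoint */ }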
+
+/**
+ * Estimate the point set surface location near a given point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy: the sum of the spatial weights of the
+ * neighbouring samples, usable as a confidence measure.
+ */
+template <int R>
+__device__ float mls_point_energy(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				weights += weight;
+			//}
+		}
+	}
+
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy: the sum of the spatial weights of the
+ * neighbouring samples, usable as a confidence measure.
+ */
+template <int M>
+__device__ float mls_point_energy(
+		const float3 (&pointset)[M],
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int i=0; i<M; ++i) {
+		const float3 samplePoint = pointset[i];
+		const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+		weights += weight;
+	}
+
+	return weights;
+}
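+
+// Hypothetical usage: score a candidate position against a small fixed
+// neighbourhood gathered by the caller:
+//
+//   const float3 neighbours[4] = { p0, p1, p2, p3 };
+//   const float energy = mls_point_energy<4>(neighbours, candidate, 0.04f);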
+
+/**
+ * Estimate a point set surface location near an existing point in the set,
+ * and also return an estimate of the normal and colour at that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near a given point, and also return
+ * an estimate of the normal and colour at that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+}  // namespace cuda
+}  // namespace ftl
+
+#endif  // _FTL_MLS_CUDA_HPP_
diff --git a/applications/reconstruct/src/splat_params.hpp b/applications/reconstruct/src/splat_params.hpp
index e38e4447286bf6c70f6c753deed35154b6aafd8a..cb2d3febda3364a3407264711620a25f9dc116a1 100644
--- a/applications/reconstruct/src/splat_params.hpp
+++ b/applications/reconstruct/src/splat_params.hpp
@@ -9,6 +9,7 @@ namespace ftl {
 namespace render {
 
 static const uint kShowBlockBorders = 0x0001;
+static const uint kNoSplatting = 0x0002;
 
 struct __align__(16) SplatParams {
 	float4x4 m_viewMatrix;
@@ -16,6 +17,7 @@ struct __align__(16) SplatParams {
 
 	uint m_flags;
 	float voxelSize;
+	float depthThreshold;
 
 	DepthCameraParams camera;
 };
diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp
index 92d70a1c50d06e985fcb0faa1ef8f0bd5b961f86..1f7a18f66af2a0e6cd30fac3e4c139355028f6db 100644
--- a/applications/reconstruct/src/splat_render.cpp
+++ b/applications/reconstruct/src/splat_render.cpp
@@ -24,9 +24,18 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) {
 		depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
 	}
+	if ((unsigned int)depth3_.width() != camera.width || (unsigned int)depth3_.height() != camera.height) {
+		depth3_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
+	}
 	if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) {
 		colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height);
 	}
+	if ((unsigned int)colour_tmp_.width() != camera.width || (unsigned int)colour_tmp_.height() != camera.height) {
+		colour_tmp_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
+	if ((unsigned int)normal1_.width() != camera.width || (unsigned int)normal1_.height() != camera.height) {
+		normal1_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
 	if ((unsigned int)depth2_.width() != camera.width || (unsigned int)depth2_.height() != camera.height) {
 		depth2_ = ftl::cuda::TextureObject<float>(camera.width, camera.height);
 	}
@@ -56,30 +65,31 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if (scene_->value("voxels", false)) {
 		// TODO:(Nick) Stereo for voxel version
 		ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream);
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
-		ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		//ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
 		src->writeFrames(colour1_, depth2_, stream);
 	} else {
-		//ftl::cuda::clear_colour(colour1_, stream);
 		ftl::cuda::clear_depth(depth1_, stream);
+		ftl::cuda::clear_depth(depth3_, stream);
 		ftl::cuda::clear_depth(depth2_, stream);
-		ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream);
-		//ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream);
+		ftl::cuda::clear_colour(colour2_, stream);
+		ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 
-		// Splat the depth from first DIBR to expand the points
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		// Step 1: Put all points into virtual view to gather them
+		//ftl::cuda::dibr_raw(depth1_, scene_->cameraCount(), params, stream);
 
-		// Alternative to above...
-		//ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream);
-
-
-		// Use reverse sampling to obtain more colour details
-		// Should use nearest neighbor point depths to verify which camera to use
-		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		// Step 2: For each point, use a warp to do MLS and upsample
+		//ftl::cuda::mls_render_depth(depth1_, depth3_, params, scene_->cameraCount(), stream);
 
 		if (src->getChannel() == ftl::rgbd::kChanDepth) {
-			ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
-			src->writeFrames(colour1_, depth2_, stream);
+			//ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+			if (src->value("splatting",  false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		} else if (src->getChannel() == ftl::rgbd::kChanRight) {
 			// Adjust pose to right eye position
 			Eigen::Affine3f transform(Eigen::Translation3f(camera.baseline,0.0f,0.0f));
@@ -88,10 +98,16 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 			params.m_viewMatrixInverse = MatrixConversion::toCUDA(matrix);
 
 			ftl::cuda::clear_depth(depth1_, stream);
-			ftl::cuda::dibr(depth1_, colour2_, scene_->cameraCount(), params, stream);
+			ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 			src->writeFrames(colour1_, colour2_, stream);
 		} else {
-			src->writeFrames(colour1_, depth2_, stream);
+			if (src->value("splatting",  false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		}
 	}
 
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index 7737bb5d903d725a55b81d75812b99b52e7ace66..7511e859ff6471dcba18d5aee5518f55045eef9b 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -33,9 +33,12 @@ class Splatter {
 	private:
 	int device_;
 	ftl::cuda::TextureObject<int> depth1_;
+	ftl::cuda::TextureObject<int> depth3_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
+	ftl::cuda::TextureObject<float4> colour_tmp_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
+	ftl::cuda::TextureObject<float4> normal1_;
 	SplatParams params_;
 	ftl::voxhash::SceneRep *scene_;
 };
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index d8fd9cf55649280c20805c8d4f8b9f7b758b2223..f8cbdbb6bd87f7ea9e2275e000ad75de87e9dd11 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -10,6 +10,80 @@
 namespace ftl {
 namespace cuda {
 
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l, float &t) {
+    // n and l are assumed to be normalised; an intersection is only reported
+    // when the ray travels into the plane (dot(n, l) > 0) and t is non-negative.
+    float denom = dot(n, l);
+    if (denom > 1e-6f) {
+        t = dot(p0 - l0, n) / denom;
+        return (t >= 0);
+    }
+
+    return false;
+}
+
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l, float &t) {
+    // As above, but with the ray origin at the camera origin (l0 = 0).
+    float denom = dot(n, l);
+    if (denom > 1e-6f) {
+        t = dot(p0, n) / denom;
+        return (t >= 0);
+    }
+    return false;
+}
+
+__device__ inline bool intersectDisk(const float3 &n, const float3 &p0, float radius, const float3 &l0, const float3 &l) {
+    float t = 0;
+    if (intersectPlane(n, p0, l0, l, t)) {
+        float3 p = l0 + l * t;
+        float3 v = p - p0;
+        float d2 = dot(v, v);
+        return (sqrtf(d2) <= radius);
+        // Optimisation: precompute radius2 = radius * radius and return d2 <= radius2.
+    }
+    return false;
+}
+
+/**
+ * Get the distance from a disk centre at which a ray crosses the disk's plane.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l0 Origin of ray in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Distance from the centre of the disk to the intersection point, or PINF if there is none.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l) {
+    float t = 0;
+    if (intersectPlane(n, p0, l0, l, t)) {
+        const float3 p = l0 + l * t;
+        const float3 v = p - p0;
+        const float d2 = dot(v, v);
+        return sqrtf(d2);
+    }
+    return PINF;
+}
+
+/**
+ * Get the distance from a disk centre at which a ray crosses the disk's plane,
+ * with the ray origin at the camera origin.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Distance from the centre of the disk to the intersection point, or PINF if there is none.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l) {
+    float t = 0;
+    if (intersectPlane(n, p0, l, t)) {
+        const float3 p = l * t;
+        const float3 v = p - p0;
+        const float d2 = dot(v, v);
+        return sqrtf(d2);
+    }
+    return PINF;
+}
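+
+// Hypothetical use during splat rendering: treat each point as a small
+// camera-facing disk and weight a pixel's ray by its distance from the disk
+// centre, e.g.
+//
+//   const float r = intersectDistance(normal, centre, rayDir);
+//   const float w = ftl::cuda::spatialWeighting(r, params.voxelSize);
+//
+// (spatialWeighting lives in mls_cuda.hpp, which would need to be included.)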
+
 /**
  * NOTE: Not strictly isosurface currently since it includes the internals
  * of objects up to at most truncation depth.
@@ -26,11 +100,22 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 // TODO: isosurface_point_cloud
 
 void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
+		const ftl::cuda::TextureObject<uchar4> &colour_in,
+		const ftl::cuda::TextureObject<float4> &normal_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
-		const ftl::render::SplatParams &params, cudaStream_t stream);
+		const ftl::cuda::TextureObject<uchar4> &colour_out, const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<int> &depth_out,
-		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams,
+		const ftl::cuda::TextureObject<uchar4> &colour_out,
+		const ftl::cuda::TextureObject<float4> &normal_out,
+        const ftl::cuda::TextureObject<float> &confidence_out,
+        const ftl::cuda::TextureObject<float4> &tmp_colour, int numcams,
+		const ftl::render::SplatParams &params, cudaStream_t stream);
+
+/**
+ * Directly render input depth maps to virtual view with clipping.
+ */
+void dibr_raw(const ftl::cuda::TextureObject<int> &depth_out, int numcams,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<float> &depth_out,
diff --git a/applications/reconstruct/src/voxel_hash.cpp b/applications/reconstruct/src/voxel_hash.cpp
index 14cb75078652001275404e87c09f77f0b9d19385..6f929c746d66cae5c382bf15b2d25e26f6b46702 100644
--- a/applications/reconstruct/src/voxel_hash.cpp
+++ b/applications/reconstruct/src/voxel_hash.cpp
@@ -1,4 +1,5 @@
 #include <ftl/voxel_hash.hpp>
+#include <loguru.hpp>
 
 using ftl::voxhash::HashData;
 using ftl::voxhash::HashParams;
diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/voxel_render.cu
similarity index 65%
rename from applications/reconstruct/src/splat_render.cu
rename to applications/reconstruct/src/voxel_render.cu
index 3addf108dcf89ea8f3069e3ba0daea59e67ae918..46eb6f57c031b0541add1dcd982a9c6d1e690140 100644
--- a/applications/reconstruct/src/splat_render.cu
+++ b/applications/reconstruct/src/voxel_render.cu
@@ -258,143 +258,4 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #endif
 }
 
-// ---- Pass 2: Expand the point splats ----------------------------------------
-
-#define SPLAT_RADIUS 7
-#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
-#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 100
-
-__device__ float distance2(float3 a, float3 b) {
-	const float x = a.x-b.x;
-	const float y = a.y-b.y;
-	const float z = a.z-b.z;
-	return x*x+y*y+z*z;
-}
-
-__global__ void splatting_kernel(
-		TextureObject<int> depth_in,
-		TextureObject<float> depth_out, SplatParams params) {
-	// Read an NxN region and
-	// - interpolate a depth value for this pixel
-	// - interpolate an rgb value for this pixel
-	// Must respect depth discontinuities.
-	// How much influence a given neighbour has depends on its depth value
-
-	__shared__ float3 positions[SPLAT_BUFFER_SIZE];
-
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	// const float camMinDepth = params.camera.m_sensorDepthWorldMin;
-	// const float camMaxDepth = params.camera.m_sensorDepthWorldMax;
-
-	for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) {
-		const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-		const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-		if (sx >= depth_in.width() || sy >= depth_in.height()) {
-			positions[j] = make_float3(1000.0f,1000.0f, 1000.0f);
-		} else {
-			positions[j] = params.camera.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f);
-		}
-	}
 
-	__syncthreads();
-
-	if (x >= depth_in.width() || y >= depth_in.height()) return;
-
-	const float voxelSquared = params.voxelSize*params.voxelSize;
-	float mindepth = 1000.0f;
-	int minidx = -1;
-	// float3 minpos;
-
-	//float3 validPos[MAX_VALID];
-	int validIndices[MAX_VALID];
-	int validix = 0;
-
-	for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) {
-		for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) {
-			//const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u;
-			const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS;
-
-			float3 posp = positions[idx];
-			const float d = posp.z;
-			//if (d < camMinDepth || d > camMaxDepth) continue;
-
-			float3 pos = params.camera.kinectDepthToSkeleton(x, y, d);
-			float dist = distance2(pos, posp);
-
-			if (dist < voxelSquared) {
-				// Valid so check for minimum
-				//validPos[validix] = pos;
-				//validIndices[validix++] = idx;
-				if (d < mindepth) {
-					mindepth = d;
-					minidx = idx;
-					// minpos = pos;
-				}	
-			}
-		}
-	}
-
-	if (minidx == -1) {
-		depth_out(x,y) = 0.0f;
-		//colour_out(x,y) = make_uchar4(76,76,82,255);
-		return;
-	}
-
-	//float3 colour = make_float3(0.0f, 0.0f, 0.0f);
-	float depth = 0.0f;
-	float contrib = 0.0f;
-	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth);  // TODO:(Nick) Mindepth assumption is poor choice.
-
-	//for (int j=0; j<validix; ++j) {
-		const int idx = minidx; //validIndices[j];
-		float3 posp = positions[idx];
-		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
-		float3 delta = (posp - pos) / 2*params.voxelSize;
-		float dist = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-
-		// It contributes to pixel
-		if (dist < 1.0f && fabs(posp.z - mindepth) < 2*params.voxelSize) {
-			const unsigned int sx = (idx % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-			const unsigned int sy = (idx / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-
-			// Fast and simple trilinear interpolation
-			float c = fabs((1.0f - delta.x) * (1.0f - delta.y) * (1.0f - delta.z));
-			//uchar4 col = colour_in.tex2D((int)sx, (int)sy);
-			//colour.x += col.x*c;
-			//colour.y += col.y*c;
-			//colour.z += col.z*c;
-			contrib += c;
-			depth += posp.z * c;
-		}
-	//}
-
-	// Normalise
-	//colour.x /= contrib;
-	//colour.y /= contrib;
-	//colour.z /= contrib;
-	depth /= contrib;
-
-	depth_out(x,y) = depth;
-	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
-}
-
-void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
-		const TextureObject<float> &depth_out, const SplatParams &params, cudaStream_t stream) 
-{
-
-	const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
-	cudaSafeCall( cudaGetLastError() );
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
diff --git a/components/common/cpp/include/ftl/configurable.hpp b/components/common/cpp/include/ftl/configurable.hpp
index 5bae67b4a9972d5f69d0947871d5be259df36be3..94a080eaed480fa2e57b113e9c8c3d7bc04388bd 100644
--- a/components/common/cpp/include/ftl/configurable.hpp
+++ b/components/common/cpp/include/ftl/configurable.hpp
@@ -68,7 +68,9 @@ class Configurable {
 	template <typename T>
 	T value(const std::string &name, T def) {
 		auto r = get<T>(name);
-		return (r) ? *r : def;
+		if (r) return *r;
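+		// Write the default back into the config so later queries (and any
+		// saved configuration) see the same value.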
+		(*config_)[name] = def;
+		return def;
 	}
 
 	/**