diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt
index ae06aab6d142ba8a2ad3a1e0970905e3a70468e9..1e55c671c87487b995b0dd772495ff8a612a8c78 100644
--- a/applications/reconstruct/CMakeLists.txt
+++ b/applications/reconstruct/CMakeLists.txt
@@ -10,7 +10,7 @@ set(REPSRC
 	src/garbage.cu
 	src/integrators.cu
 	#src/ray_cast_sdf.cu
-	src/splat_render.cu
+	src/voxel_render.cu
 	src/camera_util.cu
 	src/voxel_hash.cu
 	src/voxel_hash.cpp
@@ -19,6 +19,7 @@ set(REPSRC
 	#src/virtual_source.cpp
 	src/splat_render.cpp
 	src/dibr.cu
+	src/mls.cu
 	src/depth_camera.cu
 	src/depth_camera.cpp
 )
diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp
index b7a3b44426bf6fcf490af67f38461ca849ade428..ae85cfbd785fb72f79225ae28278bf4862b680db 100644
--- a/applications/reconstruct/include/ftl/cuda_operators.hpp
+++ b/applications/reconstruct/include/ftl/cuda_operators.hpp
@@ -406,6 +406,12 @@ inline __host__ __device__ float length(float3 v)
     return sqrtf(dot(v, v));
 }
 
+// length squared
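+// (avoids the sqrtf in length(); sufficient when only comparing distances)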
+inline __host__ __device__ float length2(const float3 &v)
+{
+    return dot(v, v);
+}
+
 // normalize
 inline __host__ __device__ float3 normalize(float3 v)
 {
diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu
index 9a36ea452225c1a174d0ba6ced0baf98471688eb..7a322eaf733230772e72c50722e849335bf7e891 100644
--- a/applications/reconstruct/src/depth_camera.cu
+++ b/applications/reconstruct/src/depth_camera.cu
@@ -47,6 +47,21 @@ void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStre
 	clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
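+// Set every pixel of a float texture to zero, e.g. to reset an accumulation buffer.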
+__global__ void clear_to_zero_kernel(ftl::cuda::TextureObject<float> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = 0.0f; //PINF;
+	}
+}
+
+void ftl::cuda::clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_to_zero_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 __global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -79,6 +94,21 @@ void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cuda
 	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
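+// Overload that clears a float4 texture, e.g. the accumulated colour/normal targets.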
+__global__ void clear_colour_kernel(ftl::cuda::TextureObject<float4> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = make_float4(0.0f);
+	}
+}
+
+void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 // ===== Type convert =====
 
 template <typename A, typename B>
@@ -103,331 +133,6 @@ void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda:
 	convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale);
 }
 
-/// ===== MLS Smooth
-
-// TODO:(Nick) Put this in a common location (used in integrators.cu)
-extern __device__ float spatialWeighting(float r);
-extern __device__ float spatialWeighting(float r, float h);
-
-/*
- * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
- * Real-time foreground-background segmentation using codebook model.
- * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
- */
- __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
-	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
-	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
-	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
-	const float p_2 = xv_2 / v_2;
-	return sqrt(x_2 - p_2);
-}
-
-__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
-	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
-	return length(delta);
-}
-
-/*
- * Colour weighting as suggested in:
- * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
- * c = colour distance
- */
- __device__ float colourWeighting(float c) {
-	const float h = c_hashParams.m_colourSmoothing;
-	if (c >= h) return 0.0f;
-	float ch = c / h;
-	ch = 1.0f - ch*ch;
-	return ch*ch*ch*ch;
-}
-
-#define WINDOW_RADIUS 5
-
-__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > 0.0f) {
-						wpos += weight* (camera.pose * camPos);
-						weights += weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on creen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-
-				// TODO:(Nick) dot product of normals < 0 means the point
-				// should be ignored with a weight of 0 since it is facing the wrong direction
-				// May be good to simply weight using the dot product to give
-				// a stronger weight to those whose normals are closer
-
-				float weight = spatialWeighting(length(pf - camPos), h);
-
-				if (weight > 0.0f) {
-					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
-					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
-
-						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-
-						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
-							pos += weight*camPos; // (camera.pose * camPos);
-							weights += weight;
-						}
-					}
-				}			
-			//}
-		}
-	}
-
-	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
-
-	return weights;
-}
-
-__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > weights) {
-						pos = weight* (camera.pose * camPos);
-						weights = weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	wpos += pos;
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-
-	//float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y);
-	const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y));
-	if (worldPos.z == MINF) return 0.0f;
-
-	float weight = spatialWeighting(length(mPos - worldPos));
-
-	if (weight > 0.0f) {
-		wpos += weight* (worldPos);
-		weights += weight;
-	}
-
-	return weights;
-}
-
-__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = output.width();
-	const int height = output.height();
-
-	const DepthCameraCUDA &mainCamera = c_cameras[cam];
-
-	if (x < width && y < height) {
-
-		const float depth = tex2D<float>(mainCamera.depth, x, y);
-		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
-		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
-		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
-
-		float3 wpos = make_float3(0.0f);
-		float3 wnorm = make_float3(0.0f);
-		float weights = 0.0f;
-
-		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
-			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
-
-			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
-					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
-					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
-
-				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
-					for (uint cam2=0; cam2<numcams; ++cam2) {
-						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
-						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
-
-						// Previous approach
-						//if (cam2 == cam) continue;
-						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
-					}
-					wpos /= weights;
-				} else {
-					weights = 1000.0f;
-					wpos = mPos;
-				} 
-
-				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-
-				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
-
-				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
-				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
-				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-				//}
-			}
-		}
-	}
-}
-
-void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
-	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-#define RESAMPLE_RADIUS 7
-
-__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = depthin.width();
-	const int height = depthin.height();
-
-	if (x < width && y < height) {
-
-		//const int depth = depthin.tex2D((int)x, (int)y);
-		//if (depth != 0x7FFFFFFF) {
-		//	depthout(x,y) = (float)depth / 1000.0f;
-		//	return;
-		//}
-
-		struct map_t {
-			int d;
-			int quad;
-		};
-
-		map_t mappings[5];
-		int mapidx = 0;
-
-		for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) {
-			for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) {
-
-				const int depth = depthin.tex2D((int)x+u, (int)y+v);
-				const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v);
-				
-				if (depth != 0x7FFFFFFF) {
-					int i=0;
-					for (i=0; i<mapidx; ++i) {
-						if (abs(mappings[i].d - depth) < 100) {
-							if (u < 0 && v < 0) mappings[i].quad |= 0x1;
-							if (u > 0 && v < 0) mappings[i].quad |= 0x2;
-							if (u > 0 && v > 0) mappings[i].quad |= 0x4;
-							if (u < 0 && v > 0) mappings[i].quad |= 0x8;
-							break;
-						}
-					}
-					if (i == mapidx && i < 5) {
-						mappings[mapidx].d = depth;
-						mappings[mapidx].quad = 0;
-						if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1;
-						if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2;
-						if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4;
-						if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8;
-						++mapidx;
-					} else {
-						//printf("EXCEEDED\n");
-					}
-				}
-			}
-		}
-
-		int bestdepth = 1000000;
-		//int count = 0;
-		for (int i=0; i<mapidx; ++i) {
-			if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15) count ++;
-		}
-
-		//depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f;
-
-		if (bestdepth < 1000000) {
-			depthout(x,y) = (float)bestdepth / 1000.0f;
-		}
-	}
-}
-
-void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams &params, cudaStream_t stream) {
-	const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-
 /// ===== Median Filter ======
 
 #define WINDOW_SIZE 3
diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp
index 552c2e59d44206c089fee68124ca814d3e752ad6..26bfcad73f5ae72f20cfe2dd29d0e91c62fca62b 100644
--- a/applications/reconstruct/src/depth_camera_cuda.hpp
+++ b/applications/reconstruct/src/depth_camera_cuda.hpp
@@ -10,8 +10,10 @@ namespace cuda {
 
 void clear_depth(const TextureObject<float> &depth, cudaStream_t stream);
 void clear_depth(const TextureObject<int> &depth, cudaStream_t stream);
+void clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream);
 void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream);
+void clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 
 void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream);
 
@@ -21,7 +23,7 @@ void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureO
 
 void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream);
 
-void mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+void mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const ftl::render::SplatParams &params, int numcams, cudaStream_t stream);
 
 void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams &params, cudaStream_t stream);
 
diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
index 9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f..acb86216a7eecd6e10f3ebf3a3b999964b56cf09 100644
--- a/applications/reconstruct/src/dibr.cu
+++ b/applications/reconstruct/src/dibr.cu
@@ -1,12 +1,20 @@
 #include "splat_render_cuda.hpp"
+#include "depth_camera_cuda.hpp"
 #include <cuda_runtime.h>
 
 #include <ftl/cuda_matrix_util.hpp>
 
 #include "splat_params.hpp"
+#include "mls_cuda.hpp"
 #include <ftl/depth_camera.hpp>
 
 #define T_PER_BLOCK 8
+#define UPSAMPLE_FACTOR 1.8f
+#define WARP_SIZE 32
+#define DEPTH_THRESHOLD 0.05f
+#define UPSAMPLE_MAX 60
+#define MAX_ITERATIONS 10
+#define SPATIAL_SMOOTHING 0.01f
 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
@@ -23,55 +31,343 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) {
 	}
 }
 
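+// Two surface estimates are treated as the same point if they differ by less
+// than roughly the world-space footprint of a pixel at depth d.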
+__device__ inline bool isStable(const float3 &previous, const float3 &estimate, const SplatParams &params, float d) {
+    const float psize = 2.0f * d / params.camera.fx;
+    //printf("PSIZE %f\n", psize);
+    return fabs(previous.x - estimate.x) <= psize &&
+        fabs(previous.y - estimate.y) <= psize &&
+        fabs(previous.z - estimate.z) <= psize;
+}
 
-__global__ void dibr_depthmap_kernel(
-    TextureObject<int> depth, int numcams, SplatParams params) {
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering, so they
+ * should at least be plane- or sphere-fitted, if not MLS-smoothed onto the actual surface.
+ */
+__global__ void dibr_visibility_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
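+    // Approximate world-space size of one source pixel at this depth.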
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
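+	// Number of output pixels the splat covers, derived from the point size r
+	// projected into the virtual camera and clamped to UPSAMPLE_MAX.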
+	const int upsample = min(UPSAMPLE_MAX, int((5.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now... (cast to int to avoid unsigned wrap-around)
+	if ((int)screenPos.x + upsample < 0 || (int)screenPos.y + upsample < 0 ||
+            (int)screenPos.x - upsample >= depth.width() || (int)screenPos.y - upsample >= depth.height()) return;
+            
+    // TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-    const int y = by + threadIdx.y;
-    
-    for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+	// Each thread in the warp takes an upsample point and updates the corresponding depth buffer.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
+        float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use the current point's z as the estimate
+		//float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,camPos.z);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+        float ld = nearest.z;
+
+		// TODO: (Nick) Estimate depth using points plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+			float3 output;
+
+            // Use MLS of camera neighbor points to get more exact estimate
+            // Iterate until pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth is already close then this point has been handled and can be skipped
+                if (ftl::cuda::mls_point_surface<1>(camera.points, make_int2(x,y), params.m_viewMatrixInverse * nearest, output, SPATIAL_SMOOTHING) <= 0.0f) {
+                    /*const unsigned int cx = screenPos.x;
+                    const unsigned int cy = screenPos.y;
+                    if (cx < depth.width() && cy < depth.height()) {
+                        atomicMax(&depth(cx,cy), 10000.0f);
+                    }*/
+                    break;
+                }
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+				output = params.m_viewMatrix * output;
+
+                // This is essentially the SDF function f(x), except the normal should also be estimated from the weights
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
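+				// Step the depth estimate half way towards the MLS estimate, signed by
+				// whether the surface lies in front of or behind the current point.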
+				const float d = nearest.z + copysignf(0.5f*length(output - nearest), output.z - nearest.z);
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);
+
+                const float2 sp = params.camera.cameraToKinectScreenFloat(output);
+
+                //if (isStable(nearest, output, params, d)) {
+                //if (fabs(sp.x - float(screenPos.x+u)) < 2.0f && fabs(sp.y - float(screenPos.y+v)) < 2.0f) {
+				if (length(output - nearest) <= 2.0f * params.camera.fx / camPos.z) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+                    }
+                    break;
+                }
+
+                /*if (k >= MAX_ITERATIONS-1 && length(output - nearest) <= SPATIAL_SMOOTHING) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+						//atomicMin(&depth(cx,cy), d * 1000.0f);
+						printf("ERR = %f, %f\n", fabs(sp.x - float(screenPos.x+u)), fabs(sp.y - float(screenPos.y+v)));
+                    }
+                }*/
+
+                //nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);  // ld + (d - ld)*0.8f
+                ld = d;
+			}
+		//}
+	}
+}
+
+// ------ Alternative for pass 1: principal surfaces ---------------------------
+
+#define NEIGHBOR_RADIUS 1
+#define MAX_NEIGHBORS ((NEIGHBOR_RADIUS*2+1)*(NEIGHBOR_RADIUS*2+1))
+
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering, so they
+ * should at least be plane- or sphere-fitted, if not MLS-smoothed onto the actual surface.
+ */
+ __global__ void dibr_visibility_principal_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	__shared__ float3 neighborhood_cache[2*T_PER_BLOCK][MAX_NEIGHBORS];
+	__shared__ int minimum[2*T_PER_BLOCK];
+	__shared__ int maximum[2*T_PER_BLOCK];
+
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now... (cast to int to avoid unsigned wrap-around)
+	if ((int)screenPos.x + upsample < 0 || (int)screenPos.y + upsample < 0 ||
+            (int)screenPos.x - upsample >= depth.width() || (int)screenPos.y - upsample >= depth.height()) return;
+            
+	// TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
-        
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
+	// TODO:(Nick) Preload point neighbors and transform to eye
+	const int lane = threadIdx.x % WARP_SIZE;
+	if (lane == 0) {
+		minimum[warp] = 100000000;
+		maximum[warp] = -100000000;
+	}
+
+	__syncwarp();
+
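+	// Each lane caches one neighbour point in eye space; the warp then finds
+	// integer-millimetre depth bounds for nearby points via shared-memory atomics.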
+	for (int i=lane; i<MAX_NEIGHBORS; i+=WARP_SIZE) {
+		const int u = (i % (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const int v = (i / (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const float3 point = params.m_viewMatrix * make_float3(tex2D<float4>(camera.points, x+u, y+v));
+		neighborhood_cache[warp][i] = point;
+
+		if (length(point - camPos) <= 0.04f) {
+			atomicMin(&minimum[warp], point.z*1000.0f);
+			atomicMax(&maximum[warp], point.z*1000.0f);
+		}
+	}
+
+	__syncwarp();
+	
+	const float interval = (float(maximum[warp])/1000.0f - float(minimum[warp]) / 1000.0f) / float(MAX_ITERATIONS);
+	//if (y == 200) printf("interval: %f\n", interval);
+
+	// TODO:(Nick) Find min and max depths of neighbors to estimate z bounds
+
+	// Each thread in the warp takes an upsample point and updates the corresponding depth buffer.
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
+        //float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use the current point's z as the estimate
+		// TODO: Use min point as estimate
+		float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,float(minimum[warp])/1000.0f);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+		// TODO: (Nick) Estimate depth using points plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+
+            // Use MLS of camera neighbor points to get more exact estimate
+            // Iterate until pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth is already close then this point has been handled and can be skipped
+				const float energy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS>(neighborhood_cache[warp], nearest, SPATIAL_SMOOTHING);
+				
+				if (energy <= 0.0f) break;
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+                // This is essentially the SDF function f(x), except the normal should also be estimated from the weights
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+				const float d = nearest.z;
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d+interval);
+				
+				if (energy >= 0.1f) {
+					const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+					if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+					}
+					break;
+				}
+			}
+		//}
+	}
+}
+
+__device__ inline float4 make_float4(const uchar4 &c) {
+    return make_float4(c.x,c.y,c.z,c.w);
+}
+
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+__global__ void dibr_attribute_contrib_kernel(
+        TextureObject<int> depth_in,
+        TextureObject<float4> colour_out,
+        TextureObject<float4> normal_out,
+        TextureObject<float> contrib_out, int cam,
+        SplatParams params) {
         
-        const float3 worldPos = make_float3(d);
-        const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
 
+    const int upsample = min(UPSAMPLE_MAX, int((10.0f*r) * params.camera.fx / camPos.z));
 
-        if (cx < depth.width() && cy < depth.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-           		//colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-            //}
+	// Not on screen so stop now... (cast to int to avoid unsigned wrap-around)
+	if ((int)screenPos.x + upsample < 0 || (int)screenPos.y + upsample < 0 ||
+            (int)screenPos.x - upsample >= depth_in.width() || (int)screenPos.y - upsample >= depth_in.height()) return;
             
-            atomicMin(&depth(cx,cy), camPos.z * 1000.0f);
+    // Is this point near the actual surface and therefore a contributor?
+    const float d = ((float)depth_in.tex2D((int)screenPos.x, (int)screenPos.y)/1000.0f);
+    if (abs(d - camPos.z) > DEPTH_THRESHOLD) return;
+
+    // TODO:(Nick) Should just one thread load these to shared mem?
+    const float4 colour = make_float4(tex2D<uchar4>(camera.colour, x, y));
+    const float4 normal = tex2D<float4>(camera.normal, x, y);
+
+	// Each thread in the warp takes an upsample point and accumulates its attribute contributions.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Use the depth buffer to determine this pixel's 3D position in camera space
+        const float d = ((float)depth_in.tex2D(screenPos.x+u, screenPos.y+v)/1000.0f);
+		const float3 nearest = params.camera.kinectDepthToSkeleton((int)(screenPos.x+u),(int)(screenPos.y+v),d);
+
+        // What is contribution of our current point at this pixel?
+        const float weight = ftl::cuda::spatialWeighting(length(nearest - camPos), SPATIAL_SMOOTHING);
+        if (screenPos.x+u < colour_out.width() && screenPos.y+v < colour_out.height() && weight > 0.0f) {  // TODO: Use confidence threshold here
+            const float4 wcolour = colour * weight;
+            const float4 wnormal = normal * weight;
+
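+            // CUDA has no float4 atomicAdd, so each channel is accumulated separately.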
+            // Add this point's contribution to the pixel buffer
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v), wcolour.x);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+1, wcolour.y);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+2, wcolour.z);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+3, wcolour.w);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v), wnormal.x);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+1, wnormal.y);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+2, wnormal.z);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+3, wnormal.w);
+            atomicAdd(&contrib_out(screenPos.x+u, screenPos.y+v), weight);
         }
-    }
+	}
 }
 
-
-__global__ void dibr_kernel(
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+/*__global__ void dibr_attribute_contrib_kernel(
     TextureObject<int> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
+	TextureObject<uchar4> colour_out,
+	TextureObject<float4> normal_out, int numcams, SplatParams params) {
 
     const int i = threadIdx.y*blockDim.y + threadIdx.x;
     const int bx = blockIdx.x*blockDim.x;
@@ -80,119 +376,89 @@ __global__ void dibr_kernel(
     const int y = by + threadIdx.y;
     
     for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+        const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[j];
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
+        float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+        float r = (camera.poseInverse * worldPos).z;
+        //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), worldPos, 0.02f) < 0.001f) continue;
+        if (worldPos.x == MINF) continue;
         
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
-        
-        const float3 worldPos = make_float3(d);
         const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
-
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+        // Estimate upsample factor using ratio of source depth and output depth
 
+		const int upsample = min(15, (int)(UPSAMPLE_FACTOR * (r / camPos.z))+1);
+		const float upfactor = 2.0f / (float)(upsample);
 
-        if (cx < colour_out.width() && cy < colour_out.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-
-            if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) {
-				   colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-				   //colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255);
+        for (int v=0; v<upsample; ++v) {
+            for (int u=0; u<upsample; ++u) {
+                float3 point;
+                const ftl::cuda::fragment nearest = ftl::cuda::upsampled_point(camera.points, camera.normal, camera.colour,
+                    make_float2((float)x-1.0f+u*upfactor,(float)y-1.0f+v*upfactor));
+                //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), nearest, point, 0.02f) < 0.001f) continue;
+                ftl::cuda::render_fragment(depth_in, normal_out, colour_out, params, nearest);
             }
-                   
-
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
         }
     }
-}
-
-__device__ inline float4 make_float4(const uchar4 &c) {
-    return make_float4(c.x,c.y,c.z,c.w);
-}
-
-__global__ void dibr_kernel_rev(
-    TextureObject<float> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
-
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-	
-	float camd = depth_in.tex2D((int)x,(int)y);
-	if (camd < 0.01f)	return;
-	if (camd >= params.camera.m_sensorDepthWorldMax) return;
-	
-	const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd);
-    float mindiff = 1000.0f;
-    float4 col = make_float4(0.0f,0.0f,0.0f,0.0f);
-    int count = 0;
+}*/
 
-    for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
-		
-		const float3 camPos = camera.poseInverse * worldPos;
-	    const float2 screenPosf = camera.params.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
-		
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-		const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
 
-        if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) {
-			float d = tex2D<float>(camera.depth, (int)cx, (int)cy);
-            float camdiff = fabs(camPos.z-d);
-            
-            if (camdiff < mindiff) {
-                mindiff = camdiff;
-                col += make_float4(tex2D<uchar4>(camera.colour,cx,cy));
-                ++count;
-            }
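+/*
+ * Pass 3: Normalise the accumulated colour and normal buffers by the total
+ * contribution weight at each pixel.
+ */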
+__global__ void dibr_normalise_kernel(
+        TextureObject<float4> colour_in,
+        TextureObject<uchar4> colour_out,
+        TextureObject<float4> normals,
+        TextureObject<float> contribs) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-            //if (camdiff < 0.1f) {
-            //	colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy);
-            //} else {
-				//colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
-		}
-    }
+	if (x < colour_in.width() && y < colour_in.height()) {
+        const float4 colour = colour_in.tex2D((int)x,(int)y);
+        const float4 normal = normals.tex2D((int)x,(int)y);
+        const float contrib = contribs.tex2D((int)x,(int)y);
 
-    if (count > 0) {
-        col = col / (float)count;
-        colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255);
-    } else {
-        colour_out(x,y) = make_uchar4(76,76,76,255);
-    }
+        if (contrib > 0.0f) {
+            colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
+            normals(x,y) = normal / contrib;
+        }
+	}
 }
 
 void ftl::cuda::dibr(const TextureObject<int> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
-
+        const TextureObject<uchar4> &colour_out,
+        const TextureObject<float4> &normal_out,
+        const TextureObject<float> &confidence_out,
+        const TextureObject<float4> &tmp_colour,
+        int numcams,
+        const SplatParams &params,
+        cudaStream_t stream) {
+
+	const dim3 sgridSize((depth_out.width() + 2 - 1)/2, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 sblockSize(2*WARP_SIZE, T_PER_BLOCK);
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    ftl::cuda::clear_to_zero(confidence_out, stream);
+    ftl::cuda::clear_colour(tmp_colour, stream);
+    ftl::cuda::clear_colour(normal_out, stream);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
-    dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+    // Pass 1, merge a depth map from each camera.
+	for (int i=0; i<numcams; ++i)
+        dibr_visibility_principal_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, i, params);
+
+    // Pass 2, accumulate all point contributions to pixels
+    for (int i=0; i<numcams; ++i)
+        dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, tmp_colour, normal_out, confidence_out, i, params);
+
+    // Pass 3, normalise contributions
+    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(tmp_colour, colour_out, normal_out, confidence_out);
+
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
@@ -200,22 +466,21 @@ void ftl::cuda::dibr(const TextureObject<int> &depth_out,
 #endif
 }
 
-void ftl::cuda::dibr(const TextureObject<float> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
+void ftl::cuda::dibr_raw(const TextureObject<int> &depth_out,
+    	int numcams, const SplatParams &params, cudaStream_t stream) {
 
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+	//dibr_depthmap_direct_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 }
+
diff --git a/applications/reconstruct/src/mls.cu b/applications/reconstruct/src/mls.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71201a017cf1ebab6e06257e9425403df50427d3
--- /dev/null
+++ b/applications/reconstruct/src/mls.cu
@@ -0,0 +1,273 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_util.hpp>
+#include <ftl/depth_camera.hpp>
+#include "depth_camera_cuda.hpp"
+
+#include "mls_cuda.hpp"
+
+#define T_PER_BLOCK 16
+#define MINF __int_as_float(0xff800000)
+
+using ftl::voxhash::DepthCameraCUDA;
+using ftl::voxhash::HashData;
+using ftl::voxhash::HashParams;
+using ftl::cuda::TextureObject;
+using ftl::render::SplatParams;
+
+extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS];
+extern __constant__ HashParams c_hashParams;
+
+/// ===== MLS Smooth
+
+/*
+ * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
+ * Real-time foreground-background segmentation using codebook model.
+ * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
+ */
+ __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
+	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
+	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
+	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
+	const float p_2 = xv_2 / v_2;
+	return sqrt(x_2 - p_2);
+}
+
+__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
+	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
+	return length(delta);
+}
+
+/*
+ * Colour weighting as suggested in:
+ * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
+ * c = colour distance
+ */
+ __device__ float colourWeighting(float c) {
+	const float h = c_hashParams.m_colourSmoothing;
+	if (c >= h) return 0.0f;
+	float ch = c / h;
+	ch = 1.0f - ch*ch;
+	return ch*ch*ch*ch;
+}
+
+#define WINDOW_RADIUS 5
+
+__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), c_hashParams.m_spatialSmoothing);
+
+				if (weight > 0.0f) {
+					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+					weight *= colourWeighting(colordiffFloat2(c1,c2));
+
+					if (weight > 0.0f) {
+						wpos += weight* (camera.pose * camPos);
+						weights += weight;
+					}
+				}			
+			//}
+		}
+	}
+
+	//wpos += (camera.pose * pos);
+
+	return weights;
+}
+
+__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+
+				// TODO:(Nick) dot product of normals < 0 means the point
+				// should be ignored with a weight of 0 since it is facing the wrong direction
+				// May be good to simply weight using the dot product to give
+				// a stronger weight to those whose normals are closer
+
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), h);
+
+				if (weight > 0.0f) {
+					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
+					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
+
+						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+
+						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
+							pos += weight*camPos; // (camera.pose * camPos);
+							weights += weight;
+						}
+					}
+				}			
+			//}
+		}
+	}
+
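+	// Convert the weighted camera-space centroid to world space, then rescale by
+	// the total weight so callers can keep accumulating across cameras.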
+	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
+
+	return weights;
+}
+
+
+__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	const DepthCameraCUDA &mainCamera = c_cameras[cam];
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
+				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				//}
+			}
+		}
+	}
+}
+
+void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
+
+
+// ===== Render Depth using MLS ================================================
+
+#define MAX_UPSAMPLE 5
+#define SAMPLE_BUFFER ((2*MAX_UPSAMPLE+1)*(2*MAX_UPSAMPLE+1))
+#define WARP_SIZE 32
+#define BLOCK_WIDTH 4
+#define MLS_RADIUS 5
+#define MLS_WIDTH (2*MLS_RADIUS+1)
+#define MLS_SAMPLES (MLS_WIDTH*MLS_WIDTH)
+
+__global__ void mls_render_depth_kernel(const TextureObject<int> input, TextureObject<int> output, SplatParams params, int numcams) {
+	/*const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				//if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				if (screenPos.x < output.width() && screenPos.y < output.height()) {
+					output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				}
+			}
+		}
+	}*/
+}
+
+
+void ftl::cuda::mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const SplatParams &params, int numcams, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_render_depth_kernel<<<gridSize, blockSize, 0, stream>>>(input, output, params, numcams);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
diff --git a/applications/reconstruct/src/mls_cuda.hpp b/applications/reconstruct/src/mls_cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..420b3beb0a5603abbc1a34e5cfeba5b99912408d
--- /dev/null
+++ b/applications/reconstruct/src/mls_cuda.hpp
@@ -0,0 +1,377 @@
+#ifndef _FTL_MLS_CUDA_HPP_
+#define _FTL_MLS_CUDA_HPP_
+
+#include <ftl/cuda_util.hpp>
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+#include "splat_params.hpp"
+
+__device__ inline float3 make_float3(const uchar4 &c) {
+	return make_float3((float)c.x,(float)c.y,(float)c.z);
+}
+
+__device__ inline uchar4 make_uchar4(const float3 &c) {
+	return make_uchar4(c.x,c.y,c.z,255);
+}
+
+namespace ftl {
+namespace cuda {
+
+/*
+ * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACMTransactions on Graphics Vol. 26, No. 3, Article No. 23, 2007.
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras
+ *     r = distance between points
+ *     h = smoothing parameter in meters (default 4cm)
+ */
+__device__ inline float spatialWeighting(float r, float h) {
+	if (r >= h) return 0.0f;
+	float rh = r / h;
+	rh = 1.0f - rh*rh;
+	return rh*rh*rh*rh;
+}
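+// Example: with h = 0.04f (4cm) and r = 0.02f, rh = 1 - 0.25 = 0.75, giving a
+// weight of 0.75^4 ≈ 0.316; the weight falls smoothly to zero as r approaches h.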
+
+__device__ float colourWeighting(float c);
+
+struct fragment {
+	float3 point;
+	float3 normal;
+	uchar4 colour;
+};
+
+__device__ inline float3 upsampled_point(cudaTextureObject_t pointset, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+
+	// R is the centroid of surrounding points.
+	R /= (D1+D2+D3+D4);
+
+	// FIXME: Should not use centroid but instead sample the surface at this point
+	// Use plane estimate at point to get "centroid" and then do the spatial weighted sample?
+	return R;
+}
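+// Note: D is 1 minus the Euclidean distance to each integer corner, so the
+// corner diagonally opposite the sample can receive a slightly negative
+// weight (corner distances range up to sqrt(2)); e.g. with fractional offsets
+// of (0.9, 0.9) the floor corner gets 1 - sqrt(1.62) ≈ -0.27.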
+
+__device__ inline fragment upsampled_point(cudaTextureObject_t pointset,
+		cudaTextureObject_t normals, cudaTextureObject_t colours, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+	float3 C = make_float3(0.0f, 0.0f, 0.0f);
+
+	// TODO:(Nick) Don't upsample points if distance is too great
+
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+	N += D1 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y)));
+	C += D1 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y)));
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+	N += D2 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y+1.0f)));
+	C += D2 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y+1.0f)));
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+	N += D3 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y)));
+	C += D3 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y)));
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+	N += D4 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y+1.0f)));
+	C += D4 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y+1.0f)));
+
+	return {R / (D1+D2+D3+D4), N / (D1+D2+D3+D4), make_uchar4(C / (D1+D2+D3+D4))};
+}
+
+__device__ inline void render_depth(ftl::cuda::TextureObject<int> &depth, ftl::render::SplatParams &params, const float3 &worldPos) {
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth.width() && cy < depth.height()) {
+		atomicMin(&depth(cx,cy), (int)(d * 1000.0f));
+	}
+}
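+// Note: depth is packed as integer millimetres so atomicMin() can resolve the
+// nearest point per pixel (CUDA provides no native float atomicMin).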
+
+__device__ inline void render_fragment(
+		ftl::cuda::TextureObject<int> &depth_in,
+		ftl::cuda::TextureObject<float4> &normal_out,
+		ftl::cuda::TextureObject<uchar4> &colour_out,
+		ftl::render::SplatParams &params, const fragment &frag) {
+	const float3 camPos = params.m_viewMatrix * frag.point;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth_in.width() && cy < depth_in.height()) {
+		if (depth_in(cx,cy) == (int)(d * 1000.0f)) {
+			colour_out(cx,cy) = frag.colour;
+			normal_out(cx,cy) = make_float4(frag.normal, 0.0f);
+		}
+	}
+}
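+// render_depth() and render_fragment() form a two-pass scheme: the first pass
+// resolves visibility via atomicMin on integer depth, the second writes colour
+// and normal only for the fragment that won the depth test.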
+
+/**
+ * Compute a weighted screen-space centroid of the point set around a given
+ * pixel, as an estimate of where the surface projects.
+ */
+template <int R>
+__device__ float3 screen_centroid(
+		cudaTextureObject_t pointset,
+		const float2 &suv,
+		const int2 &uv,
+		const ftl::render::SplatParams &params,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = params.m_viewMatrix * make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(suv - params.camera.cameraToKinectScreenFloat(samplePoint)), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) pos = pos / weights;
+	return pos;
+}
+
+/**
+ * Estimate a point set surface point from an existing point in the set.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
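+// Illustrative use (a sketch only; the texture handle, pixel coordinates and
+// 4cm smoothing radius are placeholders, not values defined in this file):
+//
+//     float3 surfacePoint;
+//     const float confidence =
+//         ftl::cuda::mls_point_surface<2>(points, make_int2(x, y), surfacePoint, 0.04f);
+//     if (confidence > 0.0f) { /* surfacePoint is the MLS surface estimate */ }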
+
+/**
+ * Estimate the point set surface location near a given point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int R>
+__device__ float mls_point_energy(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				weights += weight;
+			//}
+		}
+	}
+
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int M>
+__device__ float mls_point_energy(
+		const float3 (&pointset)[M],
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int i=0; i<M; ++i) {
+		const float3 samplePoint = pointset[i];
+		const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+		weights += weight;
+	}
+
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near an existing point in the set,
+ * and also return an estimate of the normal and colour at that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near a given point, and also return
+ * an estimate of the normal and colour at that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+}  // namespace cuda
+}  // namespace ftl
+
+#endif  // _FTL_MLS_CUDA_HPP_
diff --git a/applications/reconstruct/src/splat_params.hpp b/applications/reconstruct/src/splat_params.hpp
index e38e4447286bf6c70f6c753deed35154b6aafd8a..cb2d3febda3364a3407264711620a25f9dc116a1 100644
--- a/applications/reconstruct/src/splat_params.hpp
+++ b/applications/reconstruct/src/splat_params.hpp
@@ -9,6 +9,7 @@ namespace ftl {
 namespace render {
 
 static const uint kShowBlockBorders = 0x0001;
+static const uint kNoSplatting = 0x0002;
 
 struct __align__(16) SplatParams {
 	float4x4 m_viewMatrix;
@@ -16,6 +17,7 @@ struct __align__(16) SplatParams {
 
 	uint m_flags;
 	float voxelSize;
+	float depthThreshold;
 
 	DepthCameraParams camera;
 };
diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp
index 92d70a1c50d06e985fcb0faa1ef8f0bd5b961f86..1f7a18f66af2a0e6cd30fac3e4c139355028f6db 100644
--- a/applications/reconstruct/src/splat_render.cpp
+++ b/applications/reconstruct/src/splat_render.cpp
@@ -24,9 +24,18 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) {
 		depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
 	}
+	if ((unsigned int)depth3_.width() != camera.width || (unsigned int)depth3_.height() != camera.height) {
+		depth3_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
+	}
 	if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) {
 		colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height);
 	}
+	if ((unsigned int)colour_tmp_.width() != camera.width || (unsigned int)colour_tmp_.height() != camera.height) {
+		colour_tmp_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
+	if ((unsigned int)normal1_.width() != camera.width || (unsigned int)normal1_.height() != camera.height) {
+		normal1_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
 	if ((unsigned int)depth2_.width() != camera.width || (unsigned int)depth2_.height() != camera.height) {
 		depth2_ = ftl::cuda::TextureObject<float>(camera.width, camera.height);
 	}
@@ -56,30 +65,31 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if (scene_->value("voxels", false)) {
 		// TODO:(Nick) Stereo for voxel version
 		ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream);
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
-		ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		//ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
 		src->writeFrames(colour1_, depth2_, stream);
 	} else {
-		//ftl::cuda::clear_colour(colour1_, stream);
 		ftl::cuda::clear_depth(depth1_, stream);
+		ftl::cuda::clear_depth(depth3_, stream);
 		ftl::cuda::clear_depth(depth2_, stream);
-		ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream);
-		//ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream);
+		ftl::cuda::clear_colour(colour2_, stream);
+		ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 
-		// Splat the depth from first DIBR to expand the points
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		// Step 1: Put all points into virtual view to gather them
+		//ftl::cuda::dibr_raw(depth1_, scene_->cameraCount(), params, stream);
 
-		// Alternative to above...
-		//ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream);
-
-
-		// Use reverse sampling to obtain more colour details
-		// Should use nearest neighbor point depths to verify which camera to use
-		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		// Step 2: For each point, use a warp to do MLS and up sample
+		//ftl::cuda::mls_render_depth(depth1_, depth3_, params, scene_->cameraCount(), stream);
 
 		if (src->getChannel() == ftl::rgbd::kChanDepth) {
-			ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
-			src->writeFrames(colour1_, depth2_, stream);
+			//ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+			if (src->value("splatting", false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		} else if (src->getChannel() == ftl::rgbd::kChanRight) {
 			// Adjust pose to right eye position
 			Eigen::Affine3f transform(Eigen::Translation3f(camera.baseline,0.0f,0.0f));
@@ -88,10 +98,16 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 			params.m_viewMatrixInverse = MatrixConversion::toCUDA(matrix);
 
 			ftl::cuda::clear_depth(depth1_, stream);
-			ftl::cuda::dibr(depth1_, colour2_, scene_->cameraCount(), params, stream);
+			ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 			src->writeFrames(colour1_, colour2_, stream);
 		} else {
-			src->writeFrames(colour1_, depth2_, stream);
+			if (src->value("splatting", false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		}
 	}
 
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index 7737bb5d903d725a55b81d75812b99b52e7ace66..7511e859ff6471dcba18d5aee5518f55045eef9b 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -33,9 +33,12 @@ class Splatter {
 	private:
 	int device_;
 	ftl::cuda::TextureObject<int> depth1_;
+	ftl::cuda::TextureObject<int> depth3_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
+	ftl::cuda::TextureObject<float4> colour_tmp_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
+	ftl::cuda::TextureObject<float4> normal1_;
 	SplatParams params_;
 	ftl::voxhash::SceneRep *scene_;
 };
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index d8fd9cf55649280c20805c8d4f8b9f7b758b2223..f8cbdbb6bd87f7ea9e2275e000ad75de87e9dd11 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -10,6 +10,80 @@
 namespace ftl {
 namespace cuda {
 
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l, float &t) { 
+    // assuming vectors are all normalized
+    float denom = dot(n, l); 
+    if (denom > 1e-6) {  
+        t = dot(p0 - l0, n) / denom; 
+        return (t >= 0); 
+    } 
+ 
+    return false; 
+}
+
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l, float &t) { 
+    // assuming vectors are all normalized
+    float denom = dot(n, l); 
+    if (denom > 1e-6) {  
+        t = dot(p0, n) / denom; 
+        return (t >= 0); 
+    }
+    return false; 
+}
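+// Derivation for both overloads above: a ray point is p = l0 + t*l (with
+// l0 = 0 in the second form); substituting into the plane equation
+// dot(p - p0, n) = 0 gives t = dot(p0 - l0, n) / dot(n, l). The one-sided
+// test denom > 1e-6 also rejects near-parallel rays and hits from one side
+// of the plane.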
+
+__device__ inline bool intersectDisk(const float3 &n, const float3 &p0, float radius, const float3 &l0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l0, l, t)) { 
+        float3 p = l0 + l * t; 
+        float3 v = p - p0; 
+        float d2 = dot(v, v); 
+        return (sqrt(d2) <= radius); 
+        // or you can use the following optimisation (and precompute radius^2)
+        // return d2 <= radius2; // where radius2 = radius * radius
+     }
+     return false; 
+}
+
+/**
+ * Get the radius of a ray intersection with a disk.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l0 Origin of ray in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l0, l, t)) { 
+        const float3 p = l0 + l * t; 
+        const float3 v = p - p0; 
+        const float d2 = dot(v, v); 
+        return sqrtf(d2);
+     }
+     return PINF; 
+}
+
+/**
+ * Get the radius of a ray intersection with a disk, for a ray originating at
+ * the camera origin.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l, t)) { 
+        const float3 p = l * t; 
+        const float3 v = p - p0; 
+        const float d2 = dot(v, v); 
+        return sqrtf(d2);
+     }
+     return PINF; 
+}
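+/*
+ * Illustrative use (a sketch only; `splatRadius` is a hypothetical per-splat
+ * size, not a parameter defined in this codebase): for a pixel ray direction
+ * l in camera space, a splat at p0 with unit normal n could be weighted by
+ *
+ *     const float r = intersectDistance(n, p0, l);
+ *     const float w = ftl::cuda::spatialWeighting(r, splatRadius);
+ *
+ * so fragments fade smoothly towards the edge of each surface disk (see
+ * spatialWeighting() in mls_cuda.hpp).
+ */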
+
 /**
  * NOTE: Not strictly isosurface currently since it includes the internals
  * of objects up to at most truncation depth.
@@ -26,11 +100,22 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 // TODO: isosurface_point_cloud
 
 void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
+		const ftl::cuda::TextureObject<uchar4> &colour_in,
+		const ftl::cuda::TextureObject<float4> &normal_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
-		const ftl::render::SplatParams &params, cudaStream_t stream);
+		const ftl::cuda::TextureObject<uchar4> &colour_out, const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<int> &depth_out,
-		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams,
+		const ftl::cuda::TextureObject<uchar4> &colour_out,
+		const ftl::cuda::TextureObject<float4> &normal_out,
+        const ftl::cuda::TextureObject<float> &confidence_out,
+        const ftl::cuda::TextureObject<float4> &tmp_colour, int numcams,
+		const ftl::render::SplatParams &params, cudaStream_t stream);
+
+/**
+ * Directly render input depth maps to virtual view with clipping.
+ */
+void dibr_raw(const ftl::cuda::TextureObject<int> &depth_out, int numcams,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<float> &depth_out,
diff --git a/applications/reconstruct/src/voxel_hash.cpp b/applications/reconstruct/src/voxel_hash.cpp
index 14cb75078652001275404e87c09f77f0b9d19385..6f929c746d66cae5c382bf15b2d25e26f6b46702 100644
--- a/applications/reconstruct/src/voxel_hash.cpp
+++ b/applications/reconstruct/src/voxel_hash.cpp
@@ -1,4 +1,5 @@
 #include <ftl/voxel_hash.hpp>
+#include <loguru.hpp>
 
 using ftl::voxhash::HashData;
 using ftl::voxhash::HashParams;
diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/voxel_render.cu
similarity index 65%
rename from applications/reconstruct/src/splat_render.cu
rename to applications/reconstruct/src/voxel_render.cu
index 3addf108dcf89ea8f3069e3ba0daea59e67ae918..46eb6f57c031b0541add1dcd982a9c6d1e690140 100644
--- a/applications/reconstruct/src/splat_render.cu
+++ b/applications/reconstruct/src/voxel_render.cu
@@ -258,143 +258,4 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #endif
 }
 
-// ---- Pass 2: Expand the point splats ----------------------------------------
-
-#define SPLAT_RADIUS 7
-#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
-#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 100
-
-__device__ float distance2(float3 a, float3 b) {
-	const float x = a.x-b.x;
-	const float y = a.y-b.y;
-	const float z = a.z-b.z;
-	return x*x+y*y+z*z;
-}
-
-__global__ void splatting_kernel(
-		TextureObject<int> depth_in,
-		TextureObject<float> depth_out, SplatParams params) {
-	// Read an NxN region and
-	// - interpolate a depth value for this pixel
-	// - interpolate an rgb value for this pixel
-	// Must respect depth discontinuities.
-	// How much influence a given neighbour has depends on its depth value
-
-	__shared__ float3 positions[SPLAT_BUFFER_SIZE];
-
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	// const float camMinDepth = params.camera.m_sensorDepthWorldMin;
-	// const float camMaxDepth = params.camera.m_sensorDepthWorldMax;
-
-	for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) {
-		const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-		const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-		if (sx >= depth_in.width() || sy >= depth_in.height()) {
-			positions[j] = make_float3(1000.0f,1000.0f, 1000.0f);
-		} else {
-			positions[j] = params.camera.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f);
-		}
-	}
 
-	__syncthreads();
-
-	if (x >= depth_in.width() || y >= depth_in.height()) return;
-
-	const float voxelSquared = params.voxelSize*params.voxelSize;
-	float mindepth = 1000.0f;
-	int minidx = -1;
-	// float3 minpos;
-
-	//float3 validPos[MAX_VALID];
-	int validIndices[MAX_VALID];
-	int validix = 0;
-
-	for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) {
-		for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) {
-			//const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u;
-			const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS;
-
-			float3 posp = positions[idx];
-			const float d = posp.z;
-			//if (d < camMinDepth || d > camMaxDepth) continue;
-
-			float3 pos = params.camera.kinectDepthToSkeleton(x, y, d);
-			float dist = distance2(pos, posp);
-
-			if (dist < voxelSquared) {
-				// Valid so check for minimum
-				//validPos[validix] = pos;
-				//validIndices[validix++] = idx;
-				if (d < mindepth) {
-					mindepth = d;
-					minidx = idx;
-					// minpos = pos;
-				}	
-			}
-		}
-	}
-
-	if (minidx == -1) {
-		depth_out(x,y) = 0.0f;
-		//colour_out(x,y) = make_uchar4(76,76,82,255);
-		return;
-	}
-
-	//float3 colour = make_float3(0.0f, 0.0f, 0.0f);
-	float depth = 0.0f;
-	float contrib = 0.0f;
-	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth);  // TODO:(Nick) Mindepth assumption is poor choice.
-
-	//for (int j=0; j<validix; ++j) {
-		const int idx = minidx; //validIndices[j];
-		float3 posp = positions[idx];
-		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
-		float3 delta = (posp - pos) / 2*params.voxelSize;
-		float dist = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-
-		// It contributes to pixel
-		if (dist < 1.0f && fabs(posp.z - mindepth) < 2*params.voxelSize) {
-			const unsigned int sx = (idx % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-			const unsigned int sy = (idx / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-
-			// Fast and simple trilinear interpolation
-			float c = fabs((1.0f - delta.x) * (1.0f - delta.y) * (1.0f - delta.z));
-			//uchar4 col = colour_in.tex2D((int)sx, (int)sy);
-			//colour.x += col.x*c;
-			//colour.y += col.y*c;
-			//colour.z += col.z*c;
-			contrib += c;
-			depth += posp.z * c;
-		}
-	//}
-
-	// Normalise
-	//colour.x /= contrib;
-	//colour.y /= contrib;
-	//colour.z /= contrib;
-	depth /= contrib;
-
-	depth_out(x,y) = depth;
-	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
-}
-
-void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
-		const TextureObject<float> &depth_out, const SplatParams &params, cudaStream_t stream) 
-{
-
-	const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
-	cudaSafeCall( cudaGetLastError() );
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
diff --git a/components/common/cpp/include/ftl/configurable.hpp b/components/common/cpp/include/ftl/configurable.hpp
index 5bae67b4a9972d5f69d0947871d5be259df36be3..94a080eaed480fa2e57b113e9c8c3d7bc04388bd 100644
--- a/components/common/cpp/include/ftl/configurable.hpp
+++ b/components/common/cpp/include/ftl/configurable.hpp
@@ -68,7 +68,9 @@ class Configurable {
 	template <typename T>
 	T value(const std::string &name, T def) {
 		auto r = get<T>(name);
-		return (r) ? *r : def;
+		if (r) return *r;
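+		// Cache the default so the key exists in the JSON config afterwards;
+		// queried defaults thereby become visible and persistable.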
+		(*config_)[name] = def;
+		return def;
 	}
 
 	/**
diff --git a/components/net/cpp/src/universe.cpp b/components/net/cpp/src/universe.cpp
index 8c50709e2d6253f73849f5560c72eee9a6ff5b83..402c5bed22cc394fd08251f2d2625983b89ed48e 100644
--- a/components/net/cpp/src/universe.cpp
+++ b/components/net/cpp/src/universe.cpp
@@ -138,9 +138,7 @@ int Universe::_setDescriptors() {
 				n = s->_socket();
 			}
 
-			//if (s->isWaiting()) {
-				FD_SET(s->_socket(), &sfdread_);
-			//}
+			FD_SET(s->_socket(), &sfdread_);
 			FD_SET(s->_socket(), &sfderror_);
 		}
 	}
@@ -154,17 +152,7 @@ void Universe::_installBindings(Peer *p) {
 }
 
 void Universe::_installBindings() {
-	/*bind("__subscribe__", [this](const UUID &id, const string &uri) -> bool {
-		LOG(INFO) << "Subscription to " << uri << " by " << id.to_string();
-		unique_lock<shared_mutex> lk(net_mutex_);
-		subscribers_[ftl::URI(uri).to_string()].push_back(id);
-		return true;
-	});
-	
-	bind("__owner__", [this](const std::string &res) -> optional<UUID> {
-		if (owned_.count(res) > 0) return this_peer;
-		else return {};
-	});*/
+
 }
 
 // Note: should be called inside a net lock
@@ -173,6 +161,8 @@ void Universe::_cleanupPeers() {
 	if (ftl::pool.n_idle() == ftl::pool.size()) {
 		if (garbage_.size() > 0) LOG(INFO) << "Garbage collection";
 		while (garbage_.size() > 0) {
+			// FIXME: There is possibly still something with a peer pointer
+			// that is causing this to throw an exception sometimes?
 			delete garbage_.front();
 			garbage_.pop_front();
 		}
@@ -287,8 +277,8 @@ void Universe::_run() {
 			continue;
 		}
 
-		// CHECK Could this mutex be the problem!?
 		{
+			// TODO:(Nick) Shared lock unless connection is made
 			UNIQUE_LOCK(net_mutex_,lk);
 
 			//If connection request is waiting
@@ -304,7 +294,7 @@ void Universe::_run() {
 						if (csock != INVALID_SOCKET) {
 							auto p = new Peer(csock, this, &disp_);
 							peers_.push_back(p);
-							_installBindings(p);
+							//_installBindings(p);
 						}
 					}
 				}
diff --git a/components/rgbd-sources/include/ftl/rgbd/streamer.hpp b/components/rgbd-sources/include/ftl/rgbd/streamer.hpp
index 0143bc0b0a030e509629e5bb6185dd553d1c9683..d2beacb2105c38dfa7d3c8317f2e6512cb07c570 100644
--- a/components/rgbd-sources/include/ftl/rgbd/streamer.hpp
+++ b/components/rgbd-sources/include/ftl/rgbd/streamer.hpp
@@ -29,7 +29,9 @@ struct StreamClient {
 
 static const unsigned int kGrabbed = 0x1;
 static const unsigned int kRGB = 0x2;
-static const unsigned int kDepth = 0x4; 
+static const unsigned int kDepth = 0x4;
+
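+// Number of consecutive dropped frames after which the streamer should
+// reconsider its frame rate (intended for use in Streamer::_decideFrameRate).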
+static const unsigned int kFrameDropLimit = 5;
 
 struct StreamSource {
 	ftl::rgbd::Source *src;
@@ -119,12 +121,19 @@ class Streamer : public ftl::Configurable {
 	int64_t last_frame_;
 	int64_t frame_no_;
 
+	int64_t mspf_;
+	float actual_fps_;
+	//int64_t last_dropped_;
+	//int drop_count_;
+
 	void _schedule();
+	void _schedule(detail::StreamSource *);
 	void _swap(detail::StreamSource *);
 	void _addClient(const std::string &source, int N, int rate, const ftl::UUID &peer, const std::string &dest);
 	void _encodeAndTransmit(detail::StreamSource *src, int chunk);
 	void _encodeChannel1(const cv::Mat &in, std::vector<unsigned char> &out, unsigned int b);
 	bool _encodeChannel2(const cv::Mat &in, std::vector<unsigned char> &out, ftl::rgbd::channel_t c, unsigned int b);
+	void _decideFrameRate(int64_t framesdropped, int64_t msremainder);
 };
 
 }
diff --git a/components/rgbd-sources/src/calibrate.cpp b/components/rgbd-sources/src/calibrate.cpp
index acc580e2aefdd63a3531e168e02b69c06aa855ca..c31fea5c5c2b7885b450a0ebcd51f6e0540dd67f 100644
--- a/components/rgbd-sources/src/calibrate.cpp
+++ b/components/rgbd-sources/src/calibrate.cpp
@@ -159,7 +159,6 @@ void Calibrate::_updateIntrinsics() {
 	// TODO: pass parameters?
 
 	Mat R1, R2, P1, P2;
-	std::pair<Mat, Mat> map1, map2;
 	ftl::rgbd::Camera params();
 
 	if (this->value("use_intrinsics", true)) {
@@ -182,14 +181,14 @@ void Calibrate::_updateIntrinsics() {
 	C_l_ = P1;
 	C_r_ = P2;
 
-	initUndistortRectifyMap(M1_, D1_, R1, P1, img_size_, CV_32FC1, map1.first, map2.first);
-	initUndistortRectifyMap(M2_, D2_, R2, P2, img_size_, CV_32FC1, map1.second, map2.second);
+	initUndistortRectifyMap(M1_, D1_, R1, P1, img_size_, CV_32FC1, map1_.first, map2_.first);
+	initUndistortRectifyMap(M2_, D2_, R2, P2, img_size_, CV_32FC1, map1_.second, map2_.second);
 
 	// CHECK Is this thread safe!!!!
-	map1_gpu_.first.upload(map1.first);
-	map1_gpu_.second.upload(map1.second);
-	map2_gpu_.first.upload(map2.first);
-	map2_gpu_.second.upload(map2.second);
+	map1_gpu_.first.upload(map1_.first);
+	map1_gpu_.second.upload(map1_.second);
+	map2_gpu_.first.upload(map2_.first);
+	map2_gpu_.second.upload(map2_.second);
 }
 
 void Calibrate::rectifyStereo(GpuMat &l, GpuMat &r, Stream &stream) {
@@ -204,6 +203,21 @@ void Calibrate::rectifyStereo(GpuMat &l, GpuMat &r, Stream &stream) {
 	r = r_tmp;
 }
 
+void Calibrate::rectifyStereo(cv::Mat &l, cv::Mat &r) {
+	// cv::remap() cannot operate in place, so rectify into temporaries and
+	// copy the results back into the (possibly page-locked) input buffers.
+	cv::Mat l_tmp, r_tmp;
+	cv::remap(l, l_tmp, map1_.first, map2_.first, cv::INTER_LINEAR, 0, cv::Scalar());
+	cv::remap(r, r_tmp, map1_.second, map2_.second, cv::INTER_LINEAR, 0, cv::Scalar());
+	l_tmp.copyTo(l);
+	r_tmp.copyTo(r);
+
+	/*GpuMat l_tmp(l.size(), l.type());
+	GpuMat r_tmp(r.size(), r.type());
+	cv::cuda::remap(l, l_tmp, map1_gpu_.first, map2_gpu_.first, cv::INTER_LINEAR, 0, cv::Scalar(), stream);
+	cv::cuda::remap(r, r_tmp, map1_gpu_.second, map2_gpu_.second, cv::INTER_LINEAR, 0, cv::Scalar(), stream);
+	stream.waitForCompletion();
+	l = l_tmp;
+	r = r_tmp;*/
+}
+
 bool Calibrate::isCalibrated() {
 	return calibrated_;
 }
\ No newline at end of file
diff --git a/components/rgbd-sources/src/calibrate.hpp b/components/rgbd-sources/src/calibrate.hpp
index c5f4786831edb77435a4d205702e7909517829a9..7f6b262c1e7afcd2852f65a6d62333b5389f7615 100644
--- a/components/rgbd-sources/src/calibrate.hpp
+++ b/components/rgbd-sources/src/calibrate.hpp
@@ -40,6 +40,11 @@ class Calibrate : public ftl::Configurable {
 	 */
 	void rectifyStereo(cv::cuda::GpuMat &l, cv::cuda::GpuMat &r, cv::cuda::Stream &stream);
 
+	/**
+	 * Rectify and remove distortions from images l and r using cv::remap()
+	 */
+	void rectifyStereo(cv::Mat &l, cv::Mat &r);
+
 	bool isCalibrated();
 
 	void updateCalibration(const ftl::rgbd::Camera &p);
@@ -60,6 +65,8 @@ private:
 	private:
 	bool calibrated_;
 
+	std::pair<cv::Mat, cv::Mat> map1_;
+	std::pair<cv::Mat, cv::Mat> map2_;
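+	// (CPU counterparts of the GPU maps below: .first = left camera,
+	// .second = right camera; map1_/map2_ hold the two remap components.)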
 	std::pair<cv::cuda::GpuMat, cv::cuda::GpuMat> map1_gpu_;
 	std::pair<cv::cuda::GpuMat, cv::cuda::GpuMat> map2_gpu_;
 
diff --git a/components/rgbd-sources/src/local.cpp b/components/rgbd-sources/src/local.cpp
index 410cb9f96c0a1c1ca48cadd52300e25b260c3aad..cf5521352d4689798cfc12ada0787f27db252c89 100644
--- a/components/rgbd-sources/src/local.cpp
+++ b/components/rgbd-sources/src/local.cpp
@@ -9,11 +9,13 @@
 #include <thread>
 
 #include "local.hpp"
+#include "calibrate.hpp"
 #include <opencv2/core.hpp>
 #include <opencv2/opencv.hpp>
 #include <opencv2/xphoto.hpp>
 
 using ftl::rgbd::detail::LocalSource;
+using ftl::rgbd::detail::Calibrate;
 using cv::Mat;
 using cv::VideoCapture;
 using cv::Rect;
@@ -27,28 +29,15 @@ using std::this_thread::sleep_for;
 LocalSource::LocalSource(nlohmann::json &config)
 		: Configurable(config), timestamp_(0.0) {
 
-	REQUIRED({
-		{"flip","Switch left and right views","boolean"},
-		{"flip_vert","Rotate image 180 degrees","boolean"},
-		{"nostereo","Force single camera mode","boolean"},
-		{"width","Pixel width of camera source","number"},
-		{"height","Pixel height of camera source","number"},
-		{"max_fps","Maximum frames per second","number"},
-		{"scale","Change the input image or video scaling","number"}
-	});
-
-	flip_ = value("flip", false);
-	flip_v_ = value("flip_vert", false);
 	nostereo_ = value("nostereo", false);
-	downsize_ = value("scale", 1.0f);
 
 	// Use cameras
 	camera_a_ = new VideoCapture;
 	LOG(INFO) << "Cameras check... ";
-	camera_a_->open((flip_) ? 1 : 0);
+	camera_a_->open(0);
 
 	if (!nostereo_) {
-		camera_b_ = new VideoCapture((flip_) ? 0 : 1);
+		camera_b_ = new VideoCapture(1);
 	} else {
 		camera_b_ = nullptr;
 	}
@@ -82,26 +71,18 @@ LocalSource::LocalSource(nlohmann::json &config)
 		stereo_ = true;
 	}
 
-	tps_ = 1.0 / value("max_fps", 25.0);
+	// Allocate page-locked host memory for fast GPU transfer
+	left_hm_ = cv::cuda::HostMem(height_, width_, CV_8UC3);
+	right_hm_ = cv::cuda::HostMem(height_, width_, CV_8UC3);
 }
 
 LocalSource::LocalSource(nlohmann::json &config, const string &vid)
 	:	Configurable(config), timestamp_(0.0) {
 
-	REQUIRED({
-		{"flip","Switch left and right views","boolean"},
-		{"flip_vert","Rotate image 180 degrees","boolean"},
-		{"nostereo","Force single camera mode","boolean"},
-		{"width","Pixel width of camera source","number"},
-		{"height","Pixel height of camera source","number"},
-		{"max_fps","Maximum frames per second","number"},
-		{"scale","Change the input image or video scaling","number"}
-	});
-
-	flip_ = value("flip", false);
-	flip_v_ = value("flip_vert", false);
+	//flip_ = value("flip", false);
+	//flip_v_ = value("flip_vert", false);
 	nostereo_ = value("nostereo", false);
-	downsize_ = value("scale", 1.0f);
+	//downsize_ = value("scale", 1.0f);
 
 	if (vid == "") {
 		LOG(FATAL) << "No video file specified";
@@ -138,10 +119,14 @@ LocalSource::LocalSource(nlohmann::json &config, const string &vid)
 		stereo_ = false;
 	}
 
-	tps_ = 1.0 / value("max_fps", 25.0);
+	// Allocate page-locked host memory for fast GPU transfer
+	left_hm_ = cv::cuda::HostMem(height_, width_, CV_8UC3);
+	right_hm_ = cv::cuda::HostMem(height_, width_, CV_8UC3);
+
+	//tps_ = 1.0 / value("max_fps", 25.0);
 }
 
-bool LocalSource::left(cv::Mat &l) {
+/*bool LocalSource::left(cv::Mat &l) {
 	if (!camera_a_) return false;
 
 	if (!camera_a_->grab()) {
@@ -174,9 +159,9 @@ bool LocalSource::left(cv::Mat &l) {
 	}
 
 	return true;
-}
+}*/
 
-bool LocalSource::right(cv::Mat &r) {
+/*bool LocalSource::right(cv::Mat &r) {
 	if (!camera_a_->grab()) {
 		LOG(ERROR) << "Unable to grab from camera A";
 		return false;
@@ -212,10 +197,15 @@ bool LocalSource::right(cv::Mat &r) {
 	}
 
 	return true;
-}
+}*/
 
-bool LocalSource::get(cv::cuda::GpuMat &l_out, cv::cuda::GpuMat &r_out, cv::cuda::Stream &stream) {
+bool LocalSource::get(cv::cuda::GpuMat &l_out, cv::cuda::GpuMat &r_out, Calibrate *c, cv::cuda::Stream &stream) {
 	Mat l, r;
+
+	// Use page locked memory
+	l = left_hm_.createMatHeader();
+	r = right_hm_.createMatHeader();
+
 	if (!camera_a_) return false;
 
 	if (!camera_a_->grab()) {
@@ -239,7 +229,7 @@ bool LocalSource::get(cv::cuda::GpuMat &l_out, cv::cuda::GpuMat &r_out, cv::cuda
 	timestamp_ = timestamp;
 
 	if (camera_b_ || !stereo_) {
-		if (!camera_a_->retrieve(left_)) {
+		if (!camera_a_->retrieve(l)) {
 			LOG(ERROR) << "Unable to read frame from camera A";
 			return false;
 		}
@@ -255,23 +245,23 @@ bool LocalSource::get(cv::cuda::GpuMat &l_out, cv::cuda::GpuMat &r_out, cv::cuda
 		}
 
 		int resx = frame.cols / 2;
-		if (flip_) {
-			r = Mat(frame, Rect(0, 0, resx, frame.rows));
-			left_ = Mat(frame, Rect(resx, 0, frame.cols-resx, frame.rows));
-		} else {
-			left_ = Mat(frame, Rect(0, 0, resx, frame.rows));
+		//if (flip_) {
+		//	r = Mat(frame, Rect(0, 0, resx, frame.rows));
+		//	l = Mat(frame, Rect(resx, 0, frame.cols-resx, frame.rows));
+		//} else {
+			l = Mat(frame, Rect(0, 0, resx, frame.rows));
 			r = Mat(frame, Rect(resx, 0, frame.cols-resx, frame.rows));
-		}
+		//}
 	}
 
-	if (downsize_ != 1.0f) {
+	/*if (downsize_ != 1.0f) {
 		// cv::cuda::resize()
 
 		cv::resize(left_, left_, cv::Size((int)(left_.cols * downsize_), (int)(left_.rows * downsize_)),
 				0, 0, cv::INTER_LINEAR);
 		cv::resize(r, r, cv::Size((int)(r.cols * downsize_), (int)(r.rows * downsize_)),
 				0, 0, cv::INTER_LINEAR);
-	}
+	}*/
 
 	// Note: this seems to be too slow on CPU...
 	/*cv::Ptr<cv::xphoto::WhiteBalancer> wb;
@@ -279,15 +269,17 @@ bool LocalSource::get(cv::cuda::GpuMat &l_out, cv::cuda::GpuMat &r_out, cv::cuda
 	wb->balanceWhite(l, l);
 	wb->balanceWhite(r, r);*/
 
-	if (flip_v_) {
+	/*if (flip_v_) {
 		Mat tl, tr;
 		cv::flip(left_, tl, 0);
 		cv::flip(r, tr, 0);
 		left_ = tl;
 		r = tr;
-	}
+	}*/
+
+	c->rectifyStereo(l, r);
 
-	l_out.upload(left_, stream);
+	l_out.upload(l, stream);
 	r_out.upload(r, stream);
 
 	return true;
diff --git a/components/rgbd-sources/src/local.hpp b/components/rgbd-sources/src/local.hpp
index e3fcb91bd585d8090cfb50a8ee83bf49dd78f98d..4fd71c5eede52acc480f8915c3370046aaccfd76 100644
--- a/components/rgbd-sources/src/local.hpp
+++ b/components/rgbd-sources/src/local.hpp
@@ -15,19 +15,19 @@ namespace ftl {
 namespace rgbd {
 namespace detail {
 
+class Calibrate;
+
 class LocalSource : public Configurable {
 	public:
 	explicit LocalSource(nlohmann::json &config);
 	LocalSource(nlohmann::json &config, const std::string &vid);
 	
-	bool left(cv::Mat &m);
-	bool right(cv::Mat &m);
-	bool get(cv::cuda::GpuMat &l, cv::cuda::GpuMat &r, cv::cuda::Stream &stream);
+	//bool left(cv::Mat &m);
+	//bool right(cv::Mat &m);
+	bool get(cv::cuda::GpuMat &l, cv::cuda::GpuMat &r, Calibrate *c, cv::cuda::Stream &stream);
 
 	unsigned int width() const { return width_; }
 	unsigned int height() const { return height_; }
-
-	cv::Mat &cachedLeft() { return left_; }
 	
 	//void setFramerate(float fps);
 	//float getFramerate() const;
@@ -38,18 +38,20 @@ class LocalSource : public Configurable {
 	
 	private:
 	double timestamp_;
-	double tps_;
+	//double tps_;
 	bool stereo_;
 	//float fps_;
-	bool flip_;
-	bool flip_v_;
+	//bool flip_;
+	//bool flip_v_;
 	bool nostereo_;
-	float downsize_;
+	//float downsize_;
 	cv::VideoCapture *camera_a_;
 	cv::VideoCapture *camera_b_;
 	unsigned int width_;
 	unsigned int height_;
-	cv::Mat left_;
+
+	cv::cuda::HostMem left_hm_;
+	cv::cuda::HostMem right_hm_;
 };
 
 }
diff --git a/components/rgbd-sources/src/net.cpp b/components/rgbd-sources/src/net.cpp
index 135395c57a1bc1f9983308719e88f7f49d5df8cf..12a76cc40d38f876f11cbc4919cf7bcb13a4bee5 100644
--- a/components/rgbd-sources/src/net.cpp
+++ b/components/rgbd-sources/src/net.cpp
@@ -161,7 +161,7 @@ void NetSource::_recvChunk(int64_t frame, int chunk, bool delta, const vector<un
 					depth_ = d_depth_;
 					d_depth_ = tmp;
 
-					timestamp_ = current_frame_*40;  // FIXME: Don't hardcode 40ms
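+					// Frame identifiers now arrive pre-scaled by the sender's
+					// ms-per-frame (see Streamer::_encodeAndTransmit), so the
+					// value is used directly as a millisecond timestamp.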
+					timestamp_ = current_frame_;
 					current_frame_ = frame;
 				}
 
@@ -277,13 +277,6 @@ void NetSource::_updateURI() {
 
 		N_ = 0;
 
-		// Initiate stream with request for first 10 frames
-		//try {
-		//	host_->getNet()->send(peer_, "get_stream", *uri, N_, 0, host_->getNet()->id(), *uri);
-		//} catch(...) {
-		//	LOG(ERROR) << "Could not connect to stream " << *uri;
-		//}
-
 		// Update chunk details
 		chunks_dim_ = ftl::rgbd::kChunkDim;
 		chunk_width_ = params_.width / chunks_dim_;
diff --git a/components/rgbd-sources/src/stereovideo.cpp b/components/rgbd-sources/src/stereovideo.cpp
index 0d20780c275a43a2338fd9d81a6a98f13619c3a2..d83fdb194a426659b33166aef64d30d367653a0e 100644
--- a/components/rgbd-sources/src/stereovideo.cpp
+++ b/components/rgbd-sources/src/stereovideo.cpp
@@ -154,7 +154,7 @@ static void disparityToDepth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat
 }
 
 bool StereoVideoSource::capture() {
-	lsrc_->get(cap_left_, cap_right_, stream2_);
+	lsrc_->get(cap_left_, cap_right_, calib_, stream2_);
 	stream2_.waitForCompletion();
 	return true;
 }
@@ -178,7 +178,7 @@ bool StereoVideoSource::compute(int n, int b) {
 		//lsrc_->get(left_, right_, stream_);
 		if (depth_tmp_.empty()) depth_tmp_ = cv::cuda::GpuMat(left_.size(), CV_32FC1);
 		if (disp_tmp_.empty()) disp_tmp_ = cv::cuda::GpuMat(left_.size(), CV_32FC1);
-		calib_->rectifyStereo(left_, right_, stream_);
+		//calib_->rectifyStereo(left_, right_, stream_);
 		disp_->compute(left_, right_, disp_tmp_, stream_);
 		ftl::cuda::disparity_to_depth(disp_tmp_, depth_tmp_, params_, stream_);
 		left_.download(rgb_, stream_);
@@ -188,13 +188,13 @@ bool StereoVideoSource::compute(int n, int b) {
 		stream_.waitForCompletion();  // TODO:(Nick) Move to getFrames
 	} else if (chan == ftl::rgbd::kChanRight) {
 		//lsrc_->get(left_, right_, stream_);
-		calib_->rectifyStereo(left_, right_, stream_);
+		//calib_->rectifyStereo(left_, right_, stream_);
 		left_.download(rgb_, stream_);
 		right_.download(depth_, stream_);
 		stream_.waitForCompletion();  // TODO:(Nick) Move to getFrames
 	} else {
 		//lsrc_->get(left_, right_, stream_);
-		calib_->rectifyStereo(left_, right_, stream_);
+		//calib_->rectifyStereo(left_, right_, stream_);
 		//rgb_ = lsrc_->cachedLeft();
 		left_.download(rgb_, stream_);
 		stream_.waitForCompletion();  // TODO:(Nick) Move to getFrames
diff --git a/components/rgbd-sources/src/streamer.cpp b/components/rgbd-sources/src/streamer.cpp
index 3fd69afe3fd6568f16ed9a9c9b489fde36bdc64a..ab03f2a6527163d3413e7a1d2ada8aed9803f07f 100644
--- a/components/rgbd-sources/src/streamer.cpp
+++ b/components/rgbd-sources/src/streamer.cpp
@@ -4,6 +4,7 @@
 #include <thread>
 #include <chrono>
 #include <tuple>
+#include <algorithm>
 
 #include "bitrate_settings.hpp"
 
@@ -31,6 +32,9 @@ Streamer::Streamer(nlohmann::json &config, Universe *net)
 	net_ = net;
 	time_peer_ = ftl::UUID(0);
 	clock_adjust_ = 0;
+	mspf_ = 1000 / value("fps", 25);
+	//last_dropped_ = 0;
+	//drop_count_ = 0;
 
 	compress_level_ = value("compression", 1);
 	
@@ -199,42 +203,43 @@ void Streamer::remove(const std::string &) {
 
 }
 
+void Streamer::_decideFrameRate(int64_t framesdropped, int64_t msremainder) {
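+	// Effective rate including drops: e.g. with mspf_ = 40 (nominal 25 FPS)
+	// and one dropped frame finishing 10 ms into the next slot, this reports
+	// 1000 / (2*40 + 10) ≈ 11.1 FPS.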
+	actual_fps_ = 1000.0f / (float)((framesdropped+1)*mspf_ + msremainder);
+	LOG(INFO) << "Actual FPS = " << actual_fps_;
+
+	/*if (framesdropped > 0) {
+		// If N consecutive frames are dropped, work out new rate
+		if (last_dropped_/mspf_ >= last_frame_/mspf_ - 2*framesdropped) drop_count_++;
+		else drop_count_ = 0;
+
+		last_dropped_ = last_frame_+mspf_;
+
+		if (drop_count_ >= ftl::rgbd::detail::kFrameDropLimit) {
+			drop_count_ = 0;
+
+			const int64_t actualmspf = std::min((int64_t)1000, framesdropped*mspf_+(mspf_ - msremainder));
+
+			LOG(WARNING) << "Suggest FPS @ " << (1000 / actualmspf);
+			//mspf_ = actualmspf;
+
+			// Also notify all clients of change
+		}
+	} else {
+		// Perhaps we can boost framerate?
+		const int64_t actualmspf = std::min((int64_t)1000, framesdropped*mspf_+(mspf_ - msremainder));
+		LOG(INFO) << "Boost framerate: " << (1000 / actualmspf);
+		//mspf_ = actualmspf;
+	}*/
+}
+
 void Streamer::stop() {
 	active_ = false;
 	wait();
 }
 
 void Streamer::poll() {
-	//double wait = 1.0f / 25.0f;  // TODO:(Nick) Should be in config
-	//auto start = std::chrono::high_resolution_clock::now();
-	//int64_t now = std::chrono::time_point_cast<std::chrono::milliseconds>(start).time_since_epoch().count()+clock_adjust_;
-
-	//int64_t msdelay = 40 - (now % 40);
-	//while (msdelay >= 20) {
-	//	sleep_for(milliseconds(10));
-	//	now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
-	//	msdelay = 40 - (now % 40);
-	//}
-	//LOG(INFO) << "Required Delay: " << (now / 40) << " = " << msdelay;
-
 	// Create frame jobs at correct FPS interval
 	_schedule();
-	//std::function<void(int)> j = ftl::pool.pop();
-	//if (j) j(-1);
-
-	//std::chrono::duration<double> elapsed =
-	//	std::chrono::high_resolution_clock::now() - start;
-
-	//if (elapsed.count() >= wait) {
-		//LOG(WARNING) << "Frame rate below optimal @ " << (1.0f / elapsed.count());
-	//} else {
-		//LOG(INFO) << "Frame rate @ " << (1.0f / elapsed.count());
-		// Otherwise, wait until next frame should start.
-		// FIXME:(Nick) Is this accurate enough? Almost certainly not
-		// TODO:(Nick) Synchronise by time corrections and use of fixed time points
-		// but this only works if framerate can be achieved.
-		//sleep_for(milliseconds((long long)((wait - elapsed.count()) * 1000.0f)));
-	//}
 }
 
 void Streamer::run(bool block) {
@@ -304,131 +309,138 @@ void Streamer::wait() {
 	frame_no_ = last_frame_;
 }
 
-void Streamer::_schedule() {
-	wait();
-	//std::mutex job_mtx;
-	//std::condition_variable job_cv;
-	//int jobs = 0;
+void Streamer::_schedule(StreamSource *src) {
+	// There will be two jobs for this source...
+	//UNIQUE_LOCK(job_mtx_,lk);
+	jobs_ += 2 + kChunkCount;
+	//lk.unlock();
 
-	//auto now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
-	//LOG(INFO) << "Frame time = " << (now-(last_frame_*40)) << "ms";
+	//StreamSource *src = sources_[uri];
+	if (src == nullptr || src->jobs != 0) return;
+	src->jobs = 2 + kChunkCount;
 
-	// Prevent new clients during processing.
-	SHARED_LOCK(mutex_,slk);
+	// Grab / capture job
+	ftl::pool.push([this,src](int id) {
+		//auto start = std::chrono::high_resolution_clock::now();
 
-	for (auto s : sources_) {
-		string uri = s.first;
-
-		// No point in doing work if no clients
-		if (s.second->clientCount == 0) {
-			continue;
-		}
-
-		// There will be two jobs for this source...
-		//UNIQUE_LOCK(job_mtx_,lk);
-		jobs_ += 2 + kChunkCount;
-		//lk.unlock();
+		auto start = std::chrono::high_resolution_clock::now();
+		int64_t now = std::chrono::time_point_cast<std::chrono::milliseconds>(start).time_since_epoch().count()+clock_adjust_;
+		int64_t target = now / mspf_;
+		int64_t msdelay = mspf_ - (now % mspf_);
 
-		StreamSource *src = sources_[uri];
-		if (src == nullptr || src->jobs != 0) continue;
-		src->jobs = 2 + kChunkCount;
+		if (target != last_frame_ && msdelay != mspf_) LOG(WARNING) << "Frame (" << (target-last_frame_) << ") dropped by " << (now%mspf_) << "ms";
 
-		// Grab / capture job
-		ftl::pool.push([this,src](int id) {
-			//auto start = std::chrono::high_resolution_clock::now();
-
-			auto start = std::chrono::high_resolution_clock::now();
-			int64_t now = std::chrono::time_point_cast<std::chrono::milliseconds>(start).time_since_epoch().count()+clock_adjust_;
-			int64_t target = now / 40;
-
-			// TODO:(Nick) A now%40 == 0 should be accepted
-			if (target != last_frame_) LOG(WARNING) << "Frame " << "(" << (target-last_frame_) << ") dropped by " << (now%40) << "ms";
+		// Use sleep_for for larger delays
+		
+		//LOG(INFO) << "Required Delay: " << (now / 40) << " = " << msdelay;
+		while (msdelay >= 20 && msdelay < mspf_) {
+			sleep_for(milliseconds(10));
+			now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
+			msdelay = mspf_ - (now % mspf_);
+		}
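+		// Hybrid wait: sleep_for covers all but the last ~20 ms; the spin
+		// loop below burns the remainder for sub-millisecond frame alignment.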
 
-			// Use sleep_for for larger delays
-			int64_t msdelay = 40 - (now % 40);
-			//LOG(INFO) << "Required Delay: " << (now / 40) << " = " << msdelay;
-			while (msdelay >= 20) {
-				sleep_for(milliseconds(10));
-				now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
-				msdelay = 40 - (now % 40);
-			}
+		// Spin loop until exact grab time
+		//LOG(INFO) << "Spin Delay: " << (now / 40) << " = " << (40 - (now%40));
 
-			// Spin loop until exact grab time
-			//LOG(INFO) << "Spin Delay: " << (now / 40) << " = " << (40 - (now%40));
-			target = now / 40;
-			while ((now/40) == target) {
+		if (msdelay != mspf_) {
+			target = now / mspf_;
+			while ((now/mspf_) == target) {
 				_mm_pause();  // SSE2 nano pause intrinsic
 				now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
 			};
-			last_frame_ = now/40;
+		}
+		last_frame_ = now/mspf_;
 
-			try {
-				src->src->capture();
-			} catch (std::exception &ex) {
-				LOG(ERROR) << "Exception when grabbing frame";
-				LOG(ERROR) << ex.what();
-			}
-			catch (...) {
-				LOG(ERROR) << "Unknown exception when grabbing frame";
-			}
+		try {
+			src->src->capture();
+		} catch (std::exception &ex) {
+			LOG(ERROR) << "Exception when grabbing frame";
+			LOG(ERROR) << ex.what();
+		}
+		catch (...) {
+			LOG(ERROR) << "Unknown exception when grabbing frame";
+		}
 
-			//now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
-			//if (now%40 > 0) LOG(INFO) << "Grab in: " << (now%40) << "ms";
+		//now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
+		//if (now%40 > 0) LOG(INFO) << "Grab in: " << (now%40) << "ms";
 
-			//std::chrono::duration<double> elapsed =
-			//	std::chrono::high_resolution_clock::now() - start;
-			//LOG(INFO) << "Grab in " << elapsed.count() << "s";
+		//std::chrono::duration<double> elapsed =
+		//	std::chrono::high_resolution_clock::now() - start;
+		//LOG(INFO) << "Grab in " << elapsed.count() << "s";
 
-			src->jobs--;
-			_swap(src);
+		src->jobs--;
+		_swap(src);
 
-			// Mark job as finished
-			std::unique_lock<std::mutex> lk(job_mtx_);
-			--jobs_;
-			job_cv_.notify_one();
-		});
+		// Mark job as finished
+		std::unique_lock<std::mutex> lk(job_mtx_);
+		--jobs_;
+		job_cv_.notify_one();
+	});
+
+	// Compute job
+	ftl::pool.push([this,src](int id) {
+		try {
+			src->src->compute();
+		} catch (std::exception &ex) {
+			LOG(ERROR) << "Exception when computing frame";
+			LOG(ERROR) << ex.what();
+		}
+		catch (...) {
+			LOG(ERROR) << "Unknown exception when computing frame";
+		}
+
+		src->jobs--;
+		_swap(src);
+
+		// Mark job as finished
+		std::unique_lock<std::mutex> lk(job_mtx_);
+		--jobs_;
+		job_cv_.notify_one();
+	});
 
-		// Compute job
-		ftl::pool.push([this,src](int id) {
+	// Create jobs for each chunk
+	for (int i=0; i<kChunkCount; ++i) {
+		// Add chunk job to thread pool
+		ftl::pool.push([this,src,i](int id) {
+			int chunk = i;
 			try {
-				src->src->compute();
-			} catch (std::exception &ex) {
-				LOG(ERROR) << "Exception when computing frame";
-				LOG(ERROR) << ex.what();
+			if (!src->rgb.empty() && (src->src->getChannel() == ftl::rgbd::kChanNone || !src->depth.empty())) {
+				_encodeAndTransmit(src, chunk);
 			}
-			catch (...) {
-				LOG(ERROR) << "Unknown exception when computing frame";
+			} catch(...) {
+				LOG(ERROR) << "Encode Exception: " << chunk;
 			}
 
 			src->jobs--;
 			_swap(src);
-
-			// Mark job as finished
 			std::unique_lock<std::mutex> lk(job_mtx_);
 			--jobs_;
 			job_cv_.notify_one();
 		});
+	}
+}
 
-		// Create jobs for each chunk
-		for (int i=0; i<kChunkCount; ++i) {
-			// Add chunk job to thread pool
-			ftl::pool.push([this,src,i](int id) {
-				int chunk = i;
-				try {
-				if (!src->rgb.empty() && (src->src->getChannel() == ftl::rgbd::kChanNone || !src->depth.empty())) {
-					_encodeAndTransmit(src, chunk);
-				}
-				} catch(...) {
-					LOG(ERROR) << "Encode Exception: " << chunk;
-				}
+void Streamer::_schedule() {
+	wait();
+	//std::mutex job_mtx;
+	//std::condition_variable job_cv;
+	//int jobs = 0;
 
-				src->jobs--;
-				_swap(src);
-				std::unique_lock<std::mutex> lk(job_mtx_);
-				--jobs_;
-				job_cv_.notify_one();
-			});
+	//auto now = std::chrono::time_point_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now()).time_since_epoch().count()+clock_adjust_;
+	//LOG(INFO) << "Frame time = " << (now-(last_frame_*40)) << "ms";
+
+	// Prevent new clients during processing.
+	SHARED_LOCK(mutex_,slk);
+
+	for (auto s : sources_) {
+		string uri = s.first;
+
+		// No point in doing work if no clients
+		if (s.second->clientCount == 0) {
+			continue;
 		}
+
+		_schedule(s.second);
 	}
 }
 
@@ -490,8 +502,8 @@ void Streamer::_encodeAndTransmit(StreamSource *src, int chunk) {
 		auto c = src->clients[b].begin();
 		while (c != src->clients[b].end()) {
 			try {
-				// TODO:(Nick) Send pose and timestamp
-				if (!net_->send((*c).peerid, (*c).uri, frame_no_, chunk, delta, rgb_buf, d_buf)) {
+				// TODO:(Nick) Send pose
+				if (!net_->send((*c).peerid, (*c).uri, frame_no_*mspf_, chunk, delta, rgb_buf, d_buf)) {
 					// Send failed so mark as client stream completed
 					(*c).txcount = (*c).txmax;
 				} else {