diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index 4ee248dc306c47784b9284f3975d9caee8d267cc..892ba32e84e35c99c6d89bb9a1253e30b0ecf675 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -242,6 +242,31 @@ void ftl::gui::Camera::setChannel(ftl::rgbd::channel_t c) {
 	}
 }
 
+static void visualizeDepthMap(	const cv::Mat &depth, cv::Mat &out,
+								const float max_depth)
+{
+	DCHECK(max_depth > 0.0);
+
+	depth.convertTo(out, CV_8U, 255.0f / max_depth);
+	out = 255 - out;
+	cv::Mat mask = (depth >= 39.0f); // TODO (mask for invalid pixels)
+	
+	applyColorMap(out, out, cv::COLORMAP_JET);
+	out.setTo(cv::Scalar(255, 255, 255), mask);
+}
+
+static void drawEdges(	const cv::Mat &in, cv::Mat &out,
+						const int ksize = 3, double weight = -1.0, const int threshold = 32,
+						const int threshold_type = cv::THRESH_TOZERO)
+{
+	cv::Mat edges;
+	cv::Laplacian(in, edges, 8, ksize);
+	cv::threshold(edges, edges, threshold, 255, threshold_type);
+
+	cv::Mat edges_color(in.size(), CV_8UC3);
+	cv::addWeighted(edges, weight, out, 1.0, 0.0, out, CV_8UC3);
+}
+
 const GLTexture &ftl::gui::Camera::captureFrame() {
 	float now = (float)glfwGetTime();
 	delta_ = now - ftime_;
@@ -280,10 +305,8 @@ const GLTexture &ftl::gui::Camera::captureFrame() {
 		switch(channel_) {
 			case ftl::rgbd::kChanDepth:
 				if (depth.rows == 0) { break; }
-				//imageSize = Vector2f(depth.cols,depth.rows);
-				depth.convertTo(tmp, CV_8U, 255.0f / 5.0f);
-				tmp = 255 - tmp;
-				applyColorMap(tmp, tmp, cv::COLORMAP_JET);
+				visualizeDepthMap(depth, tmp, 7.0);
+				drawEdges(rgb, tmp);
 				texture_.update(tmp);
 				break;
 			
diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt
index ae06aab6d142ba8a2ad3a1e0970905e3a70468e9..1e55c671c87487b995b0dd772495ff8a612a8c78 100644
--- a/applications/reconstruct/CMakeLists.txt
+++ b/applications/reconstruct/CMakeLists.txt
@@ -10,7 +10,7 @@ set(REPSRC
 	src/garbage.cu
 	src/integrators.cu
 	#src/ray_cast_sdf.cu
-	src/splat_render.cu
+	src/voxel_render.cu
 	src/camera_util.cu
 	src/voxel_hash.cu
 	src/voxel_hash.cpp
@@ -19,6 +19,7 @@ set(REPSRC
 	#src/virtual_source.cpp
 	src/splat_render.cpp
 	src/dibr.cu
+	src/mls.cu
 	src/depth_camera.cu
 	src/depth_camera.cpp
 )
diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp
index b7a3b44426bf6fcf490af67f38461ca849ade428..ae85cfbd785fb72f79225ae28278bf4862b680db 100644
--- a/applications/reconstruct/include/ftl/cuda_operators.hpp
+++ b/applications/reconstruct/include/ftl/cuda_operators.hpp
@@ -406,6 +406,12 @@ inline __host__ __device__ float length(float3 v)
     return sqrtf(dot(v, v));
 }
 
+// length squared
+inline __host__ __device__ float length2(const float3 &v)
+{
+    return dot(v, v);
+}
+
 // normalize
 inline __host__ __device__ float3 normalize(float3 v)
 {
diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu
index 9a36ea452225c1a174d0ba6ced0baf98471688eb..7a322eaf733230772e72c50722e849335bf7e891 100644
--- a/applications/reconstruct/src/depth_camera.cu
+++ b/applications/reconstruct/src/depth_camera.cu
@@ -47,6 +47,21 @@ void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStre
 	clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_to_zero_kernel(ftl::cuda::TextureObject<float> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = 0.0f; //PINF;
+	}
+}
+
+void ftl::cuda::clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_to_zero_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 __global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -79,6 +94,21 @@ void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cuda
 	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_colour_kernel(ftl::cuda::TextureObject<float4> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = make_float4(0.0f);
+	}
+}
+
+void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 // ===== Type convert =====
 
 template <typename A, typename B>
@@ -103,331 +133,6 @@ void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda:
 	convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale);
 }
 
-/// ===== MLS Smooth
-
-// TODO:(Nick) Put this in a common location (used in integrators.cu)
-extern __device__ float spatialWeighting(float r);
-extern __device__ float spatialWeighting(float r, float h);
-
-/*
- * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
- * Real-time foreground-background segmentation using codebook model.
- * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
- */
- __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
-	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
-	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
-	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
-	const float p_2 = xv_2 / v_2;
-	return sqrt(x_2 - p_2);
-}
-
-__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
-	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
-	return length(delta);
-}
-
-/*
- * Colour weighting as suggested in:
- * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
- * c = colour distance
- */
- __device__ float colourWeighting(float c) {
-	const float h = c_hashParams.m_colourSmoothing;
-	if (c >= h) return 0.0f;
-	float ch = c / h;
-	ch = 1.0f - ch*ch;
-	return ch*ch*ch*ch;
-}
-
-#define WINDOW_RADIUS 5
-
-__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > 0.0f) {
-						wpos += weight* (camera.pose * camPos);
-						weights += weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on creen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-
-				// TODO:(Nick) dot product of normals < 0 means the point
-				// should be ignored with a weight of 0 since it is facing the wrong direction
-				// May be good to simply weight using the dot product to give
-				// a stronger weight to those whose normals are closer
-
-				float weight = spatialWeighting(length(pf - camPos), h);
-
-				if (weight > 0.0f) {
-					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
-					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
-
-						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-
-						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
-							pos += weight*camPos; // (camera.pose * camPos);
-							weights += weight;
-						}
-					}
-				}			
-			//}
-		}
-	}
-
-	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
-
-	return weights;
-}
-
-__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > weights) {
-						pos = weight* (camera.pose * camPos);
-						weights = weight;
-					}
-				}			
-			//}
-		}
-	}
-
-	wpos += pos;
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-
-	//float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y);
-	const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y));
-	if (worldPos.z == MINF) return 0.0f;
-
-	float weight = spatialWeighting(length(mPos - worldPos));
-
-	if (weight > 0.0f) {
-		wpos += weight* (worldPos);
-		weights += weight;
-	}
-
-	return weights;
-}
-
-__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = output.width();
-	const int height = output.height();
-
-	const DepthCameraCUDA &mainCamera = c_cameras[cam];
-
-	if (x < width && y < height) {
-
-		const float depth = tex2D<float>(mainCamera.depth, x, y);
-		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
-		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
-		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
-
-		float3 wpos = make_float3(0.0f);
-		float3 wnorm = make_float3(0.0f);
-		float weights = 0.0f;
-
-		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
-			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
-
-			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
-					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
-					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
-
-				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
-					for (uint cam2=0; cam2<numcams; ++cam2) {
-						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
-						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
-
-						// Previous approach
-						//if (cam2 == cam) continue;
-						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
-					}
-					wpos /= weights;
-				} else {
-					weights = 1000.0f;
-					wpos = mPos;
-				} 
-
-				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-
-				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
-
-				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
-				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
-				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-				//}
-			}
-		}
-	}
-}
-
-void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
-	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-#define RESAMPLE_RADIUS 7
-
-__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = depthin.width();
-	const int height = depthin.height();
-
-	if (x < width && y < height) {
-
-		//const int depth = depthin.tex2D((int)x, (int)y);
-		//if (depth != 0x7FFFFFFF) {
-		//	depthout(x,y) = (float)depth / 1000.0f;
-		//	return;
-		//}
-
-		struct map_t {
-			int d;
-			int quad;
-		};
-
-		map_t mappings[5];
-		int mapidx = 0;
-
-		for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) {
-			for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) {
-
-				const int depth = depthin.tex2D((int)x+u, (int)y+v);
-				const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v);
-				
-				if (depth != 0x7FFFFFFF) {
-					int i=0;
-					for (i=0; i<mapidx; ++i) {
-						if (abs(mappings[i].d - depth) < 100) {
-							if (u < 0 && v < 0) mappings[i].quad |= 0x1;
-							if (u > 0 && v < 0) mappings[i].quad |= 0x2;
-							if (u > 0 && v > 0) mappings[i].quad |= 0x4;
-							if (u < 0 && v > 0) mappings[i].quad |= 0x8;
-							break;
-						}
-					}
-					if (i == mapidx && i < 5) {
-						mappings[mapidx].d = depth;
-						mappings[mapidx].quad = 0;
-						if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1;
-						if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2;
-						if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4;
-						if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8;
-						++mapidx;
-					} else {
-						//printf("EXCEEDED\n");
-					}
-				}
-			}
-		}
-
-		int bestdepth = 1000000;
-		//int count = 0;
-		for (int i=0; i<mapidx; ++i) {
-			if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15) count ++;
-		}
-
-		//depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f;
-
-		if (bestdepth < 1000000) {
-			depthout(x,y) = (float)bestdepth / 1000.0f;
-		}
-	}
-}
-
-void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams &params, cudaStream_t stream) {
-	const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-
 /// ===== Median Filter ======
 
 #define WINDOW_SIZE 3
diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp
index 552c2e59d44206c089fee68124ca814d3e752ad6..26bfcad73f5ae72f20cfe2dd29d0e91c62fca62b 100644
--- a/applications/reconstruct/src/depth_camera_cuda.hpp
+++ b/applications/reconstruct/src/depth_camera_cuda.hpp
@@ -10,8 +10,10 @@ namespace cuda {
 
 void clear_depth(const TextureObject<float> &depth, cudaStream_t stream);
 void clear_depth(const TextureObject<int> &depth, cudaStream_t stream);
+void clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream);
 void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream);
+void clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 
 void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream);
 
@@ -21,7 +23,7 @@ void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureO
 
 void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream);
 
-void mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+void mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const ftl::render::SplatParams &params, int numcams, cudaStream_t stream);
 
 void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams &params, cudaStream_t stream);
 
diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
index 9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f..acb86216a7eecd6e10f3ebf3a3b999964b56cf09 100644
--- a/applications/reconstruct/src/dibr.cu
+++ b/applications/reconstruct/src/dibr.cu
@@ -1,12 +1,20 @@
 #include "splat_render_cuda.hpp"
+#include "depth_camera_cuda.hpp"
 #include <cuda_runtime.h>
 
 #include <ftl/cuda_matrix_util.hpp>
 
 #include "splat_params.hpp"
+#include "mls_cuda.hpp"
 #include <ftl/depth_camera.hpp>
 
 #define T_PER_BLOCK 8
+#define UPSAMPLE_FACTOR 1.8f
+#define WARP_SIZE 32
+#define DEPTH_THRESHOLD 0.05f
+#define UPSAMPLE_MAX 60
+#define MAX_ITERATIONS 10
+#define SPATIAL_SMOOTHING 0.01f
 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
@@ -23,55 +31,343 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) {
 	}
 }
 
+__device__ inline bool isStable(const float3 &previous, const float3 &estimate, const SplatParams &params, float d) {
+    const float psize = 2.0f * d / params.camera.fx;
+    //printf("PSIZE %f\n", psize);
+    return fabs(previous.x - estimate.x) <= psize &&
+        fabs(previous.y - estimate.y) <= psize &&
+        fabs(previous.z - estimate.z) <= psize;
+}
 
-__global__ void dibr_depthmap_kernel(
-    TextureObject<int> depth, int numcams, SplatParams params) {
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering so should
+ * at least be plane or sphere fitted if not MLS smoothed onto the actual surface.
+ */
+__global__ void dibr_visibility_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((5.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+            
+    // TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-    const int y = by + threadIdx.y;
-    
-    for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Make an initial estimate of the points location
+		// Use centroid depth as estimate...?
+        float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use current points z as estimate
+		//float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,camPos.z);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+        float ld = nearest.z;
+
+		// TODO: (Nick) Estimate depth using points plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+			float3 output;
+
+            // Use MLS of camera neighbor points to get more exact estimate
+            // Iterate until pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth already is close then it has already been done and can skip this point
+                if (ftl::cuda::mls_point_surface<1>(camera.points, make_int2(x,y), params.m_viewMatrixInverse * nearest, output, SPATIAL_SMOOTHING) <= 0.0f) {
+                    /*const unsigned int cx = screenPos.x;
+                    const unsigned int cy = screenPos.y;
+                    if (cx < depth.width() && cy < depth.height()) {
+                        atomicMax(&depth(cx,cy), 10000.0f);
+                    }*/
+                    break;
+                }
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+				output = params.m_viewMatrix * output;
+
+                // This is essentially the SDF function f(x), only the normal should be estimated also from the weights
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+				const float d = nearest.z + copysignf(0.5f*length(output - nearest), output.z - nearest.z);
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);
+
+                const float2 sp = params.camera.cameraToKinectScreenFloat(output);
+
+                //if (isStable(nearest, output, params, d)) {
+                //if (fabs(sp.x - float(screenPos.x+u)) < 2.0f && fabs(sp.y - float(screenPos.y+v)) < 2.0f) {
+				if (length(output - nearest) <= 2.0f * params.camera.fx / camPos.z) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+                    }
+                    break;
+                }
+
+                /*if (k >= MAX_ITERATIONS-1 && length(output - nearest) <= SPATIAL_SMOOTHING) {
+                    const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+                    if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+						//atomicMin(&depth(cx,cy), d * 1000.0f);
+						printf("ERR = %f, %f\n", fabs(sp.x - float(screenPos.x+u)), fabs(sp.y - float(screenPos.y+v)));
+                    }
+                }*/
+
+                //nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);  // ld + (d - ld)*0.8f
+                ld = d;
+			}
+		//}
+	}
+}
+
+// ------ Alternative for pass 1: principle surfaces ---------------------------
+
+#define NEIGHBOR_RADIUS 1
+#define MAX_NEIGHBORS ((NEIGHBOR_RADIUS*2+1)*(NEIGHBOR_RADIUS*2+1))
+
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering so should
+ * at least be plane or sphere fitted if not MLS smoothed onto the actual surface.
+ */
+ __global__ void dibr_visibility_principal_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	__shared__ float3 neighborhood_cache[2*T_PER_BLOCK][MAX_NEIGHBORS];
+	__shared__ int minimum[2*T_PER_BLOCK];
+	__shared__ int maximum[2*T_PER_BLOCK];
+
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+    // Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+            
+	// TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
-        
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
+	// TODO:(Nick) Preload point neighbors and transform to eye
+	const int lane = threadIdx.x % WARP_SIZE;
+	if (lane == 0) {
+		minimum[warp] = 100000000;
+		maximum[warp] = -100000000;
+	}
+
+	__syncwarp();
+
+	for (int i=lane; i<MAX_NEIGHBORS; i+=WARP_SIZE) {
+		const int u = (i % (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const int v = (i / (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const float3 point = params.m_viewMatrix * make_float3(tex2D<float4>(camera.points, x+u, y+v));
+		neighborhood_cache[warp][i] = point;
+
+		if (length(point - camPos) <= 0.04f) {
+			atomicMin(&minimum[warp], point.z*1000.0f);
+			atomicMax(&maximum[warp], point.z*1000.0f);
+		}
+	}
+
+	__syncwarp();
+	
+	const float interval = (float(maximum[warp])/1000.0f - float(minimum[warp]) / 1000.0f) / float(MAX_ITERATIONS);
+	//if (y == 200) printf("interval: %f\n", interval);
+
+	// TODO:(Nick) Find min and max depths of neighbors to estimate z bounds
+
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Make an initial estimate of the points location
+		// Use centroid depth as estimate...?
+        //float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use current points z as estimate
+		// TODO: Use min point as estimate
+		float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,float(minimum[warp])/1000.0f);
+		
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+		// TODO: (Nick) Estimate depth using points plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+			// Plane based estimate of surface at this pixel
+			//const float3 nearest = rayOrigin + rayDir * camPos.z;
+
+            // Use MLS of camera neighbor points to get more exact estimate
+            // Iterate until pixel is stable on the surface.
+            for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+                // TODO:(Nick) Should perhaps use points from all cameras?
+                // Instead of doing each camera separately...
+                // If the depth already is close then it has already been done and can skip this point
+				const float energy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS>(neighborhood_cache[warp], nearest, SPATIAL_SMOOTHING);
+				
+				if (energy <= 0.0f) break;
+            
+				//ftl::cuda::render_depth(depth, params, output);
+
+                // This is essentially the SDF function f(x), only the normal should be estimated also from the weights
+                //const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+				const float d = nearest.z;
+				nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d+interval);
+				
+				if (energy >= 0.1f) {
+					const unsigned int cx = screenPos.x+u;
+                    const unsigned int cy = screenPos.y+v;
+					if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+                        // Transform estimated point to virtual cam space and output z
+                        atomicMin(&depth(cx,cy), d * 1000.0f);
+					}
+					break;
+				}
+			}
+		//}
+	}
+}
+
+__device__ inline float4 make_float4(const uchar4 &c) {
+    return make_float4(c.x,c.y,c.z,c.w);
+}
+
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+__global__ void dibr_attribute_contrib_kernel(
+        TextureObject<int> depth_in,
+        TextureObject<float4> colour_out,
+        TextureObject<float4> normal_out,
+        TextureObject<float> contrib_out, int cam,
+        SplatParams params) {
         
-        const float3 worldPos = make_float3(d);
-        const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+    const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
 
+    const int upsample = min(UPSAMPLE_MAX, int((10.0f*r) * params.camera.fx / camPos.z));
 
-        if (cx < depth.width() && cy < depth.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-           		//colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-            //}
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+            screenPos.x - upsample >= depth_in.width() || screenPos.y - upsample >= depth_in.height()) return;
             
-            atomicMin(&depth(cx,cy), camPos.z * 1000.0f);
+    // Is this point near the actual surface and therefore a contributor?
+    const float d = ((float)depth_in.tex2D((int)screenPos.x, (int)screenPos.y)/1000.0f);
+    if (abs(d - camPos.z) > DEPTH_THRESHOLD) return;
+
+    // TODO:(Nick) Should just one thread load these to shared mem?
+    const float4 colour = make_float4(tex2D<uchar4>(camera.colour, x, y));
+    const float4 normal = tex2D<float4>(camera.normal, x, y);
+
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+        // Use the depth buffer to determine this pixels 3D position in camera space
+        const float d = ((float)depth_in.tex2D(screenPos.x+u, screenPos.y+v)/1000.0f);
+		const float3 nearest = params.camera.kinectDepthToSkeleton((int)(screenPos.x+u),(int)(screenPos.y+v),d);
+
+        // What is contribution of our current point at this pixel?
+        const float weight = ftl::cuda::spatialWeighting(length(nearest - camPos), SPATIAL_SMOOTHING);
+        if (screenPos.x+u < colour_out.width() && screenPos.y+v < colour_out.height() && weight > 0.0f) {  // TODO: Use confidence threshold here
+            const float4 wcolour = colour * weight;
+            const float4 wnormal = normal * weight;
+
+            // Add this points contribution to the pixel buffer
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v), wcolour.x);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+1, wcolour.y);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+2, wcolour.z);
+            atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+3, wcolour.w);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v), wnormal.x);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+1, wnormal.y);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+2, wnormal.z);
+            atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+3, wnormal.w);
+            atomicAdd(&contrib_out(screenPos.x+u, screenPos.y+v), weight);
         }
-    }
+	}
 }
 
-
-__global__ void dibr_kernel(
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+/*__global__ void dibr_attribute_contrib_kernel(
     TextureObject<int> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
+	TextureObject<uchar4> colour_out,
+	TextureObject<float4> normal_out, int numcams, SplatParams params) {
 
     const int i = threadIdx.y*blockDim.y + threadIdx.x;
     const int bx = blockIdx.x*blockDim.x;
@@ -80,119 +376,89 @@ __global__ void dibr_kernel(
     const int y = by + threadIdx.y;
     
     for (int j=0; j<numcams; ++j) {
-        const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+        const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[j];
 	
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-        //if (d >= params.camera.m_sensorDepthWorldMax) continue;
+        float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+        float r = (camera.poseInverse * worldPos).z;
+        //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), worldPos, 0.02f) < 0.001f) continue;
+        if (worldPos.x == MINF) continue;
         
-        //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
-        
-        const float3 worldPos = make_float3(d);
         const float3 camPos = params.m_viewMatrix * worldPos;
-	    const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-	    const uint2 screenPos = make_uint2(make_int2(screenPosf));
-
-        if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-        const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
+        // Estimate upsample factor using ratio of source depth and output depth
 
+		const int upsample = min(15, (int)(UPSAMPLE_FACTOR * (r / camPos.z))+1);
+		const float upfactor = 2.0f / (float)(upsample);
 
-        if (cx < colour_out.width() && cy < colour_out.height()) {
-            //float camd = depth_in.tex2D((int)cx,(int)cy);
-            //atomicMin(&depth(x,y), idepth);
-            //float camdiff = fabs(camPos.z-camd);
-            //if (camdiff < 0.1f) {
-
-            if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) {
-				   colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-				   //colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255);
+        for (int v=0; v<upsample; ++v) {
+            for (int u=0; u<upsample; ++u) {
+                float3 point;
+                const ftl::cuda::fragment nearest = ftl::cuda::upsampled_point(camera.points, camera.normal, camera.colour,
+                    make_float2((float)x-1.0f+u*upfactor,(float)y-1.0f+v*upfactor));
+                //if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), nearest, point, 0.02f) < 0.001f) continue;
+                ftl::cuda::render_fragment(depth_in, normal_out, colour_out, params, nearest);
             }
-                   
-
-            //} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
         }
     }
-}
-
-__device__ inline float4 make_float4(const uchar4 &c) {
-    return make_float4(c.x,c.y,c.z,c.w);
-}
-
-__global__ void dibr_kernel_rev(
-    TextureObject<float> depth_in,
-    TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
-
-    const int i = threadIdx.y*blockDim.y + threadIdx.x;
-    const int bx = blockIdx.x*blockDim.x;
-    const int by = blockIdx.y*blockDim.y;
-    const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-	
-	float camd = depth_in.tex2D((int)x,(int)y);
-	if (camd < 0.01f)	return;
-	if (camd >= params.camera.m_sensorDepthWorldMax) return;
-	
-	const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd);
-    float mindiff = 1000.0f;
-    float4 col = make_float4(0.0f,0.0f,0.0f,0.0f);
-    int count = 0;
+}*/
 
-    for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
-		
-		const float3 camPos = camera.poseInverse * worldPos;
-	    const float2 screenPosf = camera.params.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
-		
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-		const unsigned int cx = screenPos.x;
-        const unsigned int cy = screenPos.y;
 
-        if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) {
-			float d = tex2D<float>(camera.depth, (int)cx, (int)cy);
-            float camdiff = fabs(camPos.z-d);
-            
-            if (camdiff < mindiff) {
-                mindiff = camdiff;
-                col += make_float4(tex2D<uchar4>(camera.colour,cx,cy));
-                ++count;
-            }
+__global__ void dibr_normalise_kernel(
+        TextureObject<float4> colour_in,
+        TextureObject<uchar4> colour_out,
+        TextureObject<float4> normals,
+        TextureObject<float> contribs) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-            //if (camdiff < 0.1f) {
-            //	colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy);
-            //} else {
-				//colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
-		}
-    }
+	if (x < colour_in.width() && y < colour_in.height()) {
+        const float4 colour = colour_in.tex2D((int)x,(int)y);
+        const float4 normal = normals.tex2D((int)x,(int)y);
+        const float contrib = contribs.tex2D((int)x,(int)y);
 
-    if (count > 0) {
-        col = col / (float)count;
-        colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255);
-    } else {
-        colour_out(x,y) = make_uchar4(76,76,76,255);
-    }
+        if (contrib > 0.0f) {
+            colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
+            normals(x,y) = normal / contrib;
+        }
+	}
 }
 
 void ftl::cuda::dibr(const TextureObject<int> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
-
+        const TextureObject<uchar4> &colour_out,
+        const TextureObject<float4> &normal_out,
+        const TextureObject<float> &confidence_out,
+        const TextureObject<float4> &tmp_colour,
+        int numcams,
+        const SplatParams &params,
+        cudaStream_t stream) {
+
+	const dim3 sgridSize((depth_out.width() + 2 - 1)/2, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 sblockSize(2*WARP_SIZE, T_PER_BLOCK);
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+    ftl::cuda::clear_to_zero(confidence_out, stream);
+    ftl::cuda::clear_colour(tmp_colour, stream);
+    ftl::cuda::clear_colour(normal_out, stream);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
-    dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+    int i=3;
+    // Pass 1, merge a depth map from each camera.
+	for (int i=0; i<numcams; ++i)
+        dibr_visibility_principal_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, i, params);
+
+    // Pass 2, accumulate all point contributions to pixels
+    for (int i=0; i<numcams; ++i)
+        dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, tmp_colour, normal_out, confidence_out, i, params);
+
+    // Pass 3, normalise contributions
+    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(tmp_colour, colour_out, normal_out, confidence_out);
+
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
@@ -200,22 +466,21 @@ void ftl::cuda::dibr(const TextureObject<int> &depth_out,
 #endif
 }
 
-void ftl::cuda::dibr(const TextureObject<float> &depth_out,
-    const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
+void ftl::cuda::dibr_raw(const TextureObject<int> &depth_out,
+    	int numcams, const SplatParams &params, cudaStream_t stream) {
 
     const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-    dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+	//dibr_depthmap_direct_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
 	cudaSafeCall( cudaGetLastError() );
 	
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 }
+
diff --git a/applications/reconstruct/src/mls.cu b/applications/reconstruct/src/mls.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71201a017cf1ebab6e06257e9425403df50427d3
--- /dev/null
+++ b/applications/reconstruct/src/mls.cu
@@ -0,0 +1,273 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_util.hpp>
+#include <ftl/depth_camera.hpp>
+#include "depth_camera_cuda.hpp"
+
+#include "mls_cuda.hpp"
+
+#define T_PER_BLOCK 16
+#define MINF __int_as_float(0xff800000)
+
+using ftl::voxhash::DepthCameraCUDA;
+using ftl::voxhash::HashData;
+using ftl::voxhash::HashParams;
+using ftl::cuda::TextureObject;
+using ftl::render::SplatParams;
+
+extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS];
+extern __constant__ HashParams c_hashParams;
+
+/// ===== MLS Smooth
+
+/*
+ * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
+ * Real-time foreground-background segmentation using codebook model.
+ * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
+ */
+ __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
+	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
+	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
+	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
+	const float p_2 = xv_2 / v_2;
+	return sqrt(x_2 - p_2);
+}
+
+__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
+	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
+	return length(delta);
+}
+
+/*
+ * Colour weighting as suggested in:
+ * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
+ * c = colour distance
+ */
+ __device__ float colourWeighting(float c) {
+	const float h = c_hashParams.m_colourSmoothing;
+	if (c >= h) return 0.0f;
+	float ch = c / h;
+	ch = 1.0f - ch*ch;
+	return ch*ch*ch*ch;
+}
+
+#define WINDOW_RADIUS 5
+
+__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), c_hashParams.m_spatialSmoothing);
+
+				if (weight > 0.0f) {
+					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+					weight *= colourWeighting(colordiffFloat2(c1,c2));
+
+					if (weight > 0.0f) {
+						wpos += weight* (camera.pose * camPos);
+						weights += weight;
+					}
+				}			
+			//}
+		}
+	}
+
+	//wpos += (camera.pose * pos);
+
+	return weights;
+}
+
+__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) {	//on creen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+
+				// TODO:(Nick) dot product of normals < 0 means the point
+				// should be ignored with a weight of 0 since it is facing the wrong direction
+				// May be good to simply weight using the dot product to give
+				// a stronger weight to those whose normals are closer
+
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), h);
+
+				if (weight > 0.0f) {
+					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
+					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
+
+						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+
+						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
+							pos += weight*camPos; // (camera.pose * camPos);
+							weights += weight;
+						}
+					}
+				}			
+			//}
+		}
+	}
+
+	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
+
+	return weights;
+}
+
+
+__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	const DepthCameraCUDA &mainCamera = c_cameras[cam];
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
+				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				//}
+			}
+		}
+	}
+}
+
+void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
+
+
+// ===== Render Depth using MLS ================================================
+
+#define MAX_UPSAMPLE 5
+#define SAMPLE_BUFFER ((2*MAX_UPSAMPLE+1)*(2*MAX_UPSAMPLE+1))
+#define WARP_SIZE 32
+#define BLOCK_WIDTH 4
+#define MLS_RADIUS 5
+#define MLS_WIDTH (2*MLS_RADIUS+1)
+#define MLS_SAMPLES (MLS_WIDTH*MLS_WIDTH)
+
+__global__ void mls_render_depth_kernel(const TextureObject<int> input, TextureObject<int> output, SplatParams params, int numcams) {
+	/*const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				} 
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				//if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				if (screenPos.x < output.width() && screenPos.y < output.height()) {
+					output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				}
+			}
+		}
+	}*/
+}
+
+
+void ftl::cuda::mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const SplatParams &params, int numcams, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_render_depth_kernel<<<gridSize, blockSize, 0, stream>>>(input, output, params, numcams);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
diff --git a/applications/reconstruct/src/mls_cuda.hpp b/applications/reconstruct/src/mls_cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..420b3beb0a5603abbc1a34e5cfeba5b99912408d
--- /dev/null
+++ b/applications/reconstruct/src/mls_cuda.hpp
@@ -0,0 +1,377 @@
+#ifndef _FTL_MLS_CUDA_HPP_
+#define _FTL_MLS_CUDA_HPP_
+
+#include <ftl/cuda_util.hpp>
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+#include "splat_params.hpp"
+
+__device__ inline float3 make_float3(const uchar4 &c) {
+	return make_float3((float)c.x,(float)c.y,(float)c.z);
+}
+
+__device__ inline uchar4 make_uchar4(const float3 &c) {
+	return make_uchar4(c.x,c.y,c.z,255);
+}
+
+namespace ftl {
+namespace cuda {
+
+/*
+ * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACMTransactions on Graphics Vol. 26, No. 3, Article No. 23, 2007.
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras
+ *     r = distance between points
+ *     h = smoothing parameter in meters (default 4cm)
+ */
+__device__ inline float spatialWeighting(float r, float h) {
+	if (r >= h) return 0.0f;
+	float rh = r / h;
+	rh = 1.0f - rh*rh;
+	return rh*rh*rh*rh;
+}
+
+__device__ float colourWeighting(float c);
+
+struct fragment {
+	float3 point;
+	float3 normal;
+	uchar4 colour;
+};
+
+__device__ inline float3 upsampled_point(cudaTextureObject_t pointset, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+
+	// R is the centroid of surrounding points.
+	R /= (D1+D2+D3+D4);
+
+	// FIXME: Should not use centroid but instead sample the surface at this point
+	// Use plane estimate at point to get "centroid" and then do the spatial weighted sample?
+	return R;
+}
+
+__device__ inline fragment upsampled_point(cudaTextureObject_t pointset,
+		cudaTextureObject_t normals, cudaTextureObject_t colours, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+	float3 C = make_float3(0.0f, 0.0f, 0.0f);
+
+	// TODO:(Nick) Don't upsample points if distance is too great
+
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+	N += D1 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y)));
+	C += D1 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y)));
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+	N += D2 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y+1.0f)));
+	C += D2 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y+1.0f)));
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+	N += D3 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y)));
+	C += D3 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y)));
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+	N += D4 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y+1.0f)));
+	C += D4 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y+1.0f)));
+
+	return {R / (D1+D2+D3+D4), N / (D1+D2+D3+D4), make_uchar4(C / (D1+D2+D3+D4))};
+}
+
+__device__ inline void render_depth(ftl::cuda::TextureObject<int> &depth, ftl::render::SplatParams &params, const float3 &worldPos) {
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth.width() && cy < depth.height()) {
+		atomicMin(&depth(cx,cy), d * 1000.0f);
+	}
+}
+
+__device__ inline void render_fragment(
+		ftl::cuda::TextureObject<int> &depth_in,
+		ftl::cuda::TextureObject<float4> &normal_out,
+		ftl::cuda::TextureObject<uchar4> &colour_out,
+		ftl::render::SplatParams &params, const fragment &frag) {
+	const float3 camPos = params.m_viewMatrix * frag.point;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth_in.width() && cy < depth_in.height()) {
+		if (depth_in(cx,cy) == (int)(d * 1000.0f)) {
+			colour_out(cx,cy) = frag.colour;
+			normal_out(cx,cy) = make_float4(frag.normal, 0.0f);
+		}
+	}
+}
+
+/**
+ * Estimate the point set surface location near to a given point.
+ */
+template <int R>
+__device__ float3 screen_centroid(
+		cudaTextureObject_t pointset,
+		const float2 &suv,
+		const int2 &uv,
+		const ftl::render::SplatParams &params,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = params.m_viewMatrix * make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(suv - params.camera.cameraToKinectScreenFloat(samplePoint)), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) pos = pos / weights;
+	return pos;
+}
+
+/**
+ * Estimate a point set surface point from an existing point in the set.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+};
+
+/**
+ * Estimate the point set surface location near to a given point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int R>
+__device__ float mls_point_energy(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				weights += weight;
+			//}
+		}
+	}
+
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int M>
+__device__ float mls_point_energy(
+		const float3 (&pointset)[M],
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int i=0; i<M; ++i) {
+		const float3 samplePoint = pointset[i];
+		const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+		weights += weight;
+	}
+
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near an existing and return also
+ * an estimate of the normal and colour of that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near a given point and return also
+ * an estimate of the normal and colour of that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+}
+}
+
+#endif  // _FTL_MLS_CUDA_HPP_
diff --git a/applications/reconstruct/src/splat_params.hpp b/applications/reconstruct/src/splat_params.hpp
index e38e4447286bf6c70f6c753deed35154b6aafd8a..cb2d3febda3364a3407264711620a25f9dc116a1 100644
--- a/applications/reconstruct/src/splat_params.hpp
+++ b/applications/reconstruct/src/splat_params.hpp
@@ -9,6 +9,7 @@ namespace ftl {
 namespace render {
 
 static const uint kShowBlockBorders = 0x0001;
+static const uint kNoSplatting = 0x0002;
 
 struct __align__(16) SplatParams {
 	float4x4 m_viewMatrix;
@@ -16,6 +17,7 @@ struct __align__(16) SplatParams {
 
 	uint m_flags;
 	float voxelSize;
+	float depthThreshold;
 
 	DepthCameraParams camera;
 };
diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp
index 92d70a1c50d06e985fcb0faa1ef8f0bd5b961f86..1f7a18f66af2a0e6cd30fac3e4c139355028f6db 100644
--- a/applications/reconstruct/src/splat_render.cpp
+++ b/applications/reconstruct/src/splat_render.cpp
@@ -24,9 +24,18 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) {
 		depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
 	}
+	if ((unsigned int)depth3_.width() != camera.width || (unsigned int)depth3_.height() != camera.height) {
+		depth3_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
+	}
 	if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) {
 		colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height);
 	}
+	if ((unsigned int)colour_tmp_.width() != camera.width || (unsigned int)colour_tmp_.height() != camera.height) {
+		colour_tmp_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
+	if ((unsigned int)normal1_.width() != camera.width || (unsigned int)normal1_.height() != camera.height) {
+		normal1_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
 	if ((unsigned int)depth2_.width() != camera.width || (unsigned int)depth2_.height() != camera.height) {
 		depth2_ = ftl::cuda::TextureObject<float>(camera.width, camera.height);
 	}
@@ -56,30 +65,31 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if (scene_->value("voxels", false)) {
 		// TODO:(Nick) Stereo for voxel version
 		ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream);
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
-		ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		//ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
 		src->writeFrames(colour1_, depth2_, stream);
 	} else {
-		//ftl::cuda::clear_colour(colour1_, stream);
 		ftl::cuda::clear_depth(depth1_, stream);
+		ftl::cuda::clear_depth(depth3_, stream);
 		ftl::cuda::clear_depth(depth2_, stream);
-		ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream);
-		//ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream);
+		ftl::cuda::clear_colour(colour2_, stream);
+		ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 
-		// Splat the depth from first DIBR to expand the points
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		// Step 1: Put all points into virtual view to gather them
+		//ftl::cuda::dibr_raw(depth1_, scene_->cameraCount(), params, stream);
 
-		// Alternative to above...
-		//ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream);
-
-
-		// Use reverse sampling to obtain more colour details
-		// Should use nearest neighbor point depths to verify which camera to use
-		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		// Step 2: For each point, use a warp to do MLS and up sample
+		//ftl::cuda::mls_render_depth(depth1_, depth3_, params, scene_->cameraCount(), stream);
 
 		if (src->getChannel() == ftl::rgbd::kChanDepth) {
-			ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
-			src->writeFrames(colour1_, depth2_, stream);
+			//ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+			if (src->value("splatting",  false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		} else if (src->getChannel() == ftl::rgbd::kChanRight) {
 			// Adjust pose to right eye position
 			Eigen::Affine3f transform(Eigen::Translation3f(camera.baseline,0.0f,0.0f));
@@ -88,10 +98,16 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 			params.m_viewMatrixInverse = MatrixConversion::toCUDA(matrix);
 
 			ftl::cuda::clear_depth(depth1_, stream);
-			ftl::cuda::dibr(depth1_, colour2_, scene_->cameraCount(), params, stream);
+			ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 			src->writeFrames(colour1_, colour2_, stream);
 		} else {
-			src->writeFrames(colour1_, depth2_, stream);
+			if (src->value("splatting",  false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		}
 	}
 
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index 7737bb5d903d725a55b81d75812b99b52e7ace66..7511e859ff6471dcba18d5aee5518f55045eef9b 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -33,9 +33,12 @@ class Splatter {
 	private:
 	int device_;
 	ftl::cuda::TextureObject<int> depth1_;
+	ftl::cuda::TextureObject<int> depth3_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
+	ftl::cuda::TextureObject<float4> colour_tmp_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
+	ftl::cuda::TextureObject<float4> normal1_;
 	SplatParams params_;
 	ftl::voxhash::SceneRep *scene_;
 };
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index d8fd9cf55649280c20805c8d4f8b9f7b758b2223..f8cbdbb6bd87f7ea9e2275e000ad75de87e9dd11 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -10,6 +10,80 @@
 namespace ftl {
 namespace cuda {
 
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l, float &t) { 
+    // assuming vectors are all normalized
+    float denom = dot(n, l); 
+    if (denom > 1e-6) {  
+        t = dot(p0 - l0, n) / denom; 
+        return (t >= 0); 
+    } 
+ 
+    return false; 
+}
+
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l, float &t) { 
+    // assuming vectors are all normalized
+    float denom = dot(n, l); 
+    if (denom > 1e-6) {  
+        t = dot(p0, n) / denom; 
+        return (t >= 0); 
+    }
+    return false; 
+}
+
+__device__ inline bool intersectDisk(const float3 &n, const float3 &p0, float radius, const float3 &l0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l0, l, t)) { 
+        float3 p = l0 + l * t; 
+        float3 v = p - p0; 
+        float d2 = dot(v, v); 
+        return (sqrt(d2) <= radius); 
+        // or you can use the following optimisation (and precompute radius^2)
+        // return d2 <= radius2; // where radius2 = radius * radius
+     }
+     return false; 
+}
+
+/**
+ * Get the radius of a ray intersection with a disk.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l0, l, t)) { 
+        const float3 p = l0 + l * t; 
+        const float3 v = p - p0; 
+        const float d2 = dot(v, v); 
+        return sqrt(d2);
+        // or you can use the following optimisation (and precompute radius^2)
+        // return d2 <= radius2; // where radius2 = radius * radius
+     }
+     return PINF; 
+}
+
+/**
+ * Get the radius of a ray intersection with a disk.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l) { 
+    float t = 0; 
+    if (intersectPlane(n, p0, l, t)) { 
+        const float3 p = l * t; 
+        const float3 v = p - p0; 
+        const float d2 = dot(v, v); 
+        return sqrt(d2);
+        // or you can use the following optimisation (and precompute radius^2)
+        // return d2 <= radius2; // where radius2 = radius * radius
+     }
+     return PINF; 
+}
+
 /**
  * NOTE: Not strictly isosurface currently since it includes the internals
  * of objects up to at most truncation depth.
@@ -26,11 +100,22 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 // TODO: isosurface_point_cloud
 
 void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
+		const ftl::cuda::TextureObject<uchar4> &colour_in,
+		const ftl::cuda::TextureObject<float4> &normal_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
-		const ftl::render::SplatParams &params, cudaStream_t stream);
+		const ftl::cuda::TextureObject<uchar4> &colour_out, const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<int> &depth_out,
-		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams,
+		const ftl::cuda::TextureObject<uchar4> &colour_out,
+		const ftl::cuda::TextureObject<float4> &normal_out,
+        const ftl::cuda::TextureObject<float> &confidence_out,
+        const ftl::cuda::TextureObject<float4> &tmp_colour, int numcams,
+		const ftl::render::SplatParams &params, cudaStream_t stream);
+
+/**
+ * Directly render input depth maps to virtual view with clipping.
+ */
+void dibr_raw(const ftl::cuda::TextureObject<int> &depth_out, int numcams,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<float> &depth_out,
diff --git a/applications/reconstruct/src/voxel_hash.cpp b/applications/reconstruct/src/voxel_hash.cpp
index 14cb75078652001275404e87c09f77f0b9d19385..6f929c746d66cae5c382bf15b2d25e26f6b46702 100644
--- a/applications/reconstruct/src/voxel_hash.cpp
+++ b/applications/reconstruct/src/voxel_hash.cpp
@@ -1,4 +1,5 @@
 #include <ftl/voxel_hash.hpp>
+#include <loguru.hpp>
 
 using ftl::voxhash::HashData;
 using ftl::voxhash::HashParams;
diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/voxel_render.cu
similarity index 65%
rename from applications/reconstruct/src/splat_render.cu
rename to applications/reconstruct/src/voxel_render.cu
index 3addf108dcf89ea8f3069e3ba0daea59e67ae918..46eb6f57c031b0541add1dcd982a9c6d1e690140 100644
--- a/applications/reconstruct/src/splat_render.cu
+++ b/applications/reconstruct/src/voxel_render.cu
@@ -258,143 +258,4 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #endif
 }
 
-// ---- Pass 2: Expand the point splats ----------------------------------------
-
-#define SPLAT_RADIUS 7
-#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
-#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 100
-
-__device__ float distance2(float3 a, float3 b) {
-	const float x = a.x-b.x;
-	const float y = a.y-b.y;
-	const float z = a.z-b.z;
-	return x*x+y*y+z*z;
-}
-
-__global__ void splatting_kernel(
-		TextureObject<int> depth_in,
-		TextureObject<float> depth_out, SplatParams params) {
-	// Read an NxN region and
-	// - interpolate a depth value for this pixel
-	// - interpolate an rgb value for this pixel
-	// Must respect depth discontinuities.
-	// How much influence a given neighbour has depends on its depth value
-
-	__shared__ float3 positions[SPLAT_BUFFER_SIZE];
-
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	// const float camMinDepth = params.camera.m_sensorDepthWorldMin;
-	// const float camMaxDepth = params.camera.m_sensorDepthWorldMax;
-
-	for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) {
-		const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-		const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-		if (sx >= depth_in.width() || sy >= depth_in.height()) {
-			positions[j] = make_float3(1000.0f,1000.0f, 1000.0f);
-		} else {
-			positions[j] = params.camera.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f);
-		}
-	}
 
-	__syncthreads();
-
-	if (x >= depth_in.width() || y >= depth_in.height()) return;
-
-	const float voxelSquared = params.voxelSize*params.voxelSize;
-	float mindepth = 1000.0f;
-	int minidx = -1;
-	// float3 minpos;
-
-	//float3 validPos[MAX_VALID];
-	int validIndices[MAX_VALID];
-	int validix = 0;
-
-	for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) {
-		for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) {
-			//const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u;
-			const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS;
-
-			float3 posp = positions[idx];
-			const float d = posp.z;
-			//if (d < camMinDepth || d > camMaxDepth) continue;
-
-			float3 pos = params.camera.kinectDepthToSkeleton(x, y, d);
-			float dist = distance2(pos, posp);
-
-			if (dist < voxelSquared) {
-				// Valid so check for minimum
-				//validPos[validix] = pos;
-				//validIndices[validix++] = idx;
-				if (d < mindepth) {
-					mindepth = d;
-					minidx = idx;
-					// minpos = pos;
-				}	
-			}
-		}
-	}
-
-	if (minidx == -1) {
-		depth_out(x,y) = 0.0f;
-		//colour_out(x,y) = make_uchar4(76,76,82,255);
-		return;
-	}
-
-	//float3 colour = make_float3(0.0f, 0.0f, 0.0f);
-	float depth = 0.0f;
-	float contrib = 0.0f;
-	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth);  // TODO:(Nick) Mindepth assumption is poor choice.
-
-	//for (int j=0; j<validix; ++j) {
-		const int idx = minidx; //validIndices[j];
-		float3 posp = positions[idx];
-		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
-		float3 delta = (posp - pos) / 2*params.voxelSize;
-		float dist = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-
-		// It contributes to pixel
-		if (dist < 1.0f && fabs(posp.z - mindepth) < 2*params.voxelSize) {
-			const unsigned int sx = (idx % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-			const unsigned int sy = (idx / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-
-			// Fast and simple trilinear interpolation
-			float c = fabs((1.0f - delta.x) * (1.0f - delta.y) * (1.0f - delta.z));
-			//uchar4 col = colour_in.tex2D((int)sx, (int)sy);
-			//colour.x += col.x*c;
-			//colour.y += col.y*c;
-			//colour.z += col.z*c;
-			contrib += c;
-			depth += posp.z * c;
-		}
-	//}
-
-	// Normalise
-	//colour.x /= contrib;
-	//colour.y /= contrib;
-	//colour.z /= contrib;
-	depth /= contrib;
-
-	depth_out(x,y) = depth;
-	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
-}
-
-void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
-		const TextureObject<float> &depth_out, const SplatParams &params, cudaStream_t stream) 
-{
-
-	const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
-	cudaSafeCall( cudaGetLastError() );
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
diff --git a/components/common/cpp/include/ftl/configurable.hpp b/components/common/cpp/include/ftl/configurable.hpp
index 5bae67b4a9972d5f69d0947871d5be259df36be3..94a080eaed480fa2e57b113e9c8c3d7bc04388bd 100644
--- a/components/common/cpp/include/ftl/configurable.hpp
+++ b/components/common/cpp/include/ftl/configurable.hpp
@@ -68,7 +68,9 @@ class Configurable {
 	template <typename T>
 	T value(const std::string &name, T def) {
 		auto r = get<T>(name);
-		return (r) ? *r : def;
+		if (r) return *r;
+		(*config_)[name] = def;
+		return def;
 	}
 
 	/**
diff --git a/components/rgbd-sources/include/ftl/rgbd/detail/source.hpp b/components/rgbd-sources/include/ftl/rgbd/detail/source.hpp
index 7557fecb3da30af7a818823904a9b71bba3bc057..b76dd27fac1a7f09a07612629c9d6aae8ef47eb3 100644
--- a/components/rgbd-sources/include/ftl/rgbd/detail/source.hpp
+++ b/components/rgbd-sources/include/ftl/rgbd/detail/source.hpp
@@ -49,10 +49,22 @@ class Source {
 	virtual ~Source() {}
 
 	/**
+	 * Perform hardware data capture.
+	 */
+	virtual bool capture() { return true; };
+
+	/**
+	 * Do any processing from previously captured frames...
 	 * @param n Number of frames to request in batch. Default -1 means automatic (10)
 	 * @param b Bit rate setting. -1 = automatic, 0 = best quality, 9 = lowest quality
 	 */
-	virtual bool grab(int n, int b)=0;
+	virtual bool compute(int n, int b)=0;
+
+	/**
+	 * Between frames, or before next frame, do any buffer swapping operations.
+	 */
+	virtual void swap() {}
+
 	virtual bool isReady() { return false; };
 	virtual void setPose(const Eigen::Matrix4d &pose) { };
 
diff --git a/components/rgbd-sources/include/ftl/rgbd/source.hpp b/components/rgbd-sources/include/ftl/rgbd/source.hpp
index 4efb97175e886ba821f752f7f154eddde0aa1f8f..55cb8554e9eded1b661a89cdf917025369467ad2 100644
--- a/components/rgbd-sources/include/ftl/rgbd/source.hpp
+++ b/components/rgbd-sources/include/ftl/rgbd/source.hpp
@@ -71,19 +71,34 @@ class Source : public ftl::Configurable {
 	/**
 	 * Perform the hardware or virtual frame grab operation. 
 	 */
-	bool grab(int N=-1, int B=-1);
+	bool capture();
+
+	/**
+	 * Between frames, do any required buffer swaps.
+	 */
+	void swap() { if (impl_) impl_->swap(); }
 
 	/**
 	 * Do any post-grab processing. This function
 	 * may take considerable time to return, especially for sources requiring
-	 * software stereo correspondance. If `process` is not called manually
-	 * after a `grab` and before a `get`, then it will be called automatically
-	 * on first `get`.
+	 * software stereo correspondance.
 	 */
-	//void process();
+	bool compute(int N=-1, int B=-1);
+
+	/**
+	 * Wrapper grab that performs capture, swap and computation steps in one.
+	 * It is more optimal to perform capture and compute in parallel.
+	 */
+	bool grab(int N=-1, int B=-1) {
+		bool c = capture();
+		swap();
+		return c && compute(N,B);
+	}
 
 	/**
-	 * Get a copy of both colour and depth frames.
+	 * Get a copy of both colour and depth frames. Note that this does a buffer
+	 * swap rather than a copy, so the parameters should be persistent buffers for
+	 * best performance.
 	 */
 	void getFrames(cv::Mat &c, cv::Mat &d);
 
diff --git a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
index 73d853e9c2fb2fce7728a59c7956136b972c0ec6..b6a860db28fa4f9c11d4ec89eb55e0d8f87b4c2c 100644
--- a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
+++ b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
@@ -12,58 +12,97 @@ using cv::cuda::GpuMat;
 
 FixstarsSGM::FixstarsSGM(nlohmann::json &config) : Disparity(config) {
 	ssgm_ = nullptr;
+
+	int width = value("width", 1280);
+	int height = value("height", 720);
+	
+	size_ = cv::Size(width, height);
+	CHECK((width >= 480) && (height >= 360));
+
 	uniqueness_ = value("uniqueness", 0.95f);
+	P1_ = value("P1", 10);
+	P2_ = value("P2", 120);
+
+	CHECK((uniqueness_ >= 0.0) && (uniqueness_ <= 1.0));
+	CHECK(P1_ >= 0);
+	CHECK(P2_ > P1_);
+
 	use_filter_ = value("use_filter", false);
-	filter_ = cv::cuda::createDisparityBilateralFilter(max_disp_ << 4, value("filter_radius", 25), value("filter_iter", 1));
+	if (use_filter_) {
+		int radius = value("filter_radius", 25);
+		int iter = value("filter_iter", 1);
+		CHECK(radius > 0) << "filter_radius must be greater than 0";
+		CHECK(iter > 0) << "filter_iter must be greater than 0";
+
+		filter_ = cv::cuda::createDisparityBilateralFilter(max_disp_ << 4, radius, iter);
+	}
+
+	init(size_);
 }
 
-void FixstarsSGM::compute(const cv::cuda::GpuMat &l, const cv::cuda::GpuMat &r, cv::cuda::GpuMat &disp, cv::cuda::Stream &stream) {
-	cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGR2GRAY, 0, stream);
-	cv::cuda::cvtColor(r, rbw_, cv::COLOR_BGR2GRAY, 0, stream);
+void FixstarsSGM::init(const cv::Size size) {
+	if (ssgm_) { delete ssgm_; }
+	lbw_ = GpuMat(size, CV_8UC1);
+	rbw_ = GpuMat(size, CV_8UC1);
+	dispt_ = GpuMat(size, CV_16SC1);
 
-	stream.waitForCompletion();
-	if (!ssgm_) { // todo: move to constructor
-		dispt_ = GpuMat(l.rows, l.cols, CV_16SC1);
-		ssgm_ = new sgm::StereoSGM(l.cols, l.rows, max_disp_, 8, 16, lbw_.step, dispt_.step / sizeof(short),
-			sgm::EXECUTE_INOUT_CUDA2CUDA, sgm::StereoSGM::Parameters(10, 120, uniqueness_, true));
+	ssgm_ = new sgm::StereoSGM(size.width, size.height, max_disp_, 8, 16,
+		lbw_.step, dispt_.step / sizeof(short),
+		sgm::EXECUTE_INOUT_CUDA2CUDA,
+		sgm::StereoSGM::Parameters(P1_, P2_, uniqueness_, true)
+	);
+}
+
+void FixstarsSGM::compute(const cv::cuda::GpuMat &l, const cv::cuda::GpuMat &r,
+	cv::cuda::GpuMat &disp, cv::cuda::Stream &stream)
+{
+	if (l.size() != size_) {
+		// re-use same buffer for l/r
+		cv::cuda::resize(r, l_downscaled_, size_, 0.0, 0.0, cv::INTER_CUBIC, stream);
+		cv::cuda::cvtColor(l_downscaled_, rbw_, cv::COLOR_BGR2GRAY, 0, stream);
+		cv::cuda::resize(l, l_downscaled_, size_, 0.0, 0.0, cv::INTER_CUBIC, stream);
+		cv::cuda::cvtColor(l_downscaled_, lbw_, cv::COLOR_BGR2GRAY, 0, stream);
 	}
+	else {
+		cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGR2GRAY, 0, stream);
+		cv::cuda::cvtColor(r, rbw_, cv::COLOR_BGR2GRAY, 0, stream);
+	}
+
+	stream.waitForCompletion();
 
-	//auto start = std::chrono::high_resolution_clock::now();
 	ssgm_->execute(lbw_.data, rbw_.data, dispt_.data);
-	//std::chrono::duration<double> elapsed =
-	//		std::chrono::high_resolution_clock::now() - start;
-	//LOG(INFO) << "CUDA sgm in " << elapsed.count() << "s";
-	
-	// todo: fix libSGM (return float data or provide mask separately)
-	// disparity values set to (256 << 5) in libSGM consistency check 
-	//Mat bad_pixels = (disp == (256 << 5)); 
-	
-	//disp.setTo(0, bad_pixels, stream_);
+
 	GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
 	left_pixels.setTo(0);
-
 	cv::cuda::threshold(dispt_, dispt_, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, stream);
 
+	// TODO: filter could be applied after upscaling (to the upscaled disparity image)
 	if (use_filter_) {
-		// parameters need benchmarking, impact of image
-		// quick tests show with parameters (max_disp_, 25, 3)
-		// roughly 50% in disparity calculation and 50% in filter;
-		filter_->apply(dispt_, l, dispt_, stream);
+		filter_->apply(dispt_,
+			l.size() != dispt_.size() ? l_downscaled_ : l,
+			dispt_,
+			stream
+		);
 	}
-	
-	dispt_.convertTo(disp, CV_32F, 1.0f/16.0f, stream);
+
+	if (l.size() != size_) {
+		cv::cuda::multiply(dispt_, (double)l.cols / (double)size_.width, dispt_);
+		// invalid areas (bad values) have to be taken into account in interpolation
+		cv::cuda::resize(dispt_, dispt_full_res_, l.size(), 0.0, 0.0, cv::INTER_NEAREST, stream);
+	}
+	else {
+		dispt_full_res_ = dispt_;
+	}
+
+	dispt_full_res_.convertTo(disp, CV_32F, 1.0f / 16.0f, stream);
 }
 
 void FixstarsSGM::setMask(Mat &mask) {
 	return; // TODO(Nick) Not needed, but also code below does not work with new GPU pipeline
 	CHECK(mask.type() == CV_8UC1) << "mask type must be CV_8U";
-	
-	if (!ssgm_) { // todo: move to constructor
-		ssgm_ = new sgm::StereoSGM(mask.cols, mask.rows, max_disp_, 8, 16,
-			sgm::EXECUTE_INOUT_HOST2HOST,
-			sgm::StereoSGM::Parameters(10, 120, uniqueness_, true));
-	}
-	
+
+	if (!ssgm_) { init(size_); }
+
 	mask_l_ = mask;
-	ssgm_->setMask((uint8_t*) mask.data, mask.cols);
+	ssgm_->setMask((uint8_t*)mask.data, mask.cols);
 }
\ No newline at end of file
diff --git a/components/rgbd-sources/src/algorithms/fixstars_sgm.hpp b/components/rgbd-sources/src/algorithms/fixstars_sgm.hpp
index 95511cb2cdc9cde452d6ffe47ec4e3c9fb99c2f9..08d01a535494d10469c99e8f627c8c92571753dd 100644
--- a/components/rgbd-sources/src/algorithms/fixstars_sgm.hpp
+++ b/components/rgbd-sources/src/algorithms/fixstars_sgm.hpp
@@ -15,37 +15,45 @@
 #include "ftl/cb_segmentation.hpp"
 
 namespace ftl {
-namespace algorithms {
-
-/**
- * Fixstars libSGM stereo matcher. 
- * @see https://github.com/fixstars/libSGM
- *
- * NOTE: We are using a modified version that supports disparity of 256.
- * @see https://github.com/knicos/libSGM
- */
-class FixstarsSGM : public ftl::rgbd::detail::Disparity {
-	public:
-	explicit FixstarsSGM(nlohmann::json &config);
-
-	void compute(const cv::cuda::GpuMat &l, const cv::cuda::GpuMat &r, cv::cuda::GpuMat &disp, cv::cuda::Stream &stream) override;
-	void setMask(cv::Mat &mask) override;
-	
-	/* Factory creator */
-	static inline Disparity *create(ftl::Configurable *p, const std::string &name) {
-		return ftl::create<FixstarsSGM>(p, name);
-	}
-
-	private:
-	float uniqueness_;
-	bool use_filter_;
-	cv::Ptr<cv::cuda::DisparityBilateralFilter> filter_;
-	sgm::StereoSGM *ssgm_;
-	cv::cuda::GpuMat lbw_;
-	cv::cuda::GpuMat rbw_;
-	cv::cuda::GpuMat dispt_;
-};
-};
+	namespace algorithms {
+
+		/**
+		 * Fixstars libSGM stereo matcher.
+		 * @see https://github.com/fixstars/libSGM
+		 *
+		 * NOTE: We are using a modified version that supports disparity of 256.
+		 * @see https://github.com/knicos/libSGM
+		 */
+		class FixstarsSGM : public ftl::rgbd::detail::Disparity {
+		public:
+			explicit FixstarsSGM(nlohmann::json &config);
+
+			void compute(const cv::cuda::GpuMat &l, const cv::cuda::GpuMat &r, cv::cuda::GpuMat &disp, cv::cuda::Stream &stream) override;
+			void setMask(cv::Mat &mask) override;
+
+			/* Factory creator */
+			static inline Disparity *create(ftl::Configurable *p, const std::string &name) {
+				return ftl::create<FixstarsSGM>(p, name);
+			}
+
+		private:
+			void init(const cv::Size size);
+
+			float uniqueness_;
+			int P1_;
+			int P2_;
+			cv::Size size_;
+			bool use_filter_;
+			cv::Ptr<cv::cuda::DisparityBilateralFilter> filter_;
+			sgm::StereoSGM *ssgm_;
+			cv::cuda::GpuMat lbw_;
+			cv::cuda::GpuMat rbw_;
+			cv::cuda::GpuMat dispt_;
+
+			cv::cuda::GpuMat l_downscaled_;
+			cv::cuda::GpuMat dispt_full_res_;
+		};
+	};
 };
 
 #endif  // _FTL_ALGORITHMS_FIXSTARS_SGM_HPP_
diff --git a/components/rgbd-sources/src/bitrate_settings.hpp b/components/rgbd-sources/src/bitrate_settings.hpp
index 6b6e1508ed87dac7e24f7af94b903782e2d9a3ce..61e3ec7e48447d0a72f7b67afe81437300fbf944 100644
--- a/components/rgbd-sources/src/bitrate_settings.hpp
+++ b/components/rgbd-sources/src/bitrate_settings.hpp
@@ -13,7 +13,7 @@ struct BitrateSetting {
 };
 
 static const BitrateSetting bitrate_settings[] = {
-	1280, 720, 95, 1,
+	1920, 1080, 95, 1,
 	1280, 720, 95, 1,
 	1280, 720, 95, 1,
 	1280, 720, 75, 1,
diff --git a/components/rgbd-sources/src/calibrate.cpp b/components/rgbd-sources/src/calibrate.cpp
index 17090992f36bebe355144e4ce714acf891005463..acc580e2aefdd63a3531e168e02b69c06aa855ca 100644
--- a/components/rgbd-sources/src/calibrate.cpp
+++ b/components/rgbd-sources/src/calibrate.cpp
@@ -53,7 +53,7 @@ Calibrate::Calibrate(nlohmann::json &config, cv::Size image_size, cv::cuda::Stre
 	else {
 		LOG(WARNING) << "Calibration not loaded";
 	}
-	
+
 	this->on("use_intrinsics", [this](const ftl::config::Event &e) {
 		_updateIntrinsics();
 	});
@@ -70,7 +70,7 @@ bool Calibrate::_loadCalibration(cv::Size img_size, std::pair<Mat, Mat> &map1, s
 			LOG(WARNING) << "Could not open intrinsics file";
 			return false;
 		}
-		
+
 		LOG(INFO) << "Intrinsics from: " << *ifile;
 	}
 	else {
@@ -82,7 +82,7 @@ bool Calibrate::_loadCalibration(cv::Size img_size, std::pair<Mat, Mat> &map1, s
 		vector<Mat> K, D;
 		fs["K"] >> K;
 		fs["D"] >> D;
-		
+
 		K[0].copyTo(M1_);
 		K[1].copyTo(M2_);
 		D[0].copyTo(D1_);
@@ -101,9 +101,10 @@ bool Calibrate::_loadCalibration(cv::Size img_size, std::pair<Mat, Mat> &map1, s
 			LOG(WARNING) << "Could not open extrinsics file";
 			return false;
 		}
-		
+
 		LOG(INFO) << "Extrinsics from: " << *efile;
-	} else {
+	}
+	else {
 		LOG(WARNING) << "Calibration extrinsics file not found";
 		return false;
 	}
@@ -115,9 +116,26 @@ bool Calibrate::_loadCalibration(cv::Size img_size, std::pair<Mat, Mat> &map1, s
 	fs["P1"] >> P1_;
 	fs["P2"] >> P2_;
 	fs["Q"] >> Q_;
-
 	img_size_ = img_size;
 
+	// TODO: normalize calibration
+	double scale_x = 1.0 / 1280.0;
+	double scale_y = 1.0 / 720.0;
+
+	Mat scale(cv::Size(3, 3), CV_64F, 0.0);
+	scale.at<double>(0, 0) = (double) img_size.width * scale_x;
+	scale.at<double>(1, 1) = (double) img_size.height * scale_y;
+	scale.at<double>(2, 2) = 1.0;
+
+	M1_ = scale * M1_;
+	M2_ = scale * M2_;
+	P1_ = scale * P1_;
+	P2_ = scale * P2_;
+
+	Q_.at<double>(0, 3) = Q_.at<double>(3, 2) * (double) img_size.width * scale_x;
+	Q_.at<double>(1, 3) = Q_.at<double>(3, 2) * (double) img_size.height * scale_y;
+	Q_.at<double>(3, 3) = Q_.at<double>(3, 3) * (double) img_size.width * scale_x;
+
 	// cv::cuda::remap() works only with CV_32FC1
 	initUndistortRectifyMap(M1_, D1_, R1_, P1_, img_size_, CV_32FC1, map1.first, map2.first);
 	initUndistortRectifyMap(M2_, D2_, R2_, P2_, img_size_, CV_32FC1, map1.second, map2.second);
@@ -128,10 +146,10 @@ bool Calibrate::_loadCalibration(cv::Size img_size, std::pair<Mat, Mat> &map1, s
 void Calibrate::updateCalibration(const ftl::rgbd::Camera &p) {
 	std::pair<Mat, Mat> map1, map2;
 
-	Q_.at<double>(3,2) = 1.0 / p.baseline;
-	Q_.at<double>(2,3) = p.fx;
-	Q_.at<double>(0,3) = p.cx;
-	Q_.at<double>(1,3) = p.cy;
+	Q_.at<double>(3, 2) = 1.0 / p.baseline;
+	Q_.at<double>(2, 3) = p.fx;
+	Q_.at<double>(0, 3) = p.cx;
+	Q_.at<double>(1, 3) = p.cy;
 
 	// FIXME:(Nick) Update camera matrix also...
 	_updateIntrinsics();
@@ -166,7 +184,7 @@ void Calibrate::_updateIntrinsics() {
 
 	initUndistortRectifyMap(M1_, D1_, R1, P1, img_size_, CV_32FC1, map1.first, map2.first);
 	initUndistortRectifyMap(M2_, D2_, R2, P2, img_size_, CV_32FC1, map1.second, map2.second);
-	
+
 	// CHECK Is this thread safe!!!!
 	map1_gpu_.first.upload(map1.first);
 	map1_gpu_.second.upload(map1.second);
@@ -176,7 +194,7 @@ void Calibrate::_updateIntrinsics() {
 
 void Calibrate::rectifyStereo(GpuMat &l, GpuMat &r, Stream &stream) {
 	// cv::cuda::remap() can not use same Mat for input and output
-	
+
 	GpuMat l_tmp(l.size(), l.type());
 	GpuMat r_tmp(r.size(), r.type());
 	cv::cuda::remap(l, l_tmp, map1_gpu_.first, map2_gpu_.first, cv::INTER_LINEAR, 0, cv::Scalar(), stream);
diff --git a/components/rgbd-sources/src/group.cpp b/components/rgbd-sources/src/group.cpp
index f548806d5b78b00428269f1f4c7d7c5bced8a197..88b9afcdca48fd141ca6487637dc7c78f9181d87 100644
--- a/components/rgbd-sources/src/group.cpp
+++ b/components/rgbd-sources/src/group.cpp
@@ -60,7 +60,9 @@ void Group::addSource(ftl::rgbd::Source *src) {
 // Callback returns true if it wishes to continue receiving frames.
 void Group::sync(int N, int B) {
 	for (auto s : sources_) {
-		s->grab(N,B);
+		s->capture();
+		s->swap();
+		s->compute(N,B);
 	}
 }
 
diff --git a/components/rgbd-sources/src/image.hpp b/components/rgbd-sources/src/image.hpp
index b389fd176ec246db3ca51180589673817897eb03..ccb6ede5c93ed32213a89cc65fe90ab5d1cfbc78 100644
--- a/components/rgbd-sources/src/image.hpp
+++ b/components/rgbd-sources/src/image.hpp
@@ -14,7 +14,7 @@ class ImageSource : public ftl::rgbd::detail::Source {
 
 	}
 
-	bool grab(int n, int b) { return false; };
+	bool compute(int n, int b) { return false; };
 	bool isReady() { return false; };
 };
 
diff --git a/components/rgbd-sources/src/middlebury_source.cpp b/components/rgbd-sources/src/middlebury_source.cpp
index d41d4e77b85d8320ef01572dcd6707af68b5b705..a707bf9fab06a212b4146065b414999d21bc9adb 100644
--- a/components/rgbd-sources/src/middlebury_source.cpp
+++ b/components/rgbd-sources/src/middlebury_source.cpp
@@ -160,7 +160,7 @@ void MiddleburySource::_performDisparity() {
 	//disparityToDepthTRUE(depth_, depth_, params_);
 }
 
-bool MiddleburySource::grab(int n, int b) {
+bool MiddleburySource::compute(int n, int b) {
 	//_performDisparity();
 	return true;
 }
diff --git a/components/rgbd-sources/src/middlebury_source.hpp b/components/rgbd-sources/src/middlebury_source.hpp
index 5f0e2be538a069747d633e3e4b9483eb8a7a75b2..25f58b05e7888ded29eed71454eb591fd30babed 100644
--- a/components/rgbd-sources/src/middlebury_source.hpp
+++ b/components/rgbd-sources/src/middlebury_source.hpp
@@ -19,7 +19,7 @@ class MiddleburySource : public detail::Source {
 	MiddleburySource(ftl::rgbd::Source *, const std::string &dir);
 	~MiddleburySource() {};
 
-	bool grab(int n, int b);
+	bool compute(int n, int b);
 	bool isReady() { return ready_; }
 
 	private:
diff --git a/components/rgbd-sources/src/net.cpp b/components/rgbd-sources/src/net.cpp
index 2e6c67a8faf2b8425db708a53dc2495859df466e..12a76cc40d38f876f11cbc4919cf7bcb13a4bee5 100644
--- a/components/rgbd-sources/src/net.cpp
+++ b/components/rgbd-sources/src/net.cpp
@@ -295,7 +295,7 @@ void NetSource::_updateURI() {
 	}
 }
 
-bool NetSource::grab(int n, int b) {
+bool NetSource::compute(int n, int b) {
 	// Choose highest requested number of frames
 	maxN_ = std::max(maxN_,(n == -1) ? ftl::rgbd::detail::kDefaultFrameCount : n);
 
diff --git a/components/rgbd-sources/src/net.hpp b/components/rgbd-sources/src/net.hpp
index b99dc3487a6f4ab1949779935e2ed41699e9f4b0..b3411f12cce2f1e0236475ca75f667429ac5e35d 100644
--- a/components/rgbd-sources/src/net.hpp
+++ b/components/rgbd-sources/src/net.hpp
@@ -24,7 +24,7 @@ class NetSource : public detail::Source {
 	explicit NetSource(ftl::rgbd::Source *);
 	~NetSource();
 
-	bool grab(int n, int b);
+	bool compute(int n, int b);
 	bool isReady();
 
 	void setPose(const Eigen::Matrix4d &pose);
diff --git a/components/rgbd-sources/src/realsense_source.cpp b/components/rgbd-sources/src/realsense_source.cpp
index d6c6f487be89c3eafd4e3e8e6020272dd93e8e9d..df4c0fe2535426ac52808ea985911968efb74e15 100644
--- a/components/rgbd-sources/src/realsense_source.cpp
+++ b/components/rgbd-sources/src/realsense_source.cpp
@@ -41,7 +41,7 @@ RealsenseSource::~RealsenseSource() {
 
 }
 
-bool RealsenseSource::grab(int n, int b) {
+bool RealsenseSource::compute(int n, int b) {
     rs2::frameset frames;
 	if (!pipe_.poll_for_frames(&frames)) return false;  //wait_for_frames();
 
diff --git a/components/rgbd-sources/src/realsense_source.hpp b/components/rgbd-sources/src/realsense_source.hpp
index 2af26bbaffb99081e3311a70cb2c715c8fca5cee..3b2d70c76c28608a3faa96af7c927bc0011804f3 100644
--- a/components/rgbd-sources/src/realsense_source.hpp
+++ b/components/rgbd-sources/src/realsense_source.hpp
@@ -17,7 +17,7 @@ class RealsenseSource : public ftl::rgbd::detail::Source {
 	RealsenseSource(ftl::rgbd::Source *host);
 	~RealsenseSource();
 
-	bool grab(int n=-1, int b=-1);
+	bool compute(int n=-1, int b=-1);
 	bool isReady();
 
 	private:
diff --git a/components/rgbd-sources/src/snapshot_source.cpp b/components/rgbd-sources/src/snapshot_source.cpp
index 030e56dd48bf48bf351917274e2f26cc0e414cd3..aa5268549bbf32de9a311a93e39aeed5e6a13555 100644
--- a/components/rgbd-sources/src/snapshot_source.cpp
+++ b/components/rgbd-sources/src/snapshot_source.cpp
@@ -15,7 +15,10 @@ using std::vector;
 
 SnapshotSource::SnapshotSource(ftl::rgbd::Source *host, SnapshotReader &reader, const string &id) : detail::Source(host) {
     Eigen::Matrix4d pose;
-    reader.getCameraRGBD(id, rgb_, depth_, pose, params_);
+    reader.getCameraRGBD(id, snap_rgb_, snap_depth_, pose, params_);
+
+	rgb_ = snap_rgb_;
+	depth_ = snap_depth_;
 
 	if (rgb_.empty()) LOG(ERROR) << "Did not load snapshot rgb - " << id;
 	if (depth_.empty()) LOG(ERROR) << "Did not load snapshot depth - " << id;
@@ -50,3 +53,9 @@ SnapshotSource::SnapshotSource(ftl::rgbd::Source *host, SnapshotReader &reader,
 
     host->setPose(pose);
 }
+
+bool SnapshotSource::compute(int n, int b) {
+	snap_rgb_.copyTo(rgb_);
+	snap_depth_.copyTo(depth_);
+	return true;
+}
diff --git a/components/rgbd-sources/src/snapshot_source.hpp b/components/rgbd-sources/src/snapshot_source.hpp
index 38b9d875ad9e9846d0c83e42d6b84668cfe7dd18..eb4e58594323b7f1167976b13d8d46c6b96c8fd0 100644
--- a/components/rgbd-sources/src/snapshot_source.hpp
+++ b/components/rgbd-sources/src/snapshot_source.hpp
@@ -17,11 +17,13 @@ class SnapshotSource : public detail::Source {
 	SnapshotSource(ftl::rgbd::Source *, ftl::rgbd::SnapshotReader &reader, const std::string &id);
 	~SnapshotSource() {};
 
-	bool grab(int n, int b) override { return true; };
+	bool compute(int n, int b);
 	bool isReady() { return true; }
 
 	//void reset();
-
+	private:
+	cv::Mat snap_rgb_;
+	cv::Mat snap_depth_;
 };
 
 }
diff --git a/components/rgbd-sources/src/source.cpp b/components/rgbd-sources/src/source.cpp
index 1014e24401b4d2dccdc361de3d4331ff2815eacb..9aee6dfbdbbcbec518ed3c3da879c7aa346c3096 100644
--- a/components/rgbd-sources/src/source.cpp
+++ b/components/rgbd-sources/src/source.cpp
@@ -169,6 +169,14 @@ void Source::getFrames(cv::Mat &rgb, cv::Mat &depth) {
 	//depth_.copyTo(depth);
 	rgb = rgb_;
 	depth = depth_;
+
+	/*cv::Mat tmp;
+	tmp = rgb;
+	rgb = rgb_;
+	rgb_ = tmp;
+	tmp = depth;
+	depth = depth_;
+	depth_ = tmp;*/
 }
 
 Eigen::Vector4d Source::point(uint ux, uint uy) {
@@ -222,14 +230,19 @@ void Source::reset() {
 	impl_ = _createImplementation();
 }
 
-bool Source::grab(int N, int B) {
+bool Source::capture() {
+	if (impl_) return impl_->capture();
+	else return true;
+}
+
+bool Source::compute(int N, int B) {
 	UNIQUE_LOCK(mutex_,lk);
 	if (!impl_ && stream_ != 0) {
 		cudaSafeCall(cudaStreamSynchronize(stream_));
 		if (depth_.type() == CV_32SC1) depth_.convertTo(depth_, CV_32F, 1.0f / 1000.0f);
 		stream_ = 0;
 		return true;
-	} else if (impl_ && impl_->grab(N,B)) {
+	} else if (impl_ && impl_->compute(N,B)) {
 		timestamp_ = impl_->timestamp_;
 		/*cv::Mat tmp;
 		rgb_.create(impl_->rgb_.size(), impl_->rgb_.type());
@@ -240,6 +253,7 @@ bool Source::grab(int N, int B) {
 		tmp = depth_;
 		depth_ = impl_->depth_;
 		impl_->depth_ = tmp;*/
+
 		impl_->rgb_.copyTo(rgb_);
 		impl_->depth_.copyTo(depth_);
 		return true;
@@ -314,7 +328,9 @@ bool Source::thumbnail(cv::Mat &t) {
 		return true;
 	} else if (impl_) {
 		UNIQUE_LOCK(mutex_,lk);
-		impl_->grab(1, 9);
+		impl_->capture();
+		impl_->swap();
+		impl_->compute(1, 9);
 		impl_->rgb_.copyTo(rgb_);
 		impl_->depth_.copyTo(depth_);
 	}
diff --git a/components/rgbd-sources/src/stereovideo.cpp b/components/rgbd-sources/src/stereovideo.cpp
index aa8f8c376296f24f5a8bcbf2a6b3dd5224305f9f..0d20780c275a43a2338fd9d81a6a98f13619c3a2 100644
--- a/components/rgbd-sources/src/stereovideo.cpp
+++ b/components/rgbd-sources/src/stereovideo.cpp
@@ -62,14 +62,14 @@ void StereoVideoSource::init(const string &file) {
 	if (!calib_->isCalibrated()) LOG(WARNING) << "Cameras are not calibrated!";
 
 	// Generate camera parameters from camera matrix
-	cv::Mat q = calib_->getCameraMatrix();
+	cv::Mat K = calib_->getCameraMatrix();
 	params_ = {
-		q.at<double>(0,0),	// Fx
-		q.at<double>(1,1),	// Fy
-		-q.at<double>(0,2),	// Cx
-		-q.at<double>(1,2),	// Cy
-		(unsigned int)lsrc_->width(),
-		(unsigned int)lsrc_->height(),
+		K.at<double>(0,0),	// Fx
+		K.at<double>(1,1),	// Fy
+		-K.at<double>(0,2),	// Cx
+		-K.at<double>(1,2),	// Cy
+		(unsigned int) lsrc_->width(),
+		(unsigned int) lsrc_->height(),
 		0.0f,	// 0m min
 		15.0f,	// 15m max
 		1.0 / calib_->getQ().at<double>(3,2), // Baseline
@@ -115,7 +115,7 @@ void StereoVideoSource::init(const string &file) {
 	mask_l_ = (mask_l == 0);
 	
 	disp_ = Disparity::create(host_, "disparity");
-    if (!disp_) LOG(FATAL) << "Unknown disparity algorithm : " << *host_->get<ftl::config::json_t>("disparity");
+	if (!disp_) LOG(FATAL) << "Unknown disparity algorithm : " << *host_->get<ftl::config::json_t>("disparity");
 	disp_->setMask(mask_l_);
 
 	LOG(INFO) << "StereoVideo source ready...";
@@ -153,11 +153,29 @@ static void disparityToDepth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat
 	cv::cuda::divide(val, disparity, depth, 1.0f / 1000.0f, -1, stream);
 }
 
-bool StereoVideoSource::grab(int n, int b) {	
+bool StereoVideoSource::capture() {
+	lsrc_->get(cap_left_, cap_right_, stream2_);
+	stream2_.waitForCompletion();
+	return true;
+}
+
+void StereoVideoSource::swap() {
+	cv::cuda::GpuMat tmp;
+	tmp = left_;
+	left_ = cap_left_;
+	cap_left_ = tmp;
+	tmp = right_;
+	right_ = cap_right_;
+	cap_right_ = tmp;
+}
+
+bool StereoVideoSource::compute(int n, int b) {	
 	const ftl::rgbd::channel_t chan = host_->getChannel();
 
+	if (left_.empty() || right_.empty()) return false;
+
 	if (chan == ftl::rgbd::kChanDepth) {
-		lsrc_->get(left_, right_, stream_);
+		//lsrc_->get(left_, right_, stream_);
 		if (depth_tmp_.empty()) depth_tmp_ = cv::cuda::GpuMat(left_.size(), CV_32FC1);
 		if (disp_tmp_.empty()) disp_tmp_ = cv::cuda::GpuMat(left_.size(), CV_32FC1);
 		calib_->rectifyStereo(left_, right_, stream_);
@@ -169,13 +187,13 @@ bool StereoVideoSource::grab(int n, int b) {
 
 		stream_.waitForCompletion();  // TODO:(Nick) Move to getFrames
 	} else if (chan == ftl::rgbd::kChanRight) {
-		lsrc_->get(left_, right_, stream_);
+		//lsrc_->get(left_, right_, stream_);
 		calib_->rectifyStereo(left_, right_, stream_);
 		left_.download(rgb_, stream_);
 		right_.download(depth_, stream_);
 		stream_.waitForCompletion();  // TODO:(Nick) Move to getFrames
 	} else {
-		lsrc_->get(left_, right_, stream_);
+		//lsrc_->get(left_, right_, stream_);
 		calib_->rectifyStereo(left_, right_, stream_);
 		//rgb_ = lsrc_->cachedLeft();
 		left_.download(rgb_, stream_);
diff --git a/components/rgbd-sources/src/stereovideo.hpp b/components/rgbd-sources/src/stereovideo.hpp
index 7835742389330046a33b7f22289ac36e8cdab4d5..9d01da4cdebe0ca2bb2a1e91832ca016f7116645 100644
--- a/components/rgbd-sources/src/stereovideo.hpp
+++ b/components/rgbd-sources/src/stereovideo.hpp
@@ -26,7 +26,9 @@ class StereoVideoSource : public detail::Source {
 	StereoVideoSource(ftl::rgbd::Source*, const std::string &);
 	~StereoVideoSource();
 
-	bool grab(int n, int b);
+	void swap();
+	bool capture();
+	bool compute(int n, int b);
 	bool isReady();
 	Camera parameters(channel_t chan);
 
@@ -40,9 +42,12 @@ class StereoVideoSource : public detail::Source {
 	bool ready_;
 	
 	cv::cuda::Stream stream_;
+	cv::cuda::Stream stream2_;
 
 	cv::cuda::GpuMat left_;
 	cv::cuda::GpuMat right_;
+	cv::cuda::GpuMat cap_left_;
+	cv::cuda::GpuMat cap_right_;
 	cv::cuda::GpuMat disp_tmp_;
 	cv::cuda::GpuMat depth_tmp_;
 	
diff --git a/components/rgbd-sources/src/streamer.cpp b/components/rgbd-sources/src/streamer.cpp
index c2c818b4979bcc86f2b0e8b04709e885f3cac0d4..dd4a93d696e7d1af9392ba0176b0f5206ea0d5fb 100644
--- a/components/rgbd-sources/src/streamer.cpp
+++ b/components/rgbd-sources/src/streamer.cpp
@@ -279,6 +279,7 @@ void Streamer::_swap(StreamSource *src) {
 		}
 
 		src->src->getFrames(src->rgb, src->depth);
+		src->src->swap();
 
 		//if (!src->rgb.empty() && src->prev_depth.empty()) {
 			//src->prev_depth = cv::Mat(src->rgb.size(), CV_16UC1, cv::Scalar(0));
@@ -324,14 +325,14 @@ void Streamer::_schedule() {
 
 		// There will be two jobs for this source...
 		//UNIQUE_LOCK(job_mtx_,lk);
-		jobs_ += 1 + kChunkCount;
+		jobs_ += 2 + kChunkCount;
 		//lk.unlock();
 
 		StreamSource *src = sources_[uri];
 		if (src == nullptr || src->jobs != 0) continue;
-		src->jobs = 1 + kChunkCount;
+		src->jobs = 2 + kChunkCount;
 
-		// Grab job
+		// Grab / capture job
 		ftl::pool.push([this,src](int id) {
 			//auto start = std::chrono::high_resolution_clock::now();
 
@@ -363,8 +364,7 @@ void Streamer::_schedule() {
 			last_frame_ = now/mspf_;
 
 			try {
-				// FIXME: Grab should do the blocking, not some other thread
-				src->src->grab();
+				src->src->capture();
 			} catch (std::exception &ex) {
 				LOG(ERROR) << "Exception when grabbing frame";
 				LOG(ERROR) << ex.what();
@@ -382,6 +382,27 @@ void Streamer::_schedule() {
 			if (jobs_ == 0) job_cv_.notify_one();
 		});
 
+		// Compute job
+		ftl::pool.push([this,src](int id) {
+			try {
+				src->src->compute();
+			} catch (std::exception &ex) {
+				LOG(ERROR) << "Exception when computing frame";
+				LOG(ERROR) << ex.what();
+			}
+			catch (...) {
+				LOG(ERROR) << "Unknown exception when computing frame";
+			}
+
+			src->jobs--;
+			_swap(src);
+
+			// Mark job as finished
+			std::unique_lock<std::mutex> lk(job_mtx_);
+			--jobs_;
+			job_cv_.notify_one();
+		});
+
 		// Create jobs for each chunk
 		for (int i=0; i<kChunkCount; ++i) {
 			// Add chunk job to thread pool
diff --git a/components/rgbd-sources/test/source_unit.cpp b/components/rgbd-sources/test/source_unit.cpp
index b27b72e070f6e2116632b967699243a9e80926d8..1c80e8292ff27e3c2652e7453dff59a1acea7bda 100644
--- a/components/rgbd-sources/test/source_unit.cpp
+++ b/components/rgbd-sources/test/source_unit.cpp
@@ -26,7 +26,7 @@ class ImageSource : public ftl::rgbd::detail::Source {
 		last_type = "image";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };
 
@@ -39,7 +39,7 @@ class StereoVideoSource : public ftl::rgbd::detail::Source {
 		last_type = "video";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };
 
@@ -49,7 +49,7 @@ class NetSource : public ftl::rgbd::detail::Source {
 		last_type = "net";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };
 
@@ -59,7 +59,7 @@ class SnapshotSource : public ftl::rgbd::detail::Source {
 		last_type = "snapshot";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };
 
@@ -69,7 +69,7 @@ class RealsenseSource : public ftl::rgbd::detail::Source {
 		last_type = "realsense";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };
 
@@ -79,7 +79,7 @@ class MiddleburySource : public ftl::rgbd::detail::Source {
 		last_type = "middlebury";
 	}
 
-	bool grab(int n, int b) { return true; };
+	bool compute(int n, int b) { return true; };
 	bool isReady() { return true; };
 };