diff --git a/CMakeLists.txt b/CMakeLists.txt index 8db5a8886616aa2b6d829e9090d40cbd973e4d88..6cb60d3926d30ee89edf8b98d5480d1383ff950d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,8 +96,8 @@ check_language(CUDA) if (CUDA_TOOLKIT_ROOT_DIR) enable_language(CUDA) set(CMAKE_CUDA_FLAGS "") -set(CMAKE_CUDA_FLAGS_DEBUG "-g -DDEBUG -D_DEBUG") -set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "--gpu-architecture=compute_61 -g -DDEBUG -D_DEBUG") +set(CMAKE_CUDA_FLAGS_RELEASE "--gpu-architecture=compute_61") set(HAVE_CUDA TRUE) include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt index 00e44a444ba45f6cb7b4f68ed2441a1c3ac4fd4c..ae06aab6d142ba8a2ad3a1e0970905e3a70468e9 100644 --- a/applications/reconstruct/CMakeLists.txt +++ b/applications/reconstruct/CMakeLists.txt @@ -19,6 +19,8 @@ set(REPSRC #src/virtual_source.cpp src/splat_render.cpp src/dibr.cu + src/depth_camera.cu + src/depth_camera.cpp ) add_executable(ftl-reconstruct ${REPSRC}) diff --git a/applications/reconstruct/include/ftl/depth_camera.hpp b/applications/reconstruct/include/ftl/depth_camera.hpp index f40581b65cd71b9f239b2c0e82dfd616deb7ff28..83d9e1bcbedf42025f569068332008251e5651b9 100644 --- a/applications/reconstruct/include/ftl/depth_camera.hpp +++ b/applications/reconstruct/include/ftl/depth_camera.hpp @@ -23,7 +23,10 @@ namespace voxhash { struct __align__(16) DepthCameraCUDA { cudaTextureObject_t depth; + cudaTextureObject_t depth2; + cudaTextureObject_t points; cudaTextureObject_t colour; + cudaTextureObject_t normal; DepthCameraParams params; float4x4 pose; float4x4 poseInverse; @@ -35,58 +38,23 @@ struct DepthCamera { // Host part // /////////////// - __host__ - DepthCamera() { - /*d_depthData = NULL; - d_colorData = NULL; - d_depthArray = NULL; - d_colorArray = NULL;*/ - - depth_mat_ = nullptr; - colour_mat_ = nullptr; - depth_tex_ = nullptr; - colour_tex_ = nullptr; - } - - __host__ - void alloc(const DepthCameraParams& params) { //! todo resizing??? 
- depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1); - colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4); - depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_); - colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_); - data.depth = depth_tex_->cudaTexture(); - data.colour = colour_tex_->cudaTexture(); - data.params = params; - } - - //__host__ - //void updateParams(const DepthCameraParams& params) { - // updateConstantDepthCameraParams(params); - //} - - __host__ - void updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream) { - depth_mat_->upload(depth, stream); - colour_mat_->upload(rgb, stream); - } - - __host__ - void free() { - if (depth_mat_) delete depth_mat_; - if (colour_mat_) delete colour_mat_; - delete depth_tex_; - delete colour_tex_; - } - - - // TODO(Nick) Should not need to pass all these pointers to device - cv::cuda::GpuMat *depth_mat_; - cv::cuda::GpuMat *colour_mat_; + __host__ DepthCamera(); + + __host__ void alloc(const DepthCameraParams& params, bool withNormals=false); + + __host__ void updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream); + + __host__ void free(); + + __host__ void _computeNormals(cudaStream_t stream); + ftl::cuda::TextureObject<float> *depth_tex_; + ftl::cuda::TextureObject<int> *depth2_tex_; + ftl::cuda::TextureObject<float4> *points_tex_; ftl::cuda::TextureObject<uchar4> *colour_tex_; - //cudaTextureObject_t depth_obj_; - //cudaTextureObject_t colour_obj_; + ftl::cuda::TextureObject<float4> *normal_tex_; + // This part is sent to device DepthCameraCUDA data; }; } diff --git a/applications/reconstruct/include/ftl/voxel_hash.hpp b/applications/reconstruct/include/ftl/voxel_hash.hpp index c0a228834337e60f5807313735eedee25eb51dcb..98c2eca90530c309b5d2e46848493634aaaa053c 100644 --- a/applications/reconstruct/include/ftl/voxel_hash.hpp +++ b/applications/reconstruct/include/ftl/voxel_hash.hpp @@ -37,7 +37,7 @@ typedef signed char schar; #include <ftl/depth_camera.hpp> #define SDF_BLOCK_SIZE 8 -#define SDF_BLOCK_SIZE_OLAP 7 +#define SDF_BLOCK_SIZE_OLAP 8 #ifndef MINF #define MINF __int_as_float(0xff800000) @@ -73,6 +73,7 @@ struct __align__(16) HashEntry { HashEntryHead head; uint voxels[16]; // 512 bits, 1 bit per voxel + //uint validity[16]; // Is the voxel valid, 512 bit /*__device__ void operator=(const struct HashEntry& e) { ((long long*)this)[0] = ((const long long*)&e)[0]; @@ -270,11 +271,13 @@ struct HashData { } __device__ - bool isSDFBlockInCameraFrustumApprox(const HashParams &hashParams, const DepthCameraParams &camera, const int3& sdfBlock) { + bool isInBoundingBox(const HashParams &hashParams, const int3& sdfBlock) { // NOTE (Nick): Changed, just assume all voxels are potentially in frustrum //float3 posWorld = virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock)) + hashParams.m_virtualVoxelSize * 0.5f * (SDF_BLOCK_SIZE - 1.0f); //return camera.isInCameraFrustumApprox(hashParams.m_rigidTransformInverse, posWorld); - return true; + return !(hashParams.m_flags & ftl::voxhash::kFlagClipping) || sdfBlock.x > hashParams.m_minBounds.x && sdfBlock.x < hashParams.m_maxBounds.x && + sdfBlock.y > hashParams.m_minBounds.y && sdfBlock.y < hashParams.m_maxBounds.y && + sdfBlock.z > hashParams.m_minBounds.z && sdfBlock.z < hashParams.m_maxBounds.z; } //! 
computes the (local) virtual voxel pos of an index; idx in [0;511] diff --git a/applications/reconstruct/include/ftl/voxel_hash_params.hpp b/applications/reconstruct/include/ftl/voxel_hash_params.hpp index cb94404c3602cd237fed4bd683e7196b3adf4ab6..480e16d478a7a3c82d046f6de464d7bb20c04f64 100644 --- a/applications/reconstruct/include/ftl/voxel_hash_params.hpp +++ b/applications/reconstruct/include/ftl/voxel_hash_params.hpp @@ -12,35 +12,29 @@ namespace ftl { namespace voxhash { +static const unsigned int kFlagClipping = 0x00000001; +static const unsigned int kFlagMLS = 0x00000002; + //TODO might have to be split into static and dynamics struct __align__(16) HashParams { HashParams() { } - float4x4 m_rigidTransform; - float4x4 m_rigidTransformInverse; - unsigned int m_hashNumBuckets; - unsigned int m_deprecated1; - unsigned int m_deprecated2; //m_hashMaxCollisionLinkedListSize; - unsigned int m_numSDFBlocks; - - int m_SDFBlockSize; float m_virtualVoxelSize; - unsigned int m_numOccupiedBlocks; //occupied blocks in the viewing frustum - float m_maxIntegrationDistance; float m_truncScale; float m_truncation; unsigned int m_integrationWeightSample; unsigned int m_integrationWeightMax; - float3 m_streamingVoxelExtents; - int3 m_streamingGridDimensions; - int3 m_streamingMinGridPos; - unsigned int m_streamingInitialChunkListSize; - uint2 m_dummy; + int3 m_minBounds; + int3 m_maxBounds; + float m_spatialSmoothing; + float m_colourSmoothing; + float m_confidenceThresh; + unsigned int m_flags; }; } // namespace voxhash diff --git a/applications/reconstruct/src/depth_camera.cpp b/applications/reconstruct/src/depth_camera.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a5291f932ce0681a8d36aa9ebcfb08bc292a411 --- /dev/null +++ b/applications/reconstruct/src/depth_camera.cpp @@ -0,0 +1,54 @@ +#include <ftl/depth_camera.hpp> +#include "depth_camera_cuda.hpp" +#include <opencv2/core/cuda_stream_accessor.hpp> + +using ftl::voxhash::DepthCamera; +using ftl::voxhash::DepthCameraCUDA; + +DepthCamera::DepthCamera() { + depth_tex_ = nullptr; + depth2_tex_ = nullptr; + points_tex_ = nullptr; + colour_tex_ = nullptr; + normal_tex_ = nullptr; +} + +void DepthCamera::alloc(const DepthCameraParams& params, bool withNormals) { //! todo resizing??? 
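+	// Allocate screen-sized texture objects: float depth, integer depth (used as an atomicMin target
+	// when splatting), world-space points and RGBA colour. The normal texture is only created when withNormals is set.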
+ depth_tex_ = new ftl::cuda::TextureObject<float>(params.m_imageWidth, params.m_imageHeight); + depth2_tex_ = new ftl::cuda::TextureObject<int>(params.m_imageWidth, params.m_imageHeight); + points_tex_ = new ftl::cuda::TextureObject<float4>(params.m_imageWidth, params.m_imageHeight); + colour_tex_ = new ftl::cuda::TextureObject<uchar4>(params.m_imageWidth, params.m_imageHeight); + data.depth = depth_tex_->cudaTexture(); + data.depth2 = depth2_tex_->cudaTexture(); + data.points = points_tex_->cudaTexture(); + data.colour = colour_tex_->cudaTexture(); + data.params = params; + + if (withNormals) { + normal_tex_ = new ftl::cuda::TextureObject<float4>(params.m_imageWidth, params.m_imageHeight); + data.normal = normal_tex_->cudaTexture(); + } else { + data.normal = 0; + } +} + +void DepthCamera::free() { + delete depth_tex_; + delete colour_tex_; + delete depth2_tex_; + delete points_tex_; + if (normal_tex_) delete normal_tex_; +} + +void DepthCamera::updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream) { + depth_tex_->upload(depth, cv::cuda::StreamAccessor::getStream(stream)); + colour_tex_->upload(rgb, cv::cuda::StreamAccessor::getStream(stream)); + //if (normal_mat_) { + // _computeNormals(cv::cuda::StreamAccessor::getStream(stream)); + //} +} + +void DepthCamera::_computeNormals(cudaStream_t stream) { + //ftl::cuda::point_cloud((float3*)point_mat_->data, data, stream); + //ftl::cuda::compute_normals((float3*)point_mat_->data, normal_tex_, stream); +} diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu new file mode 100644 index 0000000000000000000000000000000000000000..8490c38b2422bdd1834244bf5acf974a875b3a1e --- /dev/null +++ b/applications/reconstruct/src/depth_camera.cu @@ -0,0 +1,598 @@ +#include <ftl/cuda_common.hpp> +#include <ftl/cuda_util.hpp> +#include <ftl/depth_camera.hpp> +#include "depth_camera_cuda.hpp" + +#define T_PER_BLOCK 16 +#define MINF __int_as_float(0xff800000) + +using ftl::voxhash::DepthCameraCUDA; +using ftl::voxhash::HashData; +using ftl::voxhash::HashParams; +using ftl::cuda::TextureObject; +using ftl::render::SplatParams; + +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +extern __constant__ HashParams c_hashParams; + +__global__ void clear_depth_kernel(ftl::cuda::TextureObject<float> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = 1000.0f; //PINF; + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_depth_kernel(ftl::cuda::TextureObject<int> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = 0x7FFFFFFF; //PINF; + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const 
dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = make_float4(MINF,MINF,MINF,MINF); + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_points_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_colour_kernel(ftl::cuda::TextureObject<uchar4> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = make_uchar4(76,76,76,76); + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +// ===== Type convert ===== + +template <typename A, typename B> +__global__ void convert_kernel(const ftl::cuda::TextureObject<A> in, ftl::cuda::TextureObject<B> out, float scale) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < in.width() && y < in.height()) { + out(x,y) = ((float)in.tex2D((int)x,(int)y)) * scale; + } +} + +void ftl::cuda::float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureObject<int> &out, float scale, cudaStream_t stream) { + const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + convert_kernel<float,int><<<gridSize, blockSize, 0, stream>>>(in, out, scale); +} + +void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, float scale, cudaStream_t stream) { + const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale); +} + +/// ===== MLS Smooth + +// TODO:(Nick) Put this in a common location (used in integrators.cu) +extern __device__ float spatialWeighting(float r); + +/* + * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005). + * Real-time foreground-background segmentation using codebook model. + * Real-Time Imaging. 
https://doi.org/10.1016/j.rti.2004.12.004 + */ + __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) { + const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z; + const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z; + const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2); + const float p_2 = xv_2 / v_2; + return sqrt(x_2 - p_2); +} + +__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) { + float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z); + return length(delta); +} + +/* + * Colour weighting as suggested in: + * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014. + * c = colour distance + */ + __device__ float colourWeighting(float c) { + const float h = c_hashParams.m_colourSmoothing; + if (c >= h) return 0.0f; + float ch = c / h; + ch = 1.0f - ch*ch; + return ch*ch*ch*ch; +} + +#define WINDOW_RADIUS 9 + +__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float weights = 0.0f; + + //#pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + //if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + float weight = spatialWeighting(length(pf - camPos)); + + if (weight > 0.0f) { + uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v); + weight *= colourWeighting(colordiffFloat2(c1,c2)); + + if (weight > 0.0f) { + wpos += weight* (camera.pose * camPos); + weights += weight; + } + } + //} + } + } + + //wpos += (camera.pose * pos); + + return weights; +} + +__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float weights = 0.0f; + + //#pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + //if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + float weight = spatialWeighting(length(pf - camPos)); + + if (weight > 0.0f) { + uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v); + weight *= colourWeighting(colordiffFloat2(c1,c2)); + + if (weight > weights) { + pos = weight* (camera.pose * camPos); + weights = weight; + } + } + //} + } + } + + wpos += pos; + //wpos += (camera.pose * pos); + + return weights; +} + +__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float 
weights = 0.0f; + + + //float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y); + const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y)); + if (worldPos.z == MINF) return 0.0f; + + float weight = spatialWeighting(length(mPos - worldPos)); + + if (weight > 0.0f) { + wpos += weight* (worldPos); + weights += weight; + } + + return weights; +} + +__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = output.width(); + const int height = output.height(); + + const DepthCameraCUDA &mainCamera = c_cameras[cam]; + + if (x < width && y < height) { + + const float depth = tex2D<float>(mainCamera.depth, x, y); + const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y); + + float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + float weights = 0.0f; + + if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) { + float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth); + + if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x && + mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y && + mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) { + + if (hashParams.m_flags & ftl::voxhash::kFlagMLS) { + for (uint cam2=0; cam2<numcams; ++cam2) { + if (cam2 == cam) continue; //weights += mlsCamera(cam2, mPos, c1, wpos); + weights += mlsCameraBest(cam2, mPos, c1, wpos); + } + wpos /= weights; + } else { + weights = 1000.0f; + wpos = mPos; + } + + //output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF); + + const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos)); + if (screenPos.x < output.width() && screenPos.y < output.height()) { + output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? 
make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF); + } + } + } + } +} + +void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) { + const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + +#define RESAMPLE_RADIUS 7 + +__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = depthin.width(); + const int height = depthin.height(); + + if (x < width && y < height) { + + //const int depth = depthin.tex2D((int)x, (int)y); + //if (depth != 0x7FFFFFFF) { + // depthout(x,y) = (float)depth / 1000.0f; + // return; + //} + + struct map_t { + int d; + int quad; + }; + + map_t mappings[5]; + int mapidx = 0; + + for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) { + for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) { + + const int depth = depthin.tex2D((int)x+u, (int)y+v); + const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v); + + if (depth != 0x7FFFFFFF) { + int i=0; + for (i=0; i<mapidx; ++i) { + if (abs(mappings[i].d - depth) < 100) { + if (u < 0 && v < 0) mappings[i].quad |= 0x1; + if (u > 0 && v < 0) mappings[i].quad |= 0x2; + if (u > 0 && v > 0) mappings[i].quad |= 0x4; + if (u < 0 && v > 0) mappings[i].quad |= 0x8; + break; + } + } + if (i == mapidx && i < 5) { + mappings[mapidx].d = depth; + mappings[mapidx].quad = 0; + if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1; + if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2; + if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4; + if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8; + ++mapidx; + } else { + //printf("EXCEEDED\n"); + } + } + } + } + + int bestdepth = 1000000; + //int count = 0; + for (int i=0; i<mapidx; ++i) { + if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d; + //if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d; + //if (mappings[i].quad == 15) count ++; + } + + //depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f; + + if (bestdepth < 1000000) { + depthout(x,y) = (float)bestdepth / 1000.0f; + } + } +} + +void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams ¶ms, cudaStream_t stream) { + const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + + +/// ===== Median Filter ====== + +#define WINDOW_SIZE 3 +#define MEDIAN_RADIUS 3 +#define MEDIAN_SIZE (((MEDIAN_RADIUS*2)+1)*((MEDIAN_RADIUS*2)+1)) + +__global__ void medianFilterKernel(TextureObject<int> inputImageKernel, TextureObject<float> outputImagekernel) +{ + // Set row and colum for thread. 
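+	// Brute-force median: gather the (2*MEDIAN_RADIUS+1)^2 window of integer depths (millimetres),
+	// sort it in place and write the middle value out in metres.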
+	int row = blockIdx.y * blockDim.y + threadIdx.y;
+	int col = blockIdx.x * blockDim.x + threadIdx.x;
+	int filterVector[MEDIAN_SIZE] = {0};   //Take filter window
+	if((row<=MEDIAN_RADIUS) || (col<=MEDIAN_RADIUS) || (row>=inputImageKernel.height()-MEDIAN_RADIUS) || (col>=inputImageKernel.width()-MEDIAN_RADIUS))
+		outputImagekernel(col, row) = 0.0f; //Deal with boundary conditions
+	else {
+		for (int v = -MEDIAN_RADIUS; v <= MEDIAN_RADIUS; v++) {
+			for (int u = -MEDIAN_RADIUS; u <= MEDIAN_RADIUS; u++){
+				filterVector[(v+MEDIAN_RADIUS)*(2*MEDIAN_RADIUS+1)+u+MEDIAN_RADIUS] = inputImageKernel((col+u), (row+v));   // setup the filtering window.
+			}
+		}
+		for (int i = 0; i < MEDIAN_SIZE; i++) {
+			for (int j = i + 1; j < MEDIAN_SIZE; j++) {
+				if (filterVector[i] > filterVector[j]) {
+					//Swap the variables (int, since millimetre depths exceed char range).
+					int tmp = filterVector[i];
+					filterVector[i] = filterVector[j];
+					filterVector[j] = tmp;
+				}
+			}
+		}
+		outputImagekernel(col, row) = (float)filterVector[MEDIAN_SIZE/2] / 1000.0f;   //Set the output value (median of the sorted window).
+	}
+}
+
+void ftl::cuda::median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream) {
+	const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	medianFilterKernel<<<gridSize, blockSize, 0, stream>>>(in, out);
+}
+
+
+/// ===== Hole Fill =====
+
+__device__ inline float distance2(float3 a, float3 b) {
+	const float x = a.x-b.x;
+	const float y = a.y-b.y;
+	const float z = a.z-b.z;
+	return x*x+y*y+z*z;
+}
+
+#define SPLAT_RADIUS 7
+#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
+#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
+
+__global__ void hole_fill_kernel(
+		TextureObject<int> depth_in,
+		TextureObject<float> depth_out, DepthCameraParams params) {
+	// Read an NxN region and
+	// - interpolate a depth value for this pixel
+	// - interpolate an rgb value for this pixel
+	// Must respect depth discontinuities.
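+	// Approach: stage a SPLAT_BOUNDS x SPLAT_BOUNDS tile of back-projected positions in shared memory,
+	// then keep the smallest neighbouring depth whose position lies within one voxel of where this pixel would back-project at that depth.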
+ // How much influence a given neighbour has depends on its depth value + + __shared__ float3 positions[SPLAT_BUFFER_SIZE]; + + const float voxelSize = c_hashParams.m_virtualVoxelSize; + + const int i = threadIdx.y*blockDim.y + threadIdx.x; + const int bx = blockIdx.x*blockDim.x; + const int by = blockIdx.y*blockDim.y; + const int x = bx + threadIdx.x; + const int y = by + threadIdx.y; + + // const float camMinDepth = params.camera.m_sensorDepthWorldMin; + // const float camMaxDepth = params.camera.m_sensorDepthWorldMax; + + for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) { + const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS; + const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS; + if (sx >= depth_in.width() || sy >= depth_in.height()) { + positions[j] = make_float3(1000.0f,1000.0f, 1000.0f); + } else { + positions[j] = params.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f); + } + } + + __syncthreads(); + + if (x >= depth_in.width() || y >= depth_in.height()) return; + + const float voxelSquared = voxelSize*voxelSize; + float mindepth = 1000.0f; + //int minidx = -1; + // float3 minpos; + + //float3 validPos[MAX_VALID]; + //int validIndices[MAX_VALID]; + //int validix = 0; + + for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) { + for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) { + //const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u; + const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS; + + float3 posp = positions[idx]; + const float d = posp.z; + //if (d < camMinDepth || d > camMaxDepth) continue; + + float3 pos = params.kinectDepthToSkeleton(x, y, d); + float dist = distance2(pos, posp); + + if (dist < voxelSquared) { + // Valid so check for minimum + //validPos[validix] = pos; + //validIndices[validix++] = idx; + if (d < mindepth) { + mindepth = d; + //minidx = idx; + // minpos = pos; + } + } + } + } + + depth_out(x,y) = mindepth; +} + +void ftl::cuda::hole_fill(const TextureObject<int> &depth_in, + const TextureObject<float> &depth_out, const DepthCameraParams ¶ms, cudaStream_t stream) +{ + + const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + hole_fill_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params); + cudaSafeCall( cudaGetLastError() ); + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + #endif +} + + +/// ===== Point cloud from depth ===== + +__global__ void point_cloud_kernel(float3* output, DepthCameraCUDA depthCameraData) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = depthCameraData.params.m_imageWidth; + const int height = depthCameraData.params.m_imageHeight; + + if (x < width && y < height) { + float depth = tex2D<float>(depthCameraData.depth, x, y); + + output[y*width+x] = (depth >= depthCameraData.params.m_sensorDepthWorldMin && depth <= depthCameraData.params.m_sensorDepthWorldMax) ? 
+ depthCameraData.params.kinectDepthToSkeleton(x, y, depth) : + make_float3(MINF, MINF, MINF); + } +} + +void ftl::cuda::point_cloud(float3* output, const DepthCameraCUDA &depthCameraData, cudaStream_t stream) { + const dim3 gridSize((depthCameraData.params.m_imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthCameraData.params.m_imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + point_cloud_kernel<<<gridSize, blockSize, 0, stream>>>(output, depthCameraData); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + +/// ===== NORMALS ===== + + +__global__ void compute_normals_kernel(const float3 *input, ftl::cuda::TextureObject<float4> output) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = output.width(); + + if(x >= output.width() || y >= output.height()) return; + + output(x,y) = make_float4(MINF, MINF, MINF, MINF); + + if(x > 0 && x < output.width()-1 && y > 0 && y < output.height()-1) + { + // TODO:(Nick) Should use a 7x7 window + const float3 CC = input[(y+0)*width+(x+0)]; + const float3 PC = input[(y+1)*width+(x+0)]; + const float3 CP = input[(y+0)*width+(x+1)]; + const float3 MC = input[(y-1)*width+(x+0)]; + const float3 CM = input[(y+0)*width+(x-1)]; + + if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF) + { + const float3 n = cross(PC-MC, CP-CM); + //const float l = length(n); + + //if(l > 0.0f) + //{ + output(x,y) = make_float4(n, 1.0f); //make_float4(n/-l, 1.0f); + //} + } + } +} + +void ftl::cuda::compute_normals(const float3 *input, ftl::cuda::TextureObject<float4> *output, cudaStream_t stream) { + const dim3 gridSize((output->width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output->height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + compute_normals_kernel<<<gridSize, blockSize, 0, stream>>>(input, *output); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + //cutilCheckMsg(__FUNCTION__); +#endif +} \ No newline at end of file diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a55bd53ad5b3cca95bbb1be6f3547f020ed56f2f --- /dev/null +++ b/applications/reconstruct/src/depth_camera_cuda.hpp @@ -0,0 +1,35 @@ +#ifndef _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ +#define _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ + +#include <ftl/depth_camera.hpp> +#include <ftl/voxel_hash.hpp> +#include "splat_params.hpp" + +namespace ftl { +namespace cuda { + +void clear_depth(const TextureObject<float> &depth, cudaStream_t stream); +void clear_depth(const TextureObject<int> &depth, cudaStream_t stream); +void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream); +void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream); + +void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream); + +void int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, float scale, cudaStream_t stream); + +void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureObject<int> &out, float scale, cudaStream_t stream); + +void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream); + +void mls_resample(const TextureObject<int> 
&depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams ¶ms, cudaStream_t stream); + +void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams ¶ms, cudaStream_t stream); + +void point_cloud(float3* output, const ftl::voxhash::DepthCameraCUDA &depthCameraData, cudaStream_t stream); + +void compute_normals(const float3 *points, ftl::cuda::TextureObject<float4> *normals, cudaStream_t stream); + +} +} + +#endif // _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu index 575e3c45f9348c37c8397683db36d87edda677e4..9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f 100644 --- a/applications/reconstruct/src/dibr.cu +++ b/applications/reconstruct/src/dibr.cu @@ -23,8 +23,54 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) { } } + +__global__ void dibr_depthmap_kernel( + TextureObject<int> depth, int numcams, SplatParams params) { + + const int i = threadIdx.y*blockDim.y + threadIdx.x; + const int bx = blockIdx.x*blockDim.x; + const int by = blockIdx.y*blockDim.y; + const int x = bx + threadIdx.x; + const int y = by + threadIdx.y; + + for (int j=0; j<numcams; ++j) { + const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; + + float4 d = tex2D<float4>(camera.points, x, y); + if (d.z < 0.0f) continue; + //if (d >= params.camera.m_sensorDepthWorldMax) continue; + + //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + + const float3 worldPos = make_float3(d); + const float3 camPos = params.m_viewMatrix * worldPos; + const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); + const uint2 screenPos = make_uint2(make_int2(screenPosf)); + + if (camPos.z < params.camera.m_sensorDepthWorldMin) continue; + + const unsigned int cx = screenPos.x; + const unsigned int cy = screenPos.y; + + + if (cx < depth.width() && cy < depth.height()) { + //float camd = depth_in.tex2D((int)cx,(int)cy); + //atomicMin(&depth(x,y), idepth); + //float camdiff = fabs(camPos.z-camd); + //if (camdiff < 0.1f) { + //colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); + //} else { + //colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255); + //} + + atomicMin(&depth(cx,cy), camPos.z * 1000.0f); + } + } +} + + __global__ void dibr_kernel( - TextureObject<float> depth_in, + TextureObject<int> depth_in, TextureObject<uchar4> colour_out, int numcams, SplatParams params) { const int i = threadIdx.y*blockDim.y + threadIdx.x; @@ -36,12 +82,13 @@ __global__ void dibr_kernel( for (int j=0; j<numcams; ++j) { const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; - float d = tex2D<float>(camera.depth, x, y); - if (d < 0.01f) continue; - if (d >= params.camera.m_sensorDepthWorldMax) continue; + float4 d = tex2D<float4>(camera.points, x, y); + if (d.z < 0.0f) continue; + //if (d >= params.camera.m_sensorDepthWorldMax) continue; - const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + const float3 worldPos = make_float3(d); const float3 camPos = params.m_viewMatrix * worldPos; const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); const uint2 screenPos = make_uint2(make_int2(screenPosf)); @@ -53,18 +100,28 @@ __global__ void dibr_kernel( if (cx < colour_out.width() && cy < colour_out.height()) { - float 
camd = depth_in.tex2D((int)cx,(int)cy); + //float camd = depth_in.tex2D((int)cx,(int)cy); //atomicMin(&depth(x,y), idepth); - float camdiff = fabs(camPos.z-camd); - if (camdiff < 0.1f) { - colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); - } else { + //float camdiff = fabs(camPos.z-camd); + //if (camdiff < 0.1f) { + + if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) { + colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); + //colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255); + } + + + //} else { //colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255); - } + //} } } } +__device__ inline float4 make_float4(const uchar4 &c) { + return make_float4(c.x,c.y,c.z,c.w); +} + __global__ void dibr_kernel_rev( TextureObject<float> depth_in, TextureObject<uchar4> colour_out, int numcams, SplatParams params) { @@ -80,7 +137,10 @@ __global__ void dibr_kernel_rev( if (camd >= params.camera.m_sensorDepthWorldMax) return; const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd); - + float mindiff = 1000.0f; + float4 col = make_float4(0.0f,0.0f,0.0f,0.0f); + int count = 0; + for (int j=0; j<numcams; ++j) { const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; @@ -95,24 +155,67 @@ __global__ void dibr_kernel_rev( if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) { float d = tex2D<float>(camera.depth, (int)cx, (int)cy); - float camdiff = fabs(camPos.z-d); - if (camdiff < 0.1f) { - colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy); - } else { + float camdiff = fabs(camPos.z-d); + + if (camdiff < mindiff) { + mindiff = camdiff; + col += make_float4(tex2D<uchar4>(camera.colour,cx,cy)); + ++count; + } + + //if (camdiff < 0.1f) { + // colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy); + //} else { //colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255); - } + //} } } + + if (count > 0) { + col = col / (float)count; + colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255); + } else { + colour_out(x,y) = make_uchar4(76,76,76,255); + } +} + +void ftl::cuda::dibr(const TextureObject<int> &depth_out, + const TextureObject<uchar4> &colour_out, int numcams, const SplatParams ¶ms, cudaStream_t stream) { + + const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif + + dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params); + dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params); + cudaSafeCall( cudaGetLastError() ); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif } -void ftl::cuda::dibr(const TextureObject<float> &depth_in, +void ftl::cuda::dibr(const TextureObject<float> &depth_out, const TextureObject<uchar4> &colour_out, int numcams, const SplatParams ¶ms, cudaStream_t stream) { - const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif 
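+	// Reverse DIBR: each output pixel is projected into every camera, and colours from the cameras
+	// with the closest depth agreement are accumulated and averaged.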
- dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_in, colour_out, numcams, params); - cudaSafeCall( cudaGetLastError() ); + dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params); + cudaSafeCall( cudaGetLastError() ); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif } diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu index 1259210b4ea1b292f1adf00dfd0517bf04734c40..ae84d964af63d4a1d0eb5dc800b59939792ec8bc 100644 --- a/applications/reconstruct/src/integrators.cu +++ b/applications/reconstruct/src/integrators.cu @@ -7,6 +7,8 @@ #include <ftl/cuda_common.hpp> #define T_PER_BLOCK 8 +#define NUM_CUDA_BLOCKS 10000 +#define WARP_SIZE 32 using ftl::voxhash::HashData; using ftl::voxhash::HashParams; @@ -15,45 +17,13 @@ using ftl::voxhash::HashEntry; using ftl::voxhash::HashEntryHead; using ftl::voxhash::FREE_ENTRY; +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +extern __constant__ HashParams c_hashParams; + __device__ float4 make_float4(uchar4 c) { return make_float4(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z), static_cast<float>(c.w)); } -inline __device__ uchar4 bilinearFilterColor(const DepthCameraParams& cameraParams, const float2& screenPos, cudaTextureObject_t colorTextureRef) { - //const DepthCameraParams& cameraParams = c_depthCameraParams; - const int imageWidth = cameraParams.m_imageWidth; - const int imageHeight = cameraParams.m_imageHeight; - const int2 p00 = make_int2(screenPos.x+0.5f, screenPos.y+0.5f); - const int2 dir = sign(make_float2(screenPos.x - p00.x, screenPos.y - p00.y)); - - const int2 p01 = p00 + make_int2(0.0f, dir.y); - const int2 p10 = p00 + make_int2(dir.x, 0.0f); - const int2 p11 = p00 + make_int2(dir.x, dir.y); - - const float alpha = (screenPos.x - p00.x)*dir.x; - const float beta = (screenPos.y - p00.y)*dir.y; - - float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f; - if(p00.x >= 0 && p00.x < imageWidth && p00.y >= 0 && p00.y < imageHeight) { uchar4 v00 = tex2D<uchar4>(colorTextureRef, p00.x, p00.y); if(v00.x != 0) { s0 += (1.0f-alpha)*make_float4(v00); w0 += (1.0f-alpha); } } - if(p10.x >= 0 && p10.x < imageWidth && p10.y >= 0 && p10.y < imageHeight) { uchar4 v10 = tex2D<uchar4>(colorTextureRef, p10.x, p10.y); if(v10.x != 0) { s0 += alpha *make_float4(v10); w0 += alpha ; } } - - float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f; - if(p01.x >= 0 && p01.x < imageWidth && p01.y >= 0 && p01.y < imageHeight) { uchar4 v01 = tex2D<uchar4>(colorTextureRef, p01.x, p01.y); if(v01.x != 0) { s1 += (1.0f-alpha)*make_float4(v01); w1 += (1.0f-alpha);} } - if(p11.x >= 0 && p11.x < imageWidth && p11.y >= 0 && p11.y < imageHeight) { uchar4 v11 = tex2D<uchar4>(colorTextureRef, p11.x, p11.y); if(v11.x != 0) { s1 += alpha *make_float4(v11); w1 += alpha ;} } - - const float4 p0 = s0/w0; - const float4 p1 = s1/w1; - - float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f; - if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); } - if(w1 > 0.0f) { ss += beta *p1; ww += beta ; } - - if(ww > 0.0f) { - ss /= ww; - return make_uchar4(ss.x,ss.y,ss.z,ss.w); - } else return make_uchar4(0, 0, 0, 0); -} - __device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { float x = c1.x-c2.x; float y = c1.y-c2.y; @@ -61,6 +31,11 @@ __device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { return x*x + y*y + z*z; } +/* + * Kim, K., Chalidabhongse, T. 
H., Harwood, D., & Davis, L. (2005). + * Real-time foreground-background segmentation using codebook model. + * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004 + */ __device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) { float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z; float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z; @@ -69,81 +44,20 @@ __device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) { return sqrt(x_2 - p_2) < epsilon; } -#define NUM_CUDA_BLOCKS 10000 - - - -/*__global__ void integrateRegistrationKernel(HashData hashData, HashParams hashParams, DepthCameraParams cameraParams, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { - - // Stride over all allocated blocks - for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { - - //TODO check if we should load this in shared memory - HashEntry& entry = hashData.d_hashCompactified[bi]; - - - int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(entry.pos); - - uint i = threadIdx.x; //inside of an SDF block - int3 pi = pi_base + make_int3(hashData.delinearizeVoxelIndex(i)); - float3 pf = hashData.virtualVoxelPosToWorld(pi); - - pf = hashParams.m_rigidTransformInverse * pf; - uint2 screenPos = make_uint2(cameraParams.cameraToKinectScreenInt(pf)); - - // For this voxel in hash, get its screen position and check it is on screen - if (screenPos.x < cameraParams.m_imageWidth && screenPos.y < cameraParams.m_imageHeight) { //on screen - - //float depth = g_InputDepth[screenPos]; - float depth = tex2D<float>(depthT, screenPos.x, screenPos.y); - //if (depth > 20.0f) return; - - uchar4 color = make_uchar4(0, 0, 0, 0); - color = tex2D<uchar4>(colourT, screenPos.x, screenPos.y); - - // Depth is within accepted max distance from camera - if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) - float depthZeroOne = cameraParams.cameraToKinectProjZ(depth); - - // Calculate SDF of this voxel wrt the depth map value - float sdf = depth - pf.z; - float truncation = hashData.getTruncation(depth); - - if (sdf > -truncation) { - float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f); - - Voxel curr; //construct current voxel - curr.sdf = sdf; - curr.weight = weightUpdate; - curr.color = make_uchar3(color.x, color.y, color.z); - - uint idx = entry.ptr + i; - - Voxel out; - const Voxel &v1 = curr; - const Voxel &v0 = hashData.d_SDFBlocks[idx]; - - float redshift = (v0.weight > 0) ? 1.0f - ((v1.sdf - v0.sdf) / hashParams.m_truncation)*0.5f : 1.0f; - - out.color.x = min(max(v1.color.x*redshift,0.0f),255.0f); - out.color.y = min(max(v1.color.y*redshift,0.0f),255.0f); - out.color.z = min(max(v1.color.z*(1.0f / redshift),0.0f),255.0f); - - out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); - out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight); - - hashData.d_SDFBlocks[idx] = out; - - } - } - } - - } // Stride loop -}*/ - -extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +/* + * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACMTransactions on Graphics Vol. 26, No. 3, Article No. 23, 2007. 
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras + * r = distance between points + * h = smoothing parameter in meters (default 4cm) + */ +__device__ float spatialWeighting(float r) { + const float h = c_hashParams.m_spatialSmoothing; + if (r >= h) return 0.0f; + float rh = r / h; + rh = 1.0f - rh*rh; + return rh*rh*rh*rh; +} -#define WARP_SIZE 32 __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParams, int numcams) { __shared__ uint all_warp_ballot; @@ -192,6 +106,10 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam //printf("screen pos %d\n", color.x); //return; + // TODO:(Nick) Accumulate weighted positions + // TODO:(Nick) Accumulate weighted normals + // TODO:(Nick) Accumulate weights + // Depth is within accepted max distance from camera if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) //camdepths[count] = depth; @@ -244,7 +162,8 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam // Calculate voxel sign values across a warp int warpNum = i / WARP_SIZE; - uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + //uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + uint ballot_result = __ballot_sync(0xFFFFFFFF, (fabs(oldVoxel.sdf) <= hashParams.m_virtualVoxelSize && oldVoxel.weight > 0) ? 1 : 0); // Aggregate each warp result into voxel mask if (i % WARP_SIZE == 0) { @@ -254,9 +173,136 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam __syncthreads(); // Work out if block is occupied or not and save voxel masks + // TODO:(Nick) Is it faster to do this in a separate garbage kernel? + if (i < 16) { + const uint v = voxels[i]; + hashData.d_hashCompactified[bi]->voxels[i] = v; + const uint mask = 0x0000FFFF; + uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF); + uint b2 = __ballot_sync(mask, v == 0); + if (i == 0) { + if (b1 != mask && b2 != mask) hashData.d_hashCompactified[bi]->head.flags |= ftl::voxhash::kFlagSurface; + else hashData.d_hashCompactified[bi]->head.flags &= ~ftl::voxhash::kFlagSurface; + } + } + + } +} + +#define WINDOW_RADIUS 1 +#define PATCH_SIZE 32 + +__global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int numcams) { + __shared__ uint voxels[16]; + + const uint i = threadIdx.x; //inside of an SDF block + const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); + const int warpNum = i / WARP_SIZE; + const int lane = i % WARP_SIZE; + + // Stride over all allocated blocks + for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { + + //TODO check if we should load this in shared memory + //HashEntryHead entry = hashData.d_hashCompactified[bi]->head; + + const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ)); + + //uint idx = entry.offset + i; + const int3 pi = pi_base + po; + const float3 pfb = hashData.virtualVoxelPosToWorld(pi); + //int count = 0; + //float camdepths[MAX_CAMERAS]; + + //Voxel oldVoxel; // = hashData.d_SDFBlocks[idx]; + //hashData.deleteVoxel(oldVoxel); + + //float3 awpos = make_float3(0.0f); + //float3 awnorm = make_float3(0.0f); + //float aweights = 0.0f; + float sdf = 0.0f; + float weights = 0.0f; + + // Preload depth values + // 1. Find min and max screen positions + // 2. Subtract/Add WINDOW_RADIUS to min/max + // ... 
check that the buffer is not too small to cover this + // ... if buffer not big enough then don't buffer at all. + // 3. Populate shared mem depth map buffer using all threads + // 4. Adjust window lookups to use shared mem buffer + + //uint cam=0; + for (uint cam=0; cam<numcams; ++cam) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + const uint height = camera.params.m_imageHeight; + const uint width = camera.params.m_imageWidth; + + const float3 pf = camera.poseInverse * pfb; + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + + //float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + + + #pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float4 depth = tex2D<float4>(camera.points, screenPos.x+u, screenPos.y+v); + if (depth.z == MINF) continue; + + //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.poseInverse * make_float3(depth); //camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + const float weight = spatialWeighting(length(pf - camPos)); + + //wpos += weight*worldPos; + sdf += weight*(camPos.z - pf.z); + //sdf += camPos.z - pf.z; + //wnorm += weight*make_float3(normal); + //weights += 1.0f; + weights += weight; + } + } + } + + //awpos += wpos; + //aweights += weights; + } + + //awpos /= aweights; + //wnorm /= weights; + + sdf /= weights; + + //float sdf = (aweights == 0.0f) ? MINF : length(pfb - awpos); + //float sdf = wnorm.x * (pfb.x - wpos.x) + wnorm.y * (pfb.y - wpos.y) + wnorm.z * (pfb.z - wpos.z); + + //printf("WEIGHTS: %f\n", weights); + + //if (weights < 0.00001f) sdf = 0.0f; + + // Calculate voxel sign values across a warp + int warpNum = i / WARP_SIZE; + + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) < hashParams.m_virtualVoxelSize && aweights >= 0.5f) ? 1 : 0); + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) <= hashParams.m_virtualVoxelSize) ? 1 : 0); + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (aweights >= 0.0f) ? 1 : 0); + uint solid_ballot = __ballot_sync(0xFFFFFFFF, (sdf < 0.0f ) ? 1 : 0); + + // Aggregate each warp result into voxel mask + if (i % WARP_SIZE == 0) { + voxels[warpNum] = solid_ballot; + //valid[warpNum] = valid_ballot; + } + + __syncthreads(); + + // Work out if block is occupied or not and save voxel masks + // TODO:(Nick) Is it faster to do this in a separate garbage kernel? 
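+	// Each entry of voxels[] is one warp's 32-bit sign ballot; the block is flagged as surface
+	// only when its 512 voxels are neither all inside nor all outside the implicit surface.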
if (i < 16) { const uint v = voxels[i]; hashData.d_hashCompactified[bi]->voxels[i] = v; + //hashData.d_hashCompactified[bi]->validity[i] = valid[i]; const uint mask = 0x0000FFFF; uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF); uint b2 = __ballot_sync(mask, v == 0); @@ -277,7 +323,7 @@ const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); //if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) - integrateDepthMapsKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); + integrateMLSKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); //} //cudaSafeCall( cudaGetLastError() ); diff --git a/applications/reconstruct/src/scene_rep_hash_sdf.cu b/applications/reconstruct/src/scene_rep_hash_sdf.cu index 5c5ebf6fdb73be59bf028d850ce4fa41e6986925..247247b6cc5279186f9f9bdc81bbe627ab0621ea 100644 --- a/applications/reconstruct/src/scene_rep_hash_sdf.cu +++ b/applications/reconstruct/src/scene_rep_hash_sdf.cu @@ -168,41 +168,6 @@ extern "C" void resetHashBucketMutexCUDA(HashData& hashData, const HashParams& h } -__device__ -unsigned int linearizeChunkPos(const int3& chunkPos) -{ - int3 p = chunkPos-c_hashParams.m_streamingMinGridPos; - return p.z * c_hashParams.m_streamingGridDimensions.x * c_hashParams.m_streamingGridDimensions.y + - p.y * c_hashParams.m_streamingGridDimensions.x + - p.x; -} - -__device__ -int3 worldToChunks(const float3& posWorld) -{ - float3 p; - p.x = posWorld.x/c_hashParams.m_streamingVoxelExtents.x; - p.y = posWorld.y/c_hashParams.m_streamingVoxelExtents.y; - p.z = posWorld.z/c_hashParams.m_streamingVoxelExtents.z; - - float3 s; - s.x = (float)sign(p.x); - s.y = (float)sign(p.y); - s.z = (float)sign(p.z); - - return make_int3(p+s*0.5f); -} - -__device__ -bool isSDFBlockStreamedOut(const int3& sdfBlock, const HashData& hashData, const unsigned int* d_bitMask) //TODO MATTHIAS (-> move to HashData) -{ - float3 posWorld = hashData.virtualVoxelPosToWorld(hashData.SDFBlockToVirtualVoxelPos(sdfBlock)); // sdfBlock is assigned to chunk by the bottom right sample pos - - uint index = linearizeChunkPos(worldToChunks(posWorld)); - uint nBitsInT = 32; - return ((d_bitMask[index/nBitsInT] & (0x1 << (index%nBitsInT))) != 0x0); -} - // Note: bitMask used for Streaming out code... could be set to nullptr if not streaming out // Note: Allocations might need to be around fat rays since multiple voxels could correspond // to same depth map pixel at larger distances. 
@@ -272,7 +237,7 @@ __global__ void allocKernel(HashData hashData, HashParams hashParams, int camnum while(iter < g_MaxLoopIterCount) { //check if it's in the frustum and not checked out - if (hashData.isSDFBlockInCameraFrustumApprox(hashParams, cameraParams, idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { + if (hashData.isInBoundingBox(hashParams, idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { hashData.allocBlock(idCurrentVoxel); //printf("Allocate block: %d\n",idCurrentVoxel.x); } diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp index a9d5e57dc712829b592f56e8226abe0f563dc665..51d30d638223185f417cb049996b32650b06b151 100644 --- a/applications/reconstruct/src/splat_render.cpp +++ b/applications/reconstruct/src/splat_render.cpp @@ -1,6 +1,7 @@ #include "splat_render.hpp" #include "splat_render_cuda.hpp" #include "compactors.hpp" +#include "depth_camera_cuda.hpp" using ftl::render::Splatter; @@ -20,7 +21,7 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) { cudaSafeCall(cudaSetDevice(scene_->getCUDADevice())); if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) { - depth1_ = ftl::cuda::TextureObject<uint>(camera.width, camera.height); + depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height); } if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) { colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height); @@ -47,11 +48,25 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) { params.camera.m_sensorDepthWorldMax = camera.maxDepth; params.camera.m_sensorDepthWorldMin = camera.minDepth; - ftl::cuda::compactifyAllocated(scene_->getHashData(), scene_->getHashParams(), stream); - LOG(INFO) << "Occupied: " << scene_->getOccupiedCount(); - ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream); - ftl::cuda::splat_points(depth1_, depth2_, params, stream); - ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream); + //ftl::cuda::compactifyAllocated(scene_->getHashData(), scene_->getHashParams(), stream); + //LOG(INFO) << "Occupied: " << scene_->getOccupiedCount(); + + if (scene_->value("voxels", false)) { + ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream); + ftl::cuda::splat_points(depth1_, depth2_, params, stream); + ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream); + } else { + //ftl::cuda::clear_colour(colour1_, stream); + ftl::cuda::clear_depth(depth1_, stream); + ftl::cuda::clear_depth(depth2_, stream); + ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream); + //ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream); + ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream); + //ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream); + } + + //ftl::cuda::median_filter(depth1_, depth2_, stream); + //ftl::cuda::splat_points(depth1_, depth2_, params, stream); // TODO: Second pass diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/splat_render.cu index 0e3598f129bf40c72294ab19501b1fd3c95bb1e9..3addf108dcf89ea8f3069e3ba0daea59e67ae918 100644 --- a/applications/reconstruct/src/splat_render.cu +++ b/applications/reconstruct/src/splat_render.cu @@ -13,7 +13,7 @@ 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
 
-__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth) {
+__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<int> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -76,9 +76,65 @@ __device__ inline bool getVoxel(uint *voxels, int ix) {
 	return voxels[ix/32] & (0x1 << (ix % 32));
 }
 
-__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) {
+__global__ void occupied_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
+	__shared__ uint voxels[16];
+	__shared__ ftl::voxhash::HashEntryHead block;
+
+	// Stride over all allocated blocks
+	for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
+	__syncthreads();
+
+	const uint i = threadIdx.x;	//inside of an SDF block
+
+	if (i == 0) block = hashData.d_hashCompactified[bi]->head;
+	if (i < 16) {
+		voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+		//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
+	}
+
+	// Make sure all hash entries are cached
+	__syncthreads();
+
+	const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ));
+	const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
+	const int3 pi = pi_base + vp;
+	const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
+
+	const bool v = getVoxel(voxels, i);
+
+	uchar4 color = make_uchar4(255,0,0,255);
+	bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
+
+
+	// Only for surface voxels, work out screen coordinates
+	if (!is_surface) continue;
+
+	// TODO: For each original camera, render a new depth map
+
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf)); //  + make_float2(0.5f, 0.5f)
+
+	//printf("Worldpos: %f,%f,%f\n", camPos.x, camPos.y, camPos.z);
+
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+
+	const unsigned int x = screenPos.x;
+	const unsigned int y = screenPos.y;
+	const int idepth = static_cast<int>(camPos.z * 1000.0f);
+
+	// See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds
+	if (x < depth.width() && y < depth.height()) {
+		atomicMin(&depth(x,y), idepth);
+	}
+
+	}  // Stride
+}
+
+__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
 	// TODO:(Nick) Reduce bank conflicts by aligning these
 	__shared__ uint voxels[16];
+	//__shared__ uint valid[16];
 	__shared__ ftl::voxhash::HashEntryHead block;
 
 	// Stride over all allocated blocks
@@ -88,7 +144,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 	const uint i = threadIdx.x;	//inside of an SDF block
 
 	if (i == 0) block = hashData.d_hashCompactified[bi]->head;
-	if (i < 16) voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+	if (i < 16) {
+		voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+		//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
+	}
 
 	// Make sure all hash entries are cached
 	__syncthreads();
@@ -115,10 +174,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 	int edgeZ = (vp.z == 0 ) ? 1 : 0;
 
 	uchar4 color = make_uchar4(255,0,0,255);
-	bool is_surface = false; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
+	bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
 	//if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255);
 
-	if (!is_surface && v) continue;
+	if (v) continue; // !getVoxel(valid, i)
 
 	//if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0);
 
@@ -136,7 +195,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 			//if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue;
 
 			const bool vox = getVoxel(voxels, hashData.linearizeVoxelPos(uvi));
-			if (vox) {
+			if (vox) { //getVoxel(valid, hashData.linearizeVoxelPos(uvi))) {
 				is_surface = true;
 				// Should break but is slower?
 			}
@@ -174,7 +233,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 }
 
 void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
-			const TextureObject<uint> &depth,
+			const TextureObject<int> &depth,
 			const SplatParams &params, cudaStream_t stream) {
 
 	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
@@ -182,16 +241,21 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 	clearDepthKernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(hashData, depth);
 
-	//cudaSafeCall( cudaDeviceSynchronize() );
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 
 	const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
 	const dim3 gridSize(NUM_CUDA_BLOCKS, 1);
 	const dim3 blockSize(threadsPerBlock, 1);
 
-	isosurface_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params);
+	occupied_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params);
 
 	cudaSafeCall( cudaGetLastError() );
-	//cudaSafeCall( cudaDeviceSynchronize() );
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 }
 
 // ---- Pass 2: Expand the point splats ----------------------------------------
 
@@ -199,7 +263,7 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #define SPLAT_RADIUS 7
 #define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
 #define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 8
+#define MAX_VALID 100
 
 __device__ float distance2(float3 a, float3 b) {
 	const float x = a.x-b.x;
@@ -209,7 +273,7 @@
 }
 
 __global__ void splatting_kernel(
-		TextureObject<uint> depth_in,
+		TextureObject<int> depth_in,
 		TextureObject<float> depth_out,
 		SplatParams params) {
 	// Read an NxN region and
 	// - interpolate a depth value for this pixel
@@ -240,7 +304,7 @@ __global__ void splatting_kernel(
 
 	__syncthreads();
 
-	if (x >= depth_in.width() && y >= depth_in.height()) return;
+	if (x >= depth_in.width() || y >= depth_in.height()) return;
 
 	const float voxelSquared = params.voxelSize*params.voxelSize;
 	float mindepth = 1000.0f;
@@ -266,7 +330,7 @@ __global__ void splatting_kernel(
 		if (dist < voxelSquared) {
 			// Valid so check for minimum
 			//validPos[validix] = pos;
-			validIndices[validix++] = idx;
+			//validIndices[validix++] = idx;
 			if (d < mindepth) {
 				mindepth = d;
 				minidx = idx;
@@ -287,8 +351,8 @@ __global__ void splatting_kernel(
 	float contrib = 0.0f;
 	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth); // TODO:(Nick) Mindepth assumption is poor choice.
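+	// Only the nearest point (minidx) contributes below; the loop over validIndices is disabled.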
-	for (int j=0; j<validix; ++j) {
-		const int idx = validIndices[j];
+	//for (int j=0; j<validix; ++j) {
+		const int idx = minidx; //validIndices[j];
 		float3 posp = positions[idx];
 		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
 		float3 delta = (posp - pos) / 2*params.voxelSize;
@@ -308,7 +372,7 @@ __global__ void splatting_kernel(
 			contrib += c;
 			depth += posp.z * c;
 		}
-	}
+	//}
 
 	// Normalise
 	//colour.x /= contrib;
@@ -320,7 +384,7 @@
 	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
 }
 
-void ftl::cuda::splat_points(const TextureObject<uint> &depth_in,
+void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
 		const TextureObject<float> &depth_out,
 		const SplatParams &params, cudaStream_t stream) {
 
@@ -329,4 +393,8 @@ void ftl::cuda::splat_points(const TextureObject<uint> &depth_in,
 	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
 	cudaSafeCall( cudaGetLastError() );
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 }
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index a36e80ad833f258eced3acd63560e6b315995391..7737bb5d903d725a55b81d75812b99b52e7ace66 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -32,7 +32,7 @@ class Splatter {
 	private:
 	int device_;
-	ftl::cuda::TextureObject<uint> depth1_;
+	ftl::cuda::TextureObject<int> depth1_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index deb702e1661ca8f36fcea7aa6b4b6a115439cc7b..d8fd9cf55649280c20805c8d4f8b9f7b758b2223 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -15,7 +15,7 @@ namespace cuda {
 * of objects up to at most truncation depth.
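+* The output depth buffer stores integer millimetres so that atomicMin can be used per pixel.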
 */
 void isosurface_point_image(const ftl::voxhash::HashData& hashData,
-		const ftl::cuda::TextureObject<uint> &depth,
+		const ftl::cuda::TextureObject<int> &depth,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 //void isosurface_point_image_stereo(const ftl::voxhash::HashData& hashData,
@@ -25,14 +25,17 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 
 // TODO: isosurface_point_cloud
 
-void splat_points(const ftl::cuda::TextureObject<uint> &depth_in,
+void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
-void dibr(const ftl::cuda::TextureObject<float> &depth_in,
+void dibr(const ftl::cuda::TextureObject<int> &depth_out,
 		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
 
+void dibr(const ftl::cuda::TextureObject<float> &depth_out,
+		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+
 }
 }
diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp
index 51f9bc75b88b0d8ae73d41d8ba05fb248cb5ae00..e82da537bbf2df6789467f561d24492f04556ca3 100644
--- a/applications/reconstruct/src/voxel_scene.cpp
+++ b/applications/reconstruct/src/voxel_scene.cpp
@@ -2,6 +2,7 @@
 #include "compactors.hpp"
 #include "garbage.hpp"
 #include "integrators.hpp"
+#include "depth_camera_cuda.hpp"
 
 #include <opencv2/core/cuda_stream_accessor.hpp>
 
@@ -88,6 +89,8 @@ bool SceneRep::_initCUDA() {
 	// TODO:(Nick) Check memory is sufficient
 	// TODO:(Nick) Find out what our compute capability should be.
 
+	LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor;
+
 	return true;
 }
 
@@ -138,7 +141,8 @@ int SceneRep::upload() {
 		cam.params.m_imageHeight = in->parameters().height;
 		cam.params.m_sensorDepthWorldMax = in->parameters().maxDepth;
 		cam.params.m_sensorDepthWorldMin = in->parameters().minDepth;
-		cam.gpu.alloc(cam.params);
+		cam.gpu.alloc(cam.params, true);
+		LOG(INFO) << "GPU Allocated camera " << i;
 	}
 }
 
@@ -163,8 +167,9 @@ int SceneRep::upload() {
 		if (depth.cols == 0) continue;
 
 		// Must be in RGBA for GPU
-		Mat rgba;
-		cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA);
+		Mat rgbt, rgba;
+		cv::cvtColor(rgb,rgbt, cv::COLOR_BGR2Lab);
+		cv::cvtColor(rgbt,rgba, cv::COLOR_BGR2BGRA);
 
 		// Send to GPU and merge view into scene
 		//cam.gpu.updateParams(cam.params);
@@ -178,7 +183,9 @@ int SceneRep::upload() {
 		//if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream)));
 
 		//allocate all hash blocks which are corresponding to depth map entries
-		_alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
+		if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
+
+		// Calculate normals
 	}
 
 	// Must have finished all allocations and rendering before next integration
@@ -211,7 +218,7 @@ void SceneRep::integrate() {
 
 void SceneRep::garbage() {
 	//_compactifyAllocated();
-	_garbageCollect();
+	if (value("voxels", false)) _garbageCollect();
 
 	//cudaSafeCall(cudaStreamSynchronize(integ_stream_));
 }
@@ -237,21 +244,11 @@ void SceneRep::garbage() {
 	m_numIntegratedFrames++;
 }*/
 
-void SceneRep::setLastRigidTransform(const Eigen::Matrix4f& lastRigidTransform) {
-	m_hashParams.m_rigidTransform = MatrixConversion::toCUDA(lastRigidTransform);
-	m_hashParams.m_rigidTransformInverse = MatrixConversion::toCUDA(lastRigidTransform.inverse()); //m_hashParams.m_rigidTransform.getInverse();
-}
-
 /*void SceneRep::setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) {
 	setLastRigidTransform(lastRigidTransform);
 	_compactifyHashEntries();
 }*/
-
-const Eigen::Matrix4f SceneRep::getLastRigidTransform() const {
-	return MatrixConversion::toEigen(m_hashParams.m_rigidTransform);
-}
-
 /* Nick: To reduce weights between frames */
 void SceneRep::nextFrame() {
 	if (do_reset_) {
@@ -260,7 +257,7 @@ void SceneRep::nextFrame() {
 		_create(_parametersFromConfig());
 	} else {
 		//ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_);
-		if (reg_mode_) ftl::cuda::clearVoxels(m_hashData, m_hashParams);
+		//if (reg_mode_) ftl::cuda::clearVoxels(m_hashData, m_hashParams);
 		//else ftl::cuda::starveVoxels(m_hashData, m_hashParams, integ_stream_);
 		m_numIntegratedFrames = 0;
 	}
@@ -269,10 +266,6 @@ void SceneRep::nextFrame() {
 //! resets the hash to the initial state (i.e., clears all data)
 void SceneRep::reset() {
 	m_numIntegratedFrames = 0;
-
-	//m_hashParams.m_rigidTransform.setIdentity();
-	//m_hashParams.m_rigidTransformInverse.setIdentity();
-	m_hashParams.m_numOccupiedBlocks = 0;
 	m_hashData.updateParams(m_hashParams);
 	resetCUDA(m_hashData, m_hashParams);
 }
@@ -288,22 +281,27 @@ HashParams SceneRep::_parametersFromConfig() {
 	HashParams params;
 
 	// First camera view is set to identity pose to be at the centre of
 	// the virtual coordinate space.
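+	// Clipping bounds, MLS flag and smoothing values are read from the config below.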
-	params.m_rigidTransform.setIdentity();
-	params.m_rigidTransformInverse.setIdentity();
 	params.m_hashNumBuckets = value("hashNumBuckets", 100000);
-	params.m_SDFBlockSize = SDF_BLOCK_SIZE;
-	params.m_numSDFBlocks = value("hashNumSDFBlocks",500000);
 	params.m_virtualVoxelSize = value("SDFVoxelSize", 0.006f);
 	params.m_maxIntegrationDistance = value("SDFMaxIntegrationDistance", 10.0f);
 	params.m_truncation = value("SDFTruncation", 0.1f);
 	params.m_truncScale = value("SDFTruncationScale", 0.01f);
 	params.m_integrationWeightSample = value("SDFIntegrationWeightSample", 10);
 	params.m_integrationWeightMax = value("SDFIntegrationWeightMax", 255);
-	// Note (Nick): We are not streaming voxels in/out of GPU
-	//params.m_streamingVoxelExtents = MatrixConversion::toCUDA(gas.s_streamingVoxelExtents);
-	//params.m_streamingGridDimensions = MatrixConversion::toCUDA(gas.s_streamingGridDimensions);
-	//params.m_streamingMinGridPos = MatrixConversion::toCUDA(gas.s_streamingMinGridPos);
-	//params.m_streamingInitialChunkListSize = gas.s_streamingInitialChunkListSize;
+	params.m_spatialSmoothing = value("spatialSmoothing", 0.04f); // 4cm
+	params.m_colourSmoothing = value("colourSmoothing", 20.0f);
+	params.m_confidenceThresh = value("confidenceThreshold", 20.0f);
+	params.m_flags = 0;
+	params.m_flags |= (value("clipping", false)) ? ftl::voxhash::kFlagClipping : 0;
+	params.m_flags |= (value("mls", false)) ? ftl::voxhash::kFlagMLS : 0;
+	params.m_maxBounds = make_int3(
+		value("bbox_x_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_y_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_z_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE));
+	params.m_minBounds = make_int3(
+		value("bbox_x_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_y_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_z_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE));
 	return params;
 }
 
@@ -362,8 +360,20 @@ void SceneRep::_compactifyAllocated() {
 	else ftl::cuda::integrateRegistration(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_);
 }*/
 
+extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height);
+
 void SceneRep::_integrateDepthMaps() {
-	ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
+	//cudaSafeCall(cudaDeviceSynchronize());
+
+	for (size_t i=0; i<cameras_.size(); ++i) {
+		//ftl::cuda::clear_depth(*(cameras_[i].gpu.depth2_tex_), integ_stream_);
+		ftl::cuda::clear_points(*(cameras_[i].gpu.points_tex_), integ_stream_);
+		ftl::cuda::mls_smooth(*(cameras_[i].gpu.points_tex_), m_hashParams, cameras_.size(), i, integ_stream_);
+		//ftl::cuda::int_to_float(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), 1.0f / 1000.0f, integ_stream_);
+		//ftl::cuda::hole_fill(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), cameras_[i].params, integ_stream_);
+		//bilateralFilterFloatMap(cameras_[i].gpu.depth_tex_->devicePtr(), cameras_[i].gpu.depth3_tex_->devicePtr(), 3, 7, cameras_[i].gpu.depth_tex_->width(), cameras_[i].gpu.depth_tex_->height());
+	}
+	if (value("voxels", false)) ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
 }
 
 void SceneRep::_garbageCollect() {
diff --git a/components/common/cpp/CMakeLists.txt b/components/common/cpp/CMakeLists.txt
index 911b7d1d69225d55e84cf6ab2fd87e0d1fc81a79..60577c96040dae74e93ccfe6f8ee4ae0c7abc856 100644
--- a/components/common/cpp/CMakeLists.txt
+++ b/components/common/cpp/CMakeLists.txt
@@ -5,6 +5,7 @@ set(COMMONSRC
 	src/configurable.cpp
 	src/loguru.cpp
 	src/opencv_to_pcl.cpp
+	src/cuda_common.cpp
 )
 
 check_function_exists(uriParseSingleUriA HAVE_URIPARSESINGLE)
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 5a2d1de0823acea508747527fd839280d01fc710..58e72ceb35c11c6d79ab17e675ce5bf1573018c7 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -54,8 +54,11 @@ class TextureObject {
 	__host__ __device__ inline const T &operator()(int u, int v) const { return ptr_[u+v*pitch2_]; }
 	__host__ __device__ inline T &operator()(int u, int v) { return ptr_[u+v*pitch2_]; }
+
+	void upload(const cv::Mat &, cudaStream_t stream=0);
+	void download(cv::Mat &, cudaStream_t stream=0) const;
 
-	void free() {
+	__host__ void free() {
 		if (texobj_ != 0) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
 		if (ptr_ && needsfree_) cudaFree(ptr_);
 		ptr_ = nullptr;
@@ -220,8 +223,40 @@ template <typename T>
 TextureObject<T>::~TextureObject() {
 	//if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
 	//if (needsfree_) cudaFree(ptr_);
+	free();
 }
 
+template <>
+void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream);
+
+
+template <>
+void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const;
+
 }
 }
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..571d6816b63413163471ef9006ae1d28031511fd
--- /dev/null
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -0,0 +1,59 @@
+#include <ftl/cuda_common.hpp>
+
+using ftl::cuda::TextureObject;
+
+template <>
+void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar4), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float2), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float4), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+
+template <>
+void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_8UC4);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar4), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC1);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC2);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float2), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC4);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float4), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_8UC1);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar), m.rows, cudaMemcpyDeviceToHost, stream));
+}
diff --git a/components/rgbd-sources/src/source.cpp b/components/rgbd-sources/src/source.cpp
index 8747b60fbed095c9c09d264b76490d848a7f758d..48244edafa48b7836237d4c31663e315e0e0d096 100644
--- a/components/rgbd-sources/src/source.cpp
+++ b/components/rgbd-sources/src/source.cpp
@@ -264,11 +264,17 @@ void Source::writeFrames(const ftl::cuda::TextureObject<uchar4> &rgb, const ftl:
 void Source::writeFrames(const ftl::cuda::TextureObject<uchar4> &rgb, const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
 	if (!impl_) {
 		UNIQUE_LOCK(mutex_,lk);
-		rgb_.create(rgb.height(), rgb.width(), CV_8UC4);
-		cudaSafeCall(cudaMemcpy2DAsync(rgb_.data, rgb_.step, rgb.devicePtr(), rgb.pitch(), rgb_.cols * sizeof(uchar4), rgb_.rows, cudaMemcpyDeviceToHost, stream));
-		depth_.create(depth.height(), depth.width(), CV_32FC1);
-		cudaSafeCall(cudaMemcpy2DAsync(depth_.data, depth_.step, depth.devicePtr(), depth.pitch(), depth_.cols * sizeof(float), depth_.rows, cudaMemcpyDeviceToHost, stream));
+		rgb.download(rgb_, stream);
+		//rgb_.create(rgb.height(), rgb.width(), CV_8UC4);
+		//cudaSafeCall(cudaMemcpy2DAsync(rgb_.data, rgb_.step, rgb.devicePtr(), rgb.pitch(), rgb_.cols * sizeof(uchar4), rgb_.rows, cudaMemcpyDeviceToHost, stream));
+		depth.download(depth_, stream);
+		//depth_.create(depth.height(), depth.width(), CV_32FC1);
+		//cudaSafeCall(cudaMemcpy2DAsync(depth_.data, depth_.step, depth.devicePtr(), depth.pitch(), depth_.cols * sizeof(float), depth_.rows, cudaMemcpyDeviceToHost, stream));
+		stream_ = stream;
+		cudaSafeCall(cudaStreamSynchronize(stream_));
+		cv::cvtColor(rgb_,rgb_, cv::COLOR_BGRA2BGR);
+		cv::cvtColor(rgb_,rgb_, cv::COLOR_Lab2BGR);
 	}
 }