diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
index 691ce7c5706ded55def668751300ebde5c44ab6b..08c73219df472ba526792f41ced599e1dd1e181c 100644
--- a/applications/reconstruct/src/dibr.cu
+++ b/applications/reconstruct/src/dibr.cu
@@ -383,12 +383,24 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp
 
 #define FULL_MASK 0xffffffff
 
-__device__ inline float warpMaxEnergy(float energy) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
-		const float other = __shfl_xor_sync(FULL_MASK, energy, i, WARP_SIZE);
-		energy = (energy > other) ? energy : other;
+__device__ inline float warpMax(float energy) {
+	// XOR-butterfly all-reduce: after the last exchange every lane holds the warp-wide maximum. All lanes in FULL_MASK must reach this call together.
+	float best = energy;
+	for (int offset = 1; offset < WARP_SIZE; offset <<= 1) {
+		const float peer = __shfl_xor_sync(FULL_MASK, best, offset, WARP_SIZE);
+		best = max(best, peer);
 	}
-	return energy;
+	return best;
+}
+
+__device__ inline float warpMin(float energy) {
+	// XOR-butterfly all-reduce: after the last exchange every lane holds the warp-wide minimum. All lanes in FULL_MASK must reach this call together.
+	float best = energy;
+	for (int offset = 1; offset < WARP_SIZE; offset <<= 1) {
+		const float peer = __shfl_xor_sync(FULL_MASK, best, offset, WARP_SIZE);
+		best = min(best, peer);
+	}
+	return best;
 }
 
 
@@ -404,34 +416,13 @@ __device__ inline float warpMaxEnergy(float energy) {
 	__shared__ int minimum[2*T_PER_BLOCK];
 	__shared__ int maximum[2*T_PER_BLOCK];
 	__shared__ unsigned int nidx[2*T_PER_BLOCK];
-	__shared__ float sampleenergy[2*T_PER_BLOCK][WARP_SIZE];
 
-	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
+	const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	const int warp = tid / WARP_SIZE; //threadIdx.x / WARP_SIZE + threadIdx.y*2;
 	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	//const float3 camPos = params.camera.kinectDepthToSkeleton(x,y, float(point_in.tex2D(x,y)) / 1000.0f);
-
-    //const float r = 1.0f; //(camera.poseInverse * worldPos).z / camera.params.fx;
-
-	// Get virtual camera ray for splat centre and backface cull if possible
-	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
-	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
-	//if (dot(rayDir, normal) > 0.0f) return;
-
-    // Find the virtual screen position of current point
-	//const float3 camPos = params.m_viewMatrix * worldPos;
-	//if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
-	//if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
-	//const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
-
-	//const int upsample = 16; //min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
-
-	// Not on screen so stop now...
-	//if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
-    //        screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
-	
-	const int lane = threadIdx.x % WARP_SIZE;
+	const int lane = tid % WARP_SIZE;
 	if (lane == 0) {
 		minimum[warp] = 100000000;
 		maximum[warp] = -100000000;
@@ -467,41 +458,31 @@ __device__ inline float warpMaxEnergy(float energy) {
 
 	if (minDepth >= params.camera.m_sensorDepthWorldMax) return;
 	if (maxDepth <= params.camera.m_sensorDepthWorldMin) return;
-	if (y == 200) printf("interval: %f\n", interval);
+	//if (y == 200) printf("interval: %f\n", maxDepth);
 
-	//const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
 
-	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
-	// TODO: Don't do this step, simply update the current pixel to either fill or replace existing value
-	// use warp threads to do the iteration samples ... 32 samples per pixel.
-	// could iterate each thread to perform more checks within likely range.
-	//for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
-	//	const float u = (i % upsample) - (upsample / 2);
-	//	const float v = (i / upsample) - (upsample / 2);
+	float maxenergy = 0.0f;
+	float bestdepth = 0.0f;
 
-        // Make an initial estimate of the points location
-		// Use minimum z as first estimate
-		float maxenergy = 0.0f;
-		float bestdepth = -1.0f;
-
-		// Search for best or threshold energy
-		for (int k=lane; k<MAX_ITERATIONS; k+=WARP_SIZE) {
-			const float3 nearest = params.camera.kinectDepthToSkeleton(x,y,minDepth+float(k)*interval);
-			const float myenergy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS_2>(neighborhood_cache[warp], nearest, nidx[warp], SPATIAL_SMOOTHING);
-			maxenergy = warpMaxEnergy(max(myenergy, maxenergy));
-			bestdepth = (myenergy == maxenergy) ? nearest.z : -1.0f;
-		}
+	// Search for best or threshold energy
+	for (int k=lane; k<MAX_ITERATIONS; k+=WARP_SIZE) {
+		const float3 nearest = params.camera.kinectDepthToSkeleton(x,y,minDepth+float(k)*interval);
+		const float myenergy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS_2>(neighborhood_cache[warp], nearest, nidx[warp], SPATIAL_SMOOTHING);
+		maxenergy = warpMax(max(myenergy, maxenergy));
+		bestdepth = (myenergy == maxenergy) ? nearest.z : 0.0f;
+	}
 
-		// Search for first energy maximum above a threshold
-		if (bestdepth > 0.0f && maxenergy >= 0.01f) {
-			const unsigned int cx = x;
-			const unsigned int cy = y;
-			if (bestdepth > params.camera.m_sensorDepthWorldMin && bestdepth < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
-				// Transform estimated point to virtual cam space and output z
-				atomicMin(&depth(cx,cy), bestdepth * 1000.0f);
-			}
+	// Search for first energy maximum above a threshold
+	if (bestdepth > 0.0f && maxenergy >= 0.1f) {
+		//printf("E D %f %f\n", maxenergy, bestdepth);
+		const unsigned int cx = x;
+		const unsigned int cy = y;
+		if (bestdepth > params.camera.m_sensorDepthWorldMin && bestdepth < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+			// Transform estimated point to virtual cam space and output z
+			//atomicMin(&depth(cx,cy), bestdepth * 1000.0f);
+			depth(cx,cy) = bestdepth * 1000.0f;
 		}
-	//}
+	}
 }
 
 // ===== Pass 2 and 3 : Attribute contributions ================================
diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp
index 3cf4e18ebe863908ffab4fb6c03888a1a5ff4d2d..039d4ab99a4de7add09797b1cf0b35d7775994dc 100644
--- a/applications/reconstruct/src/splat_render.cpp
+++ b/applications/reconstruct/src/splat_render.cpp
@@ -89,6 +89,7 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 			//ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
 			if (src->value("splatting",  false)) {
 				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
 				src->writeFrames(colour2_, depth2_, stream);
 			} else {
 				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);