diff --git a/components/renderers/cpp/include/ftl/cuda/normals.hpp b/components/renderers/cpp/include/ftl/cuda/normals.hpp
index 0e1b8a46fc59d9b177669f63d446d00bd3452149..da2247723206cc9a1167ffbb0bc659ec847208c2 100644
--- a/components/renderers/cpp/include/ftl/cuda/normals.hpp
+++ b/components/renderers/cpp/include/ftl/cuda/normals.hpp
@@ -34,6 +34,10 @@ void normal_filter(ftl::cuda::TextureObject<float4> &norm,
         const ftl::rgbd::Camera &camera, const float4x4 &pose,
         float thresh, cudaStream_t stream);
 
+void transform_normals(ftl::cuda::TextureObject<float4> &norm,
+        const float3x3 &pose,
+        cudaStream_t stream);
+
 }
 }
 
diff --git a/components/renderers/cpp/src/normals.cu b/components/renderers/cpp/src/normals.cu
index 13b82e3a09170b5223185a92fdc731ddc209ccdf..626015ce7ad9f9471079917b0678369bbb23f5cb 100644
--- a/components/renderers/cpp/src/normals.cu
+++ b/components/renderers/cpp/src/normals.cu
@@ -287,3 +287,33 @@ void ftl::cuda::normal_filter(ftl::cuda::TextureObject<float4> &norm,
     //cutilCheckMsg(__FUNCTION__);
     #endif
 }
+
+//==============================================================================
+
+__global__ void transform_normals_kernel(ftl::cuda::TextureObject<float4> norm,
+        float3x3 pose) {
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if(x >= norm.width() || y >= norm.height()) return;
+
+    float3 normal = pose * make_float3(norm.tex2D((int)x,(int)y));
+    normal /= length(normal);
+    norm(x,y) = make_float4(normal, 0.0f);
+}
+
+void ftl::cuda::transform_normals(ftl::cuda::TextureObject<float4> &norm,
+        const float3x3 &pose,
+        cudaStream_t stream) {
+
+    const dim3 gridSize((norm.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norm.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+    transform_normals_kernel<<<gridSize, blockSize, 0, stream>>>(norm, pose);
+
+    cudaSafeCall( cudaGetLastError() );
+    #ifdef _DEBUG
+    cudaSafeCall(cudaDeviceSynchronize());
+    //cutilCheckMsg(__FUNCTION__);
+    #endif
+}
diff --git a/components/renderers/cpp/src/splat_render.cpp b/components/renderers/cpp/src/splat_render.cpp
index 813cbda8392ce36d8b9f70d1239087e55cbfd8f7..9b597e90e1af60d28b789f708392961b083e7436 100644
--- a/components/renderers/cpp/src/splat_render.cpp
+++ b/components/renderers/cpp/src/splat_render.cpp
@@ -231,6 +231,8 @@ void Splatter::_renderChannel(
 	// Generate initial normals for the splats
 	accum_.create<GpuMat>(Channel::Normals, Format<float4>(params_.camera.width, params_.camera.height));
 	_blendChannel(accum_, Channel::Normals, Channel::Normals, stream);
+	// Put normals in camera space here...
+	ftl::cuda::transform_normals(accum_.getTexture<float4>(Channel::Normals), params_.m_viewMatrix.getFloat3x3(), stream);
 
 	// Estimate point density
 	accum_.create<GpuMat>(Channel::Density, Format<float>(params_.camera.width, params_.camera.height));
diff --git a/components/renderers/cpp/src/splatter.cu b/components/renderers/cpp/src/splatter.cu
index 253072ca9e3e3aa9e467118563e9f004a636ad21..cd69098ea6f80c269f66deb935e652ffc33398c8 100644
--- a/components/renderers/cpp/src/splatter.cu
+++ b/components/renderers/cpp/src/splatter.cu
@@ -146,7 +146,7 @@ __device__ inline float make(float v) {
 /*
  * Pass 1b: Expand splats to full size and merge
  */
- template <int SEARCH_DIAMETER, typename T>
+ template <int SEARCH_RADIUS, typename T>
  __global__ void splat_kernel(
         //TextureObject<float4> points,       // Original 3D points
         TextureObject<float4> normals,
@@ -161,62 +161,63 @@ __device__ inline float make(float v) {
         
     //const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
 
-    const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+    //const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
     //const int warp = tid / WARP_SIZE;
-    const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+    const int x = blockIdx.x*blockDim.x + threadIdx.x;
     const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
     if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
 
-    const float3 origin = params.m_viewMatrixInverse * make_float3(0.0f);
+    //const float3 origin = params.m_viewMatrixInverse * make_float3(0.0f);
     float3 ray = params.camera.screenToCam(x,y,1.0f);
     ray = ray / length(ray);
     const float scale = ray.z;
-    ray = params.m_viewMatrixInverse.getFloat3x3() * ray;
+    //ray = params.m_viewMatrixInverse.getFloat3x3() * ray;
 
     //float depth = 0.0f;
     //float contrib = 0.0f;
     float depth = 1000.0f;
-    float pdepth = 1000.0f;
+    //float pdepth = 1000.0f;
 
     struct Result {
         float weight;
         float depth;
-        T attr;
     };
 
-    Result results[(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE];
+    Result results[2*SEARCH_RADIUS+1][2*SEARCH_RADIUS+1];
 
     // Each thread in warp takes an upsample point and updates corresponding depth buffer.
-    const int lane = tid % WARP_SIZE;
-    for (int i=lane; i<SEARCH_DIAMETER*SEARCH_DIAMETER; i+=WARP_SIZE) {
-        const float u = (i % SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
-        const float v = (i / SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
+    //const int lane = tid % WARP_SIZE;
+    //for (int i=lane; i<SEARCH_DIAMETER*SEARCH_DIAMETER; i+=WARP_SIZE) {
+    //    const float u = (i % SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
+    //    const float v = (i / SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
+    for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
+    for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
 
-        results[i/WARP_SIZE] = {0.0f, 0.0f, make<T>()};
+        results[v+SEARCH_RADIUS][u+SEARCH_RADIUS] = {0.0f, 1000.0f};
 
         // Use the depth buffer to determine this pixels 3D position in camera space
         const float d = ((float)depth_in.tex2D(x+u, y+v)/1000.0f);
 
+        const float3 n = make_float3(normals.tex2D((int)(x)+u, (int)(y)+v));
+        const float dens = density.tex2D((int)(x)+u, (int)(y)+v);
+
         if (d < params.camera.minDepth || d > params.camera.maxDepth) continue;
 
         const float3 camPos = params.camera.screenToCam((int)(x)+u,(int)(y)+v,d);
-        const float3 camPos2 = params.camera.screenToCam((int)(x),(int)(y),d);
-        const float3 worldPos = params.m_viewMatrixInverse * camPos;
+        //const float3 camPos2 = params.camera.screenToCam((int)(x),(int)(y),d);
+        //const float3 worldPos = params.m_viewMatrixInverse * camPos;
 
 
-        // Assumed to be normalised
-        float4 n = normals.tex2D((int)(x)+u, (int)(y)+v);
-        n /= length(n);
+        
 		//if (length(make_float3(n)) == 0.0f) printf("BAD NORMAL\n");
 
         // Does the ray intersect plane of splat?
         float t = 1000.0f;
-        const float r = ftl::cuda::intersectDistance(make_float3(n), worldPos, origin, ray, t);
-        if (r != PINF) { //} && fabs(t-camPos.z) < 0.01f) {
+        const float r = ftl::cuda::intersectDistance(n, camPos, make_float3(0.0f), ray, t);
+        //if (r != PINF) { //} && fabs(t-camPos.z) < 0.01f) {
             // Adjust from normalised ray back to original meters units
             t *= scale;
-            const float dens = density.tex2D((int)(x)+u, (int)(y)+v);
             float weight = ftl::cuda::weighting(r, dens/params.camera.fx); // (1.0f/params.camera.fx) / (t/params.camera.fx)
 
             /* Buehler C. et al. 2001. Unstructured Lumigraph Rendering. */
@@ -225,44 +226,45 @@ __device__ inline float make(float v) {
             // enough for our purposes if the alignment step prevents ghosting
             // TODO: Use depth and perhaps the neighbourhood consistency in:
             //     Kuster C. et al. 2011. FreeCam: A hybrid camera system for interactive free-viewpoint video
-            if (params.m_flags & ftl::render::kNormalWeightColours) weight *= n.w * n.w;
+            //if (params.m_flags & ftl::render::kNormalWeightColours) weight *= n.w * n.w;
             //if (params.m_flags & ftl::render::kDepthWeightColours) weight *= ???
 
             if (weight <= 0.0f) continue;
 
-            //depth = min(depth, t);
-            if (t < depth) {
-                pdepth = depth;
-                depth = t;
-            }
-            results[i/WARP_SIZE] = {weight, t, in.tex2D((int)x+u, (int)y+v)};
-        }
+            depth = min(depth, t);
+            results[v+SEARCH_RADIUS][u+SEARCH_RADIUS] = {weight, t};
+        //}
+    }
     }
 
-    depth = warpMin(depth);
-    pdepth = warpMin(pdepth);
+    //depth = warpMin(depth);
+    //pdepth = warpMin(pdepth);
 
     float adepth = 0.0f;
     float contrib = 0.0f;
     float4 attr = make_float4(0.0f);
 
     // Loop over results array
-    for (int i=0; i<(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE; ++i) {
-        if (results[i].depth - depth < 0.04f) {
-            adepth += results[i].depth * results[i].weight;
-            attr += make_float4(results[i].attr) * results[i].weight;
-            contrib += results[i].weight;
-        }
+    for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
+    for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
+        auto &result = results[v+SEARCH_RADIUS][u+SEARCH_RADIUS];
+        float s = ftl::cuda::weighting(fabs(result.depth - depth), 0.04f);
+        //if (result.depth - depth < 0.04f) {
+            adepth += result.depth * result.weight * s;
+            attr += make_float4(in.tex2D((int)x+u, (int)y+v)) * result.weight * s;
+            contrib += result.weight * s;
+        //}
+    }
     }
 
     // Sum all attributes and contributions
-    adepth = warpSum(adepth);
-    attr.x = warpSum(attr.x);
-    attr.y = warpSum(attr.y);
-    attr.z = warpSum(attr.z);
-    contrib = warpSum(contrib);
+    //adepth = warpSum(adepth);
+    //attr.x = warpSum(attr.x);
+    //attr.y = warpSum(attr.y);
+    //attr.z = warpSum(attr.z);
+    //contrib = warpSum(contrib);
 
-    if (lane == 0 && contrib > 0.0f) {
+    if (contrib > 0.0f) {
         depth_out(x,y) = adepth / contrib;
         out(x,y) = make<T>(attr / contrib);
     }
@@ -277,10 +279,10 @@ void ftl::cuda::splat(
         TextureObject<float> &depth_out,
         TextureObject<T> &colour_out,
         const SplatParams &params, cudaStream_t stream) {
-    const dim3 gridSize((depth_in.width() + 2 - 1)/2, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(2*WARP_SIZE, T_PER_BLOCK);
+            const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+            const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-    splat_kernel<8,T><<<gridSize, blockSize, 0, stream>>>(
+    splat_kernel<4,T><<<gridSize, blockSize, 0, stream>>>(
         normals,
         density,
         colour_in,