diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu
index fbbeee9fe3636aaa801366287f4f0f6361f9b5b8..d183777c65447443540733a2ae8ee97a3e293b35 100644
--- a/components/renderers/cpp/src/triangle_render.cu
+++ b/components/renderers/cpp/src/triangle_render.cu
@@ -102,62 +102,79 @@ float getZAtCoordinate(const float3 &barycentricCoord, const float (&tri)[3]) {
 		+ barycentricCoord.z * tri[2]);
 }
 
-/*
- * Convert source screen position to output screen coordinates.
+/**
+ * Loop over the rectangular region covering the triangle and test each pixel
+ * for being inside or outside (using the barycentric coordinate method). If
+ * inside, atomically write (via atomicMin) to the depth map.
  */
- template <int A, int B>
- __global__ void triangle_render_1_kernel(
-        TextureObject<float> depth_in,
-        TextureObject<int> depth_out,
-		TextureObject<short2> screen, SplatParams params) {
-	const int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const SplatParams &params, TextureObject<int> &depth_out) {
+	const int minX = min(v[0].x, min(v[1].x, v[2].x));
+	const int minY = min(v[0].y, min(v[1].y, v[2].y));
+	const int maxX = max(v[0].x, max(v[1].x, v[2].x));
+	const int maxY = max(v[0].y, max(v[1].y, v[2].y));
 
-    if (x < 1 || x >= depth_in.width()-1 || y < 1 || y >= depth_in.height()-1) return;
+	// Remove really large triangles
+	if ((maxX - minX) * (maxY - minY) <= params.triangle_limit) {
+		// TODO: Verify that < is correct, was <= before but < is faster.
+		for (int sy=minY; sy < maxY; ++sy) {
+			for (int sx=minX; sx < maxX; ++sx) {
+				//if () continue;
+
+				float3 baryCentricCoordinate = calculateBarycentricCoordinate(v, make_short2(sx, sy));
+
+				if (sx < params.camera.width && sx >= 0 && sy < params.camera.height && sy >= 0 && isBarycentricCoordInBounds(baryCentricCoordinate)) {
+					float new_depth = getZAtCoordinate(baryCentricCoordinate, d);
+					atomicMin(&depth_out(sx,sy), int(new_depth*100000.0f));
+				}
+			}
+		}
+	}
+}
 
-    float d[3];
-    d[0] = depth_in.tex2D(x,y);
+/**
+ * Returns true when d[0] lies within [camera.minDepth, camera.maxDepth] and neither |d[0]-d[1]| nor |d[0]-d[2]| exceeds params.depthThreshold.
+ * A depth difference above the threshold is treated as a discontinuity, hence a triangle should not be drawn between said vertices.
+ * TODO: Use discontinuity mask or some better test here.
+ */
+__device__ inline bool isValidTriangle(const float (&d)[3], const SplatParams &params) {
+	return !(fabs(d[0] - d[1]) > params.depthThreshold || fabs(d[0] - d[2]) > params.depthThreshold || d[0] < params.camera.minDepth || d[0] > params.camera.maxDepth);
+}
+
+/**
+ * Read the other two vertices, at (x+A,y) and (x,y+B), into d[1..2] and v[1..2].
+ * Returns the result of isValidTriangle for the loaded depths.
+ */
+template <int A, int B>
+__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const SplatParams &params, const TextureObject<float> &depth_in, const TextureObject<short2> &screen) {
     d[1] = depth_in.tex2D(x+A,y);
     d[2] = depth_in.tex2D(x,y+B);
-
-    // Is this triangle valid
-	if (fabs(d[0] - d[1]) > params.depthThreshold || fabs(d[0] - d[2]) > params.depthThreshold) return;
-	if (d[0] < params.camera.minDepth || d[0] > params.camera.maxDepth) return;
-
-    short2 v[3];
-    v[0] = screen.tex2D(x,y);
     v[1] = screen.tex2D(x+A,y);
 	v[2] = screen.tex2D(x,y+B);
+	return isValidTriangle(d, params);
+}
 
-	// Attempt to back face cull, but not great
-	//if ((v[1].x - v[0].x) * A < 0 || (v[2].y - v[0].y) * B < 0) return;
-
-	const int minX = min(v[0].x, min(v[1].x, v[2].x));
-	const int minY = min(v[0].y, min(v[1].y, v[2].y));
-	const int maxX = max(v[0].x, max(v[1].x, v[2].x));
-	const int maxY = max(v[0].y, max(v[1].y, v[2].y));
-
-	// Ensure the points themselves are drawn
-	//atomicMin(&depth_out(v[0].x,v[0].y), int(d[0]*100000.0f));
-	//atomicMin(&depth_out(v[1].x,v[1].y), int(d[1]*100000.0f));
-	//atomicMin(&depth_out(v[2].x,v[2].y), int(d[2]*100000.0f));
-
-	// Remove really large triangles
-	if ((maxX - minX) * (maxY - minY) > params.triangle_limit) return;
+/*
+ * For each interior source pixel, draw up to four triangles around it into the output depth map.
+ */
+ __global__ void triangle_render_kernel(
+        TextureObject<float> depth_in,
+        TextureObject<int> depth_out,
+		TextureObject<short2> screen, SplatParams params) {
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	// TODO: Verify that < is correct, was <= before but < is faster.
-	for (int sy=minY; sy < maxY; ++sy) {
-		for (int sx=minX; sx < maxX; ++sx) {
-			if (sx >= params.camera.width || sx < 0 || sy >= params.camera.height || sy < 0) continue;
+    if (x >= 1 && x < depth_in.width()-1 && y >= 1 && y < depth_in.height()-1) {
+		float d[3];
+		d[0] = depth_in.tex2D(x,y);
 
-			float3 baryCentricCoordinate = calculateBarycentricCoordinate(v, make_short2(sx, sy));
+		short2 v[3];
+		v[0] = screen.tex2D(x,y);
 
-			if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
-				float new_depth = getZAtCoordinate(baryCentricCoordinate, d);
-				atomicMin(&depth_out(sx,sy), int(new_depth*100000.0f));
-				//depth_out(sx,sy) = int(new_depth*100000.0f);
-			}
-		}
+		// Draw (optionally) 4 triangles as a diamond pattern around the central point.
+		if (loadTriangle<1,1>(x, y, d, v, params, depth_in, screen)) drawTriangle(d, v, params, depth_out);
+		if (loadTriangle<1,-1>(x, y, d, v, params, depth_in, screen)) drawTriangle(d, v, params, depth_out);
+		if (loadTriangle<-1,1>(x, y, d, v, params, depth_in, screen)) drawTriangle(d, v, params, depth_out);
+		if (loadTriangle<-1,-1>(x, y, d, v, params, depth_in, screen)) drawTriangle(d, v, params, depth_out);
 	}
 }
 
@@ -165,10 +182,7 @@ void ftl::cuda::triangle_render1(TextureObject<float> &depth_in, TextureObject<i
     const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	triangle_render_1_kernel<1,1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
-	triangle_render_1_kernel<1,-1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
-	triangle_render_1_kernel<-1,1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
-	triangle_render_1_kernel<-1,-1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
     cudaSafeCall( cudaGetLastError() );
 }