diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu
index 98ce1bcc0cd7fee6e95a8d02f8e8735bf2142818..96fa70012adb59d992dda651018bef4239cca213 100644
--- a/components/renderers/cpp/src/triangle_render.cu
+++ b/components/renderers/cpp/src/triangle_render.cu
@@ -14,6 +14,7 @@ __device__ inline float length2(int dx, int dy) { return dx*dx + dy*dy; }
 /*
  * Convert source screen position to output screen coordinates.
  */
+ template <int A, int B>
  __global__ void triangle_render_1_kernel(
         TextureObject<float> depth_in,
         TextureObject<int> depth_out,
@@ -24,30 +25,33 @@ __device__ inline float length2(int dx, int dy) { return dx*dx + dy*dy; }
     if (x < 0 || x >= depth_in.width()-1 || y < 0 || y >= depth_in.height()-1) return;
 
     float d[3];
-    d[0] = depth_in.tex2D(x,y);
-    d[1] = depth_in.tex2D(x+1,y);
-    d[2] = depth_in.tex2D(x,y+1);
+    d[0] = depth_in.tex2D(x+A,y+B);
+    d[1] = depth_in.tex2D(x+(1-A),y+B);
+    d[2] = depth_in.tex2D(x+A,y+(1-B));
 
     // Is this triangle valid
     if (fabs(d[0] - d[1]) > 0.04f || fabs(d[0] - d[2]) > 0.04f) return;
 
     short2 s[3];
-    s[0] = screen.tex2D(x,y);
-    s[1] = screen.tex2D(x+1,y);
-    s[2] = screen.tex2D(x,y+1);
-
-    s[1].x -= s[0].x;
+    s[0] = screen.tex2D(x+A,y+B);
+    s[1] = screen.tex2D(x+(1-A),y+B);
+	s[2] = screen.tex2D(x+A,y+(1-B));
+	
+	const int dx = (A) ? -1 : 1;
+	const int dy = (B) ? -1 : 1;
+
+    s[1].x = (A) ? s[0].x - s[1].x : s[1].x - s[0].x;
     s[1].y -= s[0].y;
     s[2].x -= s[0].x;
-    s[2].y -= s[0].y;
+	s[2].y = (B) ? s[0].y - s[2].y : s[2].y - s[0].y;
 
     s[1].x = min(s[1].x,10);
-    s[2].y = min(s[2].y,10);
+	s[2].y = min(s[2].y,10);
 
     for (int sx=0; sx < s[1].x; ++sx) {
         for (int sy=0; sy < min(s[1].x - sx, s[2].y); ++sy) {
 			//if (sx > s[2].y-sy) continue;
-            if (sx+s[0].x >= params.camera.width || sy+s[0].y >= params.camera.height) continue;
+            if (dx*sx+s[0].x < 0 || dx*sx+s[0].x >= params.camera.width || dy*sy+s[0].y < 0 || dy*sy+s[0].y >= params.camera.height) continue; // dx/dy are -1 for A/B == 1, so the target pixel can fall below 0 as well as past the far edge; skip both to keep the atomicMin in bounds
 
             float dist1 = length2(sx,sy);
             float dist2 = length2(s[1].x-sx, s[1].y-sy);
@@ -55,7 +59,7 @@ __device__ inline float length2(int dx, int dy) { return dx*dx + dy*dy; }
 
             float new_depth = (d[0]*dist1 + d[1]*dist2 + d[2] * dist3) / (dist1+dist2+dist3);
 
-            atomicMin(&depth_out(sx+s[0].x,sy+s[0].y), int(new_depth*1000.0f));
+            atomicMin(&depth_out(dx*sx+s[0].x,dy*sy+s[0].y), int(new_depth*1000.0f));
         }
     }
 }
@@ -64,6 +68,9 @@ void ftl::cuda::triangle_render1(TextureObject<float> &depth_in, TextureObject<i
     const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	triangle_render_1_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_1_kernel<0,0><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_1_kernel<0,1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_1_kernel<1,0><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_1_kernel<1,1><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
     cudaSafeCall( cudaGetLastError() );
 }