diff --git a/components/operators/include/ftl/cuda/fixed.hpp b/components/operators/include/ftl/cuda/fixed.hpp
index efb0c09caabd5e41f024679e8e80a2c6ea2b6df3..16e1e6ae0769fbfa16bad8931a58ac23ff43cb7b 100644
--- a/components/operators/include/ftl/cuda/fixed.hpp
+++ b/components/operators/include/ftl/cuda/fixed.hpp
@@ -3,7 +3,7 @@
 
 template <int FRAC>
 __device__ inline float fixed2float(short v) {
-    return v / (1 << FRAC);
+    return float(v) / float(1 << FRAC);
 }
 
 template <int FRAC>
diff --git a/components/renderers/cpp/include/ftl/render/CUDARender.hpp b/components/renderers/cpp/include/ftl/render/CUDARender.hpp
index a4e65bce6ba1aa40dbce3c4a8ba24747f258300a..3b933821450aed45c2c91aebf3ee8f84ed8755ac 100644
--- a/components/renderers/cpp/include/ftl/render/CUDARender.hpp
+++ b/components/renderers/cpp/include/ftl/render/CUDARender.hpp
@@ -59,8 +59,8 @@ class CUDARender : public ftl::render::FSRenderer {
 	ftl::cuda::TextureObject<int> contrib_;
 	//ftl::cuda::TextureObject<half4> normals_;
 
-	std::list<ftl::cuda::TextureObject<short2>*> screen_buffers_;
-	std::list<ftl::cuda::TextureObject<float>*> depth_buffers_;
+	std::list<cv::cuda::GpuMat*> screen_buffers_;
+	std::list<cv::cuda::GpuMat*> depth_buffers_;
 	ftl::cuda::TextureObject<float> depth_out_;
 
 	ftl::cuda::Collision *collisions_;
@@ -118,8 +118,8 @@ class CUDARender : public ftl::render::FSRenderer {
 	bool _alreadySeen() const { return last_frame_ == scene_->timestamp(); }
 	void _adjustDepthThresholds(const ftl::rgbd::Camera &fcam);
 
-	ftl::cuda::TextureObject<float> &_getDepthBuffer(const cv::Size &);
-	ftl::cuda::TextureObject<short2> &_getScreenBuffer(const cv::Size &);
+	cv::cuda::GpuMat &_getDepthBuffer(const cv::Size &);
+	cv::cuda::GpuMat &_getScreenBuffer(const cv::Size &);
 
 	inline ftl::codecs::Channel _getDepthChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Depth : ftl::codecs::Channel::Depth2; }
 	inline ftl::codecs::Channel _getNormalsChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Normals : ftl::codecs::Channel::Normals2; }
diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp
index c274e136fd5348ce7c982a26e51e151e99679aec..418ffd9132cec9e3013f8d9ca0934f9aafa7edbe 100644
--- a/components/renderers/cpp/src/CUDARender.cpp
+++ b/components/renderers/cpp/src/CUDARender.cpp
@@ -218,20 +218,20 @@ void CUDARender::_adjustDepthThresholds(const ftl::rgbd::Camera &fcam) {
 	params_.depthCoef = fcam.baseline*fcam.fx;
 }
 
-ftl::cuda::TextureObject<float> &CUDARender::_getDepthBuffer(const cv::Size &size) {
+cv::cuda::GpuMat &CUDARender::_getDepthBuffer(const cv::Size &size) {
 	for (auto *b : depth_buffers_) {
-		if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b;
+		if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b;
 	}
-	auto *nb = new ftl::cuda::TextureObject<float>(size.width, size.height);
+	auto *nb = new cv::cuda::GpuMat(size, CV_16S);
 	depth_buffers_.push_back(nb);
 	return *nb;
 }
 
-ftl::cuda::TextureObject<short2> &CUDARender::_getScreenBuffer(const cv::Size &size) {
+cv::cuda::GpuMat &CUDARender::_getScreenBuffer(const cv::Size &size) {
 	for (auto *b : screen_buffers_) {
-		if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b;
+		if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b;
 	}
-	auto *nb = new ftl::cuda::TextureObject<short2>(size.width, size.height);
+	auto *nb = new cv::cuda::GpuMat(size, CV_16SC2);
 	screen_buffers_.push_back(nb);
 	return *nb;
 }
@@ -284,14 +284,14 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		if (use_depth) {
 			if (f.hasChannel(Channel::Depth)) {
 				ftl::cuda::screen_coord(
-					f.createTexture<float>(Channel::Depth),
+					f.get<cv::cuda::GpuMat>(Channel::Depth),
 					depthbuffer,
 					screenbuffer,
 					params_, transform, f.getLeftCamera(), stream
 				);
 			} else if (f.hasChannel(Channel::GroundTruth)) {
 				ftl::cuda::screen_coord(
-					f.createTexture<float>(Channel::GroundTruth),
+					f.get<cv::cuda::GpuMat>(Channel::GroundTruth),
 					depthbuffer,
 					screenbuffer,
 					params_, transform, f.getLeftCamera(), stream
@@ -316,7 +316,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		// Decide on and render triangles around each point
 		ftl::cuda::triangle_render1(
 			depthbuffer,
-			temp_.createTexture<int>((do_blend) ? Channel::Depth : Channel::Depth2),
+			temp_.create<cv::cuda::GpuMat>((do_blend) ? Channel::Depth : Channel::Depth2),
 			screenbuffer,
 			params_, stream
 		);
@@ -357,7 +357,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 		);
 	} else {
 		//ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), out.createTexture<float>(_getDepthChannel()), 1.0f / 100000.0f, stream_);
-		ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_);
+		ftl::cuda::merge_convert_depth(temp_.createTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_);
 	}
 
 	// Now merge new render to any existing frameset render, detecting collisions
diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu
index 15473b9ad4781957b1b9f1c6949469db1c6d230c..1fda3275953bccf54be24cc4b68d7e046d30f6cf 100644
--- a/components/renderers/cpp/src/screen.cu
+++ b/components/renderers/cpp/src/screen.cu
@@ -2,6 +2,7 @@
 #include "splatter_cuda.hpp"
 #include <ftl/rgbd/camera.hpp>
 #include <ftl/cuda_common.hpp>
+#include <ftl/cuda/fixed.hpp>
 
 using ftl::rgbd::Camera;
 using ftl::cuda::TextureObject;
@@ -34,29 +35,23 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters
 	return make_uint2(params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos)));
 }
 
-/*template <>
-__device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters &params, const float3 &camPos) {
-	float2 pt =  params.camera.camToScreen<float2>(camPos); //params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos));
-	const float coeff = 1.0f / (params.viewport.warpMatrix.entries[6] * pt.x + params.viewport.warpMatrix.entries[7] * pt.y + params.viewport.warpMatrix.entries[8]);
-	const float xcoo = coeff * (params.viewport.warpMatrix.entries[0] * pt.x + params.viewport.warpMatrix.entries[1] * pt.y + params.viewport.warpMatrix.entries[2]);
-	const float ycoo = coeff * (params.viewport.warpMatrix.entries[3] * pt.x + params.viewport.warpMatrix.entries[4] * pt.y + params.viewport.warpMatrix.entries[5]);
-	return make_uint2(xcoo, ycoo);
-}*/
-
 /*
  * Convert source screen position to output screen coordinates.
  */
  template <ftl::render::ViewPortMode VPMODE, Projection PROJECT>
- __global__ void screen_coord_kernel(TextureObject<float> depth,
-        TextureObject<float> depth_out,
-		TextureObject<short2> screen_out, Parameters params, float4x4 pose, Camera camera) {
+ __global__ void screen_coord_kernel(
+	const float* __restrict__ depth,
+	short* __restrict__ depth_out,
+	short2* __restrict__ screen_out,
+    int pitch4, int pitch2, Parameters params, float4x4 pose, Camera camera)
+{
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	if (x >= 0 && y >= 0 && x < depth.width() && y < depth.height()) {
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
 		//uint2 screenPos = make_uint2(30000,30000);
 
-		const float d = depth.tex2D(x, y);
+		const float d = depth[y*pitch4+x];
 
 		// Find the virtual screen position of current point
 		const float3 camPos =  (d > camera.minDepth && d < camera.maxDepth) ? pose * camera.screenToCam(x,y,d) : make_float3(0.0f,0.0f,0.0f);
@@ -69,34 +64,42 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters
 				screenPos.x >= params.camera.width ||
 				screenPos.y >= params.camera.height)
 			screenPos = make_float3(30000,30000,0);
-		screen_out(x,y) = make_short2(screenPos.x, screenPos.y);
-		depth_out(x,y) = screenPos.z;
+
+		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
+		depth_out[y*pitch2+x] = float2fixed<10>(screenPos.z);
 	}
 }
 
-void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &depth_out,
-		TextureObject<short2> &screen_out, const Parameters &params,
+void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out, const Parameters &params,
 		const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
-    const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+    const dim3 gridSize((depth.cols + THREADS_X - 1)/THREADS_X, (depth.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+	
+	depth_out.create(depth.size(), CV_16S);
+	screen_out.create(depth.size(), CV_16SC2);
 
 	if (params.projection == Projection::PERSPECTIVE) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	} else if (params.projection == Projection::EQUIRECTANGULAR) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	} else if (params.projection == Projection::ORTHOGRAPHIC) {
 		switch (params.viewPortMode) {
-		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
-		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break;
+		case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
+		case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break;
 		}
 	}
 	cudaSafeCall( cudaGetLastError() );
@@ -109,12 +112,16 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &
  * Convert source screen position to output screen coordinates. Assumes a
  * constant depth of 1m instead of using a depth channel input.
  */
- __global__ void screen_coord_kernel(TextureObject<float> depth_out,
-		TextureObject<short2> screen_out, Camera vcamera, float4x4 pose, Camera camera) {
+ __global__ void screen_coord_kernel(
+	 	short* __restrict__ depth_out,
+		short2* __restrict__ screen_out,
+		int pitch4, int pitch2,
+		Camera vcamera, float4x4 pose, Camera camera) {
+
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) {
+	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
 		//uint2 screenPos = make_uint2(30000,30000);
 		const float d = camera.maxDepth;
 
@@ -128,15 +135,21 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &
 				screenPos.y >= vcamera.height)
 			screenPos = make_uint2(30000,30000);
 
-		screen_out(x,y) = make_short2(screenPos.x, screenPos.y);
-		depth_out(x,y) = camPos.z;
+		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
+		depth_out[y*pitch2+x] = float2fixed<10>(camPos.z);
 	}
 }
 
-void ftl::cuda::screen_coord(TextureObject<float> &depth_out, TextureObject<short2> &screen_out, const Parameters &params, const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
-	const dim3 gridSize((screen_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+void ftl::cuda::screen_coord(cv::cuda::GpuMat &depth_out, cv::cuda::GpuMat &screen_out, const Parameters &params, const float4x4 &pose, const Camera &camera, cudaStream_t stream) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+    const dim3 gridSize((camera.width + THREADS_X - 1)/THREADS_X, (camera.height + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+
+	depth_out.create(camera.height, camera.width, CV_16S);
+	screen_out.create(camera.height, camera.width, CV_16SC2);
 
-	screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, screen_out, params.camera, pose, camera);
+	screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out.ptr<short>(), screen_out.ptr<short2>(), screen_out.step1()/2, depth_out.step1(), params.camera, pose, camera);
 	cudaSafeCall( cudaGetLastError() );
 }
diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp
index 147ccf99479674f26ab6b7b5d86ee493c18f9123..e8be62b96f28baa390823575ad8bcd3572a6a278 100644
--- a/components/renderers/cpp/src/splatter_cuda.hpp
+++ b/components/renderers/cpp/src/splatter_cuda.hpp
@@ -7,26 +7,26 @@
 namespace ftl {
 namespace cuda {
 	void screen_coord(
-		ftl::cuda::TextureObject<float> &depth,
-		ftl::cuda::TextureObject<float> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen_out,
+		const cv::cuda::GpuMat &depth,
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out,
 		const ftl::render::Parameters &params,
 		const float4x4 &pose,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);
 
 	void screen_coord(
-		ftl::cuda::TextureObject<float> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen_out,
+		cv::cuda::GpuMat &depth_out,
+		cv::cuda::GpuMat &screen_out,
 		const ftl::render::Parameters &params,
 		const float4x4 &pose,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);
 
 	void triangle_render1(
-		ftl::cuda::TextureObject<float> &depth_in,
-		ftl::cuda::TextureObject<int> &depth_out,
-		ftl::cuda::TextureObject<short2> &screen,
+		const cv::cuda::GpuMat &depth_in,  // short
+		cv::cuda::GpuMat &depth_out, // int
+		const cv::cuda::GpuMat &screen, // short2
 		const ftl::render::Parameters &params,
 		cudaStream_t stream);
 
diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu
index 3b45d1315d2ef3bc486fe97acc7f7a3a6f163e0c..40ee1250e2b05e90c8f4d8122022493925ced613 100644
--- a/components/renderers/cpp/src/triangle_render.cu
+++ b/components/renderers/cpp/src/triangle_render.cu
@@ -2,6 +2,7 @@
 #include "splatter_cuda.hpp"
 #include <ftl/rgbd/camera.hpp>
 #include <ftl/cuda_common.hpp>
+#include <ftl/cuda/fixed.hpp>
 
 #include <ftl/cuda/weighting.hpp>
 
@@ -107,7 +108,7 @@ float getZAtCoordinate(const float3 &barycentricCoord, const float (&tri)[3]) {
  * being inside or outside (using bary centric coordinate method). If inside
  * then atomically write to the depth map.
  */
-__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters &params, TextureObject<int> &depth_out) {
+__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters &params, int* depth_out, int out_pitch4) {
 	const int minX = min(v[0].x, min(v[1].x, v[2].x));
 	const int minY = min(v[0].y, min(v[1].y, v[2].y));
 	const int maxX = max(v[0].x, max(v[1].x, v[2].x));
@@ -124,7 +125,7 @@ __device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Pa
 
 				if (sx < params.camera.width && sx >= 0 && sy < params.camera.height && sy >= 0 && isBarycentricCoordInBounds(baryCentricCoordinate)) {
 					float new_depth = getZAtCoordinate(baryCentricCoordinate, d);
-					atomicMin(&depth_out(sx,sy), int(new_depth*100000.0f));
+					atomicMin(&depth_out[sx+sy*out_pitch4], int(new_depth*100000.0f));
 				}
 			}
 		}
@@ -145,11 +146,11 @@ __device__ inline bool isValidTriangle(const short2 (&v)[3]) {
  * which verticies to load.
  */
 template <int A, int B>
-__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const TextureObject<float> &depth_in, const TextureObject<short2> &screen) {
-    d[1] = depth_in.tex2D(x+A,y);
-    d[2] = depth_in.tex2D(x,y+B);
-    v[1] = screen.tex2D(x+A,y);
-	v[2] = screen.tex2D(x,y+B);
+__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const short* __restrict__ depth_in, const short2* __restrict__ screen, int pitch4, int pitch2) {
+    d[1] = fixed2float<10>(depth_in[y*pitch2+x+A]);
+    d[2] = fixed2float<10>(depth_in[(y+B)*pitch2+x]);
+    v[1] = screen[y*pitch4+x+A];
+	v[2] = screen[(y+B)*pitch4+x];
 	return isValidTriangle(v);
 }
 
@@ -157,37 +158,44 @@ __device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const
  * Convert source screen position to output screen coordinates.
  */
  __global__ void triangle_render_kernel(
-        TextureObject<float> depth_in,
-        TextureObject<int> depth_out,
-		TextureObject<short2> screen, Parameters params) {
+        const short* __restrict__ depth_in,
+        int* depth_out,
+		const short2* __restrict__ screen,
+		int width, int height, int pitch2, int pitch4, int out_pitch4, Parameters params) {
+
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-    if (x >= 1 && x < depth_in.width()-1 && y >= 1 && y < depth_in.height()-1) {
+    if (x >= 1 && x < width-1 && y >= 1 && y < height-1) {
 		float d[3];
-		d[0] = depth_in.tex2D(x,y);
+		d[0] = fixed2float<10>(depth_in[y*pitch2+x]);
 
 		short2 v[3];
-		v[0] = screen.tex2D(x,y);
+		v[0] = screen[y*pitch4+x];
 
 		if (v[0].x < 30000) {
 			// Calculate discontinuity threshold.
 			//const float threshold = (params.depthCoef / ((params.depthCoef / d[0]) - params.disconDisparities)) - d[0];
 
 			// Draw (optionally) 4 triangles as a diamond pattern around the central point.
-			if (loadTriangle<1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
-			if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out);
+			if (loadTriangle<1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
+			if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4);
 		}
 	}
 }
 
-void ftl::cuda::triangle_render1(TextureObject<float> &depth_in, TextureObject<int> &depth_out, TextureObject<short2> &screen, const Parameters &params, cudaStream_t stream) {
-    const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+void ftl::cuda::triangle_render1(const cv::cuda::GpuMat &depth_in, cv::cuda::GpuMat &depth_out, const cv::cuda::GpuMat &screen, const Parameters &params, cudaStream_t stream) {
+	static constexpr int THREADS_X = 8;
+	static constexpr int THREADS_Y = 8;
+
+	const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
+	
+	depth_out.create(params.camera.height, params.camera.width, CV_32S);
 
-	triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params);
+	triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in.ptr<short>(), depth_out.ptr<int>(), screen.ptr<short2>(), depth_in.cols, depth_in.rows, depth_in.step1(), screen.step1()/2, depth_out.step1(), params);
     cudaSafeCall( cudaGetLastError() );
 }
 
@@ -218,8 +226,6 @@ void ftl::cuda::merge_convert_depth(TextureObject<int> &depth_in, TextureObject<
 
 // ==== Reverse Verify Result ==================================================
 
-// ==== Merge convert ===========
-
 __global__ void reverse_check_kernel(
 	TextureObject<float> depth_in,
 	TextureObject<float> depth_original,
@@ -234,6 +240,7 @@ __global__ void reverse_check_kernel(
 	if (x < 0 || x >= depth_in.width() || y < 0 || y >= depth_in.height()) return;
 
 	float d = depth_in.tex2D(x,y);
+	
 	float3 campos = transformR * vintrin.screenToCam(x,y,d);
 	int2 spos = ointrin.camToScreen<int2>(campos);
 	int ox = spos.x;
@@ -242,11 +249,14 @@ __global__ void reverse_check_kernel(
 	if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) {
 		float d2 = depth_original.tex2D(ox,oy);
 
-		if (d2 < ointrin.maxDepth && d2 - campos.z > 0.002f) {
+		if (d2 < ointrin.maxDepth && d2 - campos.z > d2*0.001f) {
 			//printf("Original %f, %f\n", d2, campos.z);
-			depth_in(x,y) = 0.0f; //(transform * ointrin.screenToCam(ox,oy,d2)).z;
+			//depth_in(x,y) = 1.5f; //(transform * ointrin.screenToCam(ox,oy,d2)).z;
+			d = 0.0f;
 		}
 	}
+
+	depth_in(x,y) = d;
 }
 
 void ftl::cuda::reverse_verify(TextureObject<float> &depth_in, TextureObject<float> &depth_original, const float4x4 &transformR, const float4x4 &transform, const ftl::rgbd::Camera &vintrin, const ftl::rgbd::Camera &ointrin, cudaStream_t stream) {