diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index 00cb8554f5b4dceabe029aaf9c4ea78f499fe885..ca11bf526ef0e6e141861a79a15e2bf656106a5c 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -423,14 +423,14 @@ void ftl::gui::Camera::_downloadFrames(ftl::rgbd::Frame *frame) {
 	} else if (channel_ == Channel::ColourNormals && frame->hasChannel(Channel::Depth)) {
 		// We can calculate normals here.
 		ftl::cuda::normals(
-			frame->createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			frame->createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())),
 			frame->createTexture<float>(Channel::Depth),
 			frame->getLeftCamera(), 0
 		);
 
 		frame->create<GpuMat>(Channel::ColourNormals, ftl::rgbd::Format<uchar4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())).setTo(cv::Scalar(0,0,0,0), cv::cuda::Stream::Null());
 
-		ftl::cuda::normal_visualise(frame->getTexture<float4>(Channel::Normals), frame->createTexture<uchar4>(Channel::ColourNormals),
+		ftl::cuda::normal_visualise(frame->getTexture<half4>(Channel::Normals), frame->createTexture<uchar4>(Channel::ColourNormals),
 				make_float3(0.3f,0.3f,0.3f),
 				make_uchar4(200,200,200,255),
 				make_uchar4(50,50,50,255), 0);
diff --git a/components/common/cpp/include/ftl/cuda_buffer.hpp b/components/common/cpp/include/ftl/cuda_buffer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..45abe03c733d222deec030873753dfa7c6b7c900
--- /dev/null
+++ b/components/common/cpp/include/ftl/cuda_buffer.hpp
@@ -0,0 +1,213 @@
+#ifndef _FTL_CUDA_BUFFER_HPP_
+#define _FTL_CUDA_BUFFER_HPP_
+
+namespace ftl {
+namespace cuda {
+
+class BufferBase {
+	public:
+	__host__ __device__ BufferBase()
+			: pitch_(0), pitch2_(0), width_(0), height_(0),
+			  ptr_(nullptr), needsfree_(false),
+			  cvType_(-1) {};
+	~BufferBase();
+
+	BufferBase(const BufferBase &)=delete;
+	BufferBase &operator=(const BufferBase &)=delete;
+
+	BufferBase(BufferBase &&);
+	BufferBase &operator=(BufferBase &&);
+
+	inline size_t pitch() const { return pitch_; }
+	inline size_t pixelPitch() const { return pitch2_; }
+	inline uchar *devicePtr() const { return ptr_; };
+	__host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; }
+	__host__ __device__ inline int width() const { return width_; }
+	__host__ __device__ inline int height() const { return height_; }
+
+	void upload(const cv::Mat &, cudaStream_t stream=0);
+	void download(cv::Mat &, cudaStream_t stream=0) const;
+	
+	__host__ void free();
+
+	inline int cvType() const { return cvType_; }
+	
+	protected:
+	size_t pitch_;
+	size_t pitch2_; 		// in T units
+	int width_;
+	int height_;
+	uchar *ptr_;			// Device memory pointer
+	bool needsfree_;		// We manage memory, so free it
+	int cvType_;				// Used to validate casting
+};
+
+
+template <typename T>
+class Buffer : public BufferBase {
+	public:
+	typedef T type;
+
+	__host__ __device__ Buffer() : BufferBase() {};
+	explicit Buffer(const cv::cuda::GpuMat &d);
+	explicit Buffer(const cv::cuda::PtrStepSz<T> &d);
+	Buffer(T *ptr, int pitch, int width, int height);
+	Buffer(size_t width, size_t height);
+	Buffer(const Buffer<T> &t);
+	__host__ __device__ Buffer(Buffer<T> &&);
+	~Buffer();
+
+	Buffer<T> &operator=(const Buffer<T> &);
+	__host__ __device__ Buffer<T> &operator=(Buffer<T> &&);
+
+	operator cv::cuda::GpuMat();
+
+	void create(const cv::Size &);
+	void create(int w, int h);
+
+	__host__ __device__ T *devicePtr() const { return (T*)(ptr_); };
+	__host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; }
+
+	__host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+	__host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+
+	/**
+	 * Cast a base texture object to this type of texture object. If the
+	 * underlying pixel types do not match then a bad_cast exception is thrown.
+	 */
+	static Buffer<T> &cast(BufferBase &);
+};
+
+#ifndef __CUDACC__
+template <typename T>
+Buffer<T> &Buffer<T>::cast(BufferBase &b) {
+	if (b.cvType() != ftl::traits::OpenCVType<T>::value) {
+		//LOG(ERROR) << "Bad cast of texture object";
+		throw std::bad_cast();
+	}
+	return reinterpret_cast<Buffer<T>&>(b);
+}
+
+/**
+ * Create a 2D array texture from an OpenCV GpuMat object.
+ */
+template <typename T>
+Buffer<T>::Buffer(const cv::cuda::GpuMat &d) {
+	pitch_ = d.step;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = d.data;
+	width_ = d.cols;
+	height_ = d.rows;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+#endif  // __CUDACC__
+
+template <typename T>
+Buffer<T>::Buffer(const cv::cuda::PtrStepSz<T> &d) {
+	pitch_ = d.step;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = d.data;
+	width_ = d.cols;
+	height_ = d.rows;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+template <typename T>
+Buffer<T>::Buffer(T *ptr, int pitch, int width, int height) {
+	pitch_ = pitch;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = ptr;
+	width_ = width;
+	height_ = height;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+template <typename T>
+Buffer<T>::Buffer(size_t width, size_t height) {
+	cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height);
+	width_ = (int)width;
+	height_ = (int)height;
+	needsfree_ = true;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+#ifndef __CUDACC__
+template <typename T>
+void Buffer<T>::create(const cv::Size &s) {
+	create(s.width, s.height);
+}
+
+template <typename T>
+void Buffer<T>::create(int w, int h) {
+	if (width_ != w || height_ != h) {
+		*this = std::move(Buffer<T>(w, h));
+	}
+}
+#endif
+
+template <typename T>
+Buffer<T>::Buffer(const Buffer<T> &p) {
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsfree_ = false;
+}
+
+template <typename T>
+Buffer<T>::Buffer(Buffer<T> &&p) {
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	needsfree_ = p.needsfree_;
+	p.needsfree_ = false;
+	p.ptr_ = nullptr;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+template <typename T>
+Buffer<T> &Buffer<T>::operator=(const Buffer<T> &p) {
+	free();
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsfree_ = false;
+	return *this;
+}
+
+template <typename T>
+Buffer<T> &Buffer<T>::operator=(Buffer<T> &&p) {
+	free();
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	needsfree_ = p.needsfree_;
+	p.needsfree_ = false;
+	p.ptr_ = nullptr;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	return *this;
+}
+
+template <typename T>
+Buffer<T>::~Buffer() {
+	free();
+}
+
+}
+}
+
+#endif
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index c74c4faa0868c3e352a41e77369daa2ea24bd78e..5c3cc9f484ecbaf615cf44469e3c37b636a4f7e6 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -9,6 +9,9 @@
 #include <ftl/cuda_util.hpp>
 #include <opencv2/core/cuda.hpp>
 #include <opencv2/core/cuda/common.hpp>
+#include <ftl/cuda_half.hpp>
+#include <ftl/cuda_texture.hpp>
+#include <ftl/cuda_buffer.hpp>
 
 #ifndef __CUDACC__
 #include <exception>
@@ -33,355 +36,6 @@ void setDevice(int);
 
 void setDevice();
 
-template <typename T>
-struct Float;
-
-template <> struct Float<float> { typedef float type; };
-template <> struct Float<int> { typedef float type; };
-template <> struct Float<float4> { typedef float4 type; };
-template <> struct Float<uchar4> { typedef float4 type; };
-template <> struct Float<uint8_t> { typedef float type; };
-template <> struct Float<short2> { typedef float2 type; };
-template <> struct Float<short> { typedef float type; };
-
-template <typename T>
-struct ScaleValue;
-
-template <> struct ScaleValue<uchar4> { static constexpr float value = 255.0f; };
-template <> struct ScaleValue<uint8_t> { static constexpr float value = 255.0f; };
-template <> struct ScaleValue<float> { static constexpr float value = 1.0f; };
-template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; };
-template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; };
-
-/**
- * Represent a CUDA texture object. Instances of this class can be used on both
- * host and device. A texture object base cannot be constructed directly, it
- * must be constructed via a template TextureObject class.
- */
-class TextureObjectBase {
-	public:
-	__host__ __device__ TextureObjectBase()
-			: texobj_(0), pitch_(0), pitch2_(0), width_(0), height_(0),
-			  ptr_(nullptr), needsfree_(false), needsdestroy_(false),
-			  cvType_(-1) {};
-	~TextureObjectBase();
-
-	// Remove ability to copy object directly, instead must use
-	// templated derivative TextureObject.
-	TextureObjectBase(const TextureObjectBase &)=delete;
-	TextureObjectBase &operator=(const TextureObjectBase &)=delete;
-
-	TextureObjectBase(TextureObjectBase &&);
-	TextureObjectBase &operator=(TextureObjectBase &&);
-
-	inline size_t pitch() const { return pitch_; }
-	inline size_t pixelPitch() const { return pitch2_; }
-	inline uchar *devicePtr() const { return ptr_; };
-	__host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; }
-	__host__ __device__ inline int width() const { return width_; }
-	__host__ __device__ inline int height() const { return height_; }
-	__host__ __device__ inline cudaTextureObject_t cudaTexture() const { return texobj_; }
-
-	void upload(const cv::Mat &, cudaStream_t stream=0);
-	void download(cv::Mat &, cudaStream_t stream=0) const;
-	
-	__host__ void free();
-
-	inline int cvType() const { return cvType_; }
-	
-	protected:
-	cudaTextureObject_t texobj_;
-	size_t pitch_;
-	size_t pitch2_; 		// in T units
-	int width_;
-	int height_;
-	uchar *ptr_;			// Device memory pointer
-	bool needsfree_;		// We manage memory, so free it
-	bool needsdestroy_;		// The texture object needs to be destroyed
-	int cvType_;				// Used to validate casting
-};
-
-/**
- * Create and manage CUDA texture objects with a particular pixel data type.
- * Note: it is not possible to create texture objects for certain types,
- * specificially for 3 channel types.
- */
-template <typename T>
-class TextureObject : public TextureObjectBase {
-	public:
-	typedef T type;
-
-	static_assert((16u % sizeof(T)) == 0, "Channel format must be aligned with 16 bytes");
-
-	__host__ __device__ TextureObject() : TextureObjectBase() {};
-	explicit TextureObject(const cv::cuda::GpuMat &d, bool interpolated=false);
-	explicit TextureObject(const cv::cuda::PtrStepSz<T> &d);
-	TextureObject(T *ptr, int pitch, int width, int height);
-	TextureObject(size_t width, size_t height);
-	TextureObject(const TextureObject<T> &t);
-	__host__ __device__ TextureObject(TextureObject<T> &&);
-	~TextureObject();
-
-	TextureObject<T> &operator=(const TextureObject<T> &);
-	__host__ __device__ TextureObject<T> &operator=(TextureObject<T> &&);
-
-	operator cv::cuda::GpuMat();
-
-	void create(const cv::Size &);
-	void create(int w, int h);
-
-	__host__ __device__ T *devicePtr() const { return (T*)(ptr_); };
-	__host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; }
-
-	#ifdef __CUDACC__
-	__device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); }
-	__device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); }
-	__device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; }
-	#endif
-
-	__host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
-	__host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
-
-	/**
-	 * Cast a base texture object to this type of texture object. If the
-	 * underlying pixel types do not match then a bad_cast exception is thrown.
-	 */
-	static TextureObject<T> &cast(TextureObjectBase &);
-};
-
-#ifndef __CUDACC__
-template <typename T>
-TextureObject<T> &TextureObject<T>::cast(TextureObjectBase &b) {
-	if (b.cvType() != ftl::traits::OpenCVType<T>::value) {
-		//LOG(ERROR) << "Bad cast of texture object";
-		throw std::bad_cast();
-	}
-	return reinterpret_cast<TextureObject<T>&>(b);
-}
-
-/**
- * Create a 2D array texture from an OpenCV GpuMat object.
- */
-template <typename T>
-TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) {
-	// GpuMat must have correct data type
-	//CHECK(d.type() == ftl::traits::OpenCVType<T>::value);
-
-	cudaResourceDesc resDesc;
-	memset(&resDesc, 0, sizeof(resDesc));
-	resDesc.resType = cudaResourceTypePitch2D;
-	resDesc.res.pitch2D.devPtr = d.data;
-	resDesc.res.pitch2D.pitchInBytes = d.step;
-	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-	resDesc.res.pitch2D.width = d.cols;
-	resDesc.res.pitch2D.height = d.rows;
-
-	cudaTextureDesc texDesc;
-	// cppcheck-suppress memsetClassFloat
-	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.readMode = (interpolated) ? cudaReadModeNormalizedFloat : cudaReadModeElementType;
-	if (interpolated) texDesc.filterMode = cudaFilterModeLinear;
-
-	cudaTextureObject_t tex = 0;
-	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-	texobj_ = tex;
-	pitch_ = d.step;
-	pitch2_ = pitch_ / sizeof(T);
-	ptr_ = d.data;
-	width_ = d.cols;
-	height_ = d.rows;
-	needsfree_ = false;
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsdestroy_ = true;
-}
-
-#endif  // __CUDACC__
-
-/**
- * Create a 2D array texture from an OpenCV GpuMat object.
- */
-template <typename T>
-TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
-	cudaResourceDesc resDesc;
-	memset(&resDesc, 0, sizeof(resDesc));
-	resDesc.resType = cudaResourceTypePitch2D;
-	resDesc.res.pitch2D.devPtr = d.data;
-	resDesc.res.pitch2D.pitchInBytes = d.step;
-	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-	resDesc.res.pitch2D.width = d.cols;
-	resDesc.res.pitch2D.height = d.rows;
-
-	cudaTextureDesc texDesc;
-	// cppcheck-suppress memsetClassFloat
-	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.readMode = cudaReadModeElementType;
-	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
-
-	cudaTextureObject_t tex = 0;
-	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-	texobj_ = tex;
-	pitch_ = d.step;
-	pitch2_ = pitch_ / sizeof(T);
-	ptr_ = d.data;
-	width_ = d.cols;
-	height_ = d.rows;
-	needsfree_ = false;
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsdestroy_ = true;
-}
-
-/**
- * Create a 2D array texture object using a cudaMallocPitch device pointer.
- * The texture object returned must be destroyed by the caller.
- */
-template <typename T>
-TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
-	cudaResourceDesc resDesc;
-	memset(&resDesc, 0, sizeof(resDesc));
-	resDesc.resType = cudaResourceTypePitch2D;
-	resDesc.res.pitch2D.devPtr = ptr;
-	resDesc.res.pitch2D.pitchInBytes = pitch;
-	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-	resDesc.res.pitch2D.width = width;
-	resDesc.res.pitch2D.height = height;
-
-	cudaTextureDesc texDesc;
-	// cppcheck-suppress memsetClassFloat
-	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.readMode = cudaReadModeElementType;
-	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
-
-	cudaTextureObject_t tex = 0;
-	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-	texobj_ = tex;
-	pitch_ = pitch;
-	pitch2_ = pitch_ / sizeof(T);
-	ptr_ = ptr;
-	width_ = width;
-	height_ = height;
-	needsfree_ = false;
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsdestroy_ = true;
-}
-
-template <typename T>
-TextureObject<T>::TextureObject(size_t width, size_t height) {
-	cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height);
-	cudaTextureObject_t tex = 0;
-
-	// Must be an even
-	//if (!(sizeof(T) & 0x1)) {
-		cudaResourceDesc resDesc;
-		memset(&resDesc, 0, sizeof(resDesc));
-		resDesc.resType = cudaResourceTypePitch2D;
-		resDesc.res.pitch2D.devPtr = ptr_;
-		resDesc.res.pitch2D.pitchInBytes = pitch_;
-		resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-		resDesc.res.pitch2D.width = width;
-		resDesc.res.pitch2D.height = height;
-
-		cudaTextureDesc texDesc;
-		// cppcheck-suppress memsetClassFloat
-		memset(&texDesc, 0, sizeof(texDesc));
-		texDesc.readMode = cudaReadModeElementType;
-		//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
-		cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-	//}
-
-	texobj_ = tex;
-	width_ = (int)width;
-	height_ = (int)height;
-	needsfree_ = true;
-	pitch2_ = pitch_ / sizeof(T);
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsdestroy_ = true;
-}
-
-#ifndef __CUDACC__
-template <typename T>
-void TextureObject<T>::create(const cv::Size &s) {
-	create(s.width, s.height);
-}
-
-template <typename T>
-void TextureObject<T>::create(int w, int h) {
-	if (width_ != w || height_ != h) {
-		*this = std::move(TextureObject<T>(w, h));
-	}
-}
-#endif
-
-template <typename T>
-TextureObject<T>::TextureObject(const TextureObject<T> &p) {
-	texobj_ = p.texobj_;
-	ptr_ = p.ptr_;
-	width_ = p.width_;
-	height_ = p.height_;
-	pitch_ = p.pitch_;
-	pitch2_ = pitch_ / sizeof(T);
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsfree_ = false;
-	needsdestroy_ = false;
-}
-
-template <typename T>
-TextureObject<T>::TextureObject(TextureObject<T> &&p) {
-	texobj_ = p.texobj_;
-	ptr_ = p.ptr_;
-	width_ = p.width_;
-	height_ = p.height_;
-	pitch_ = p.pitch_;
-	pitch2_ = pitch_ / sizeof(T);
-	needsfree_ = p.needsfree_;
-	needsdestroy_ = p.needsdestroy_;
-	p.texobj_ = 0;
-	p.needsfree_ = false;
-	p.needsdestroy_ = false;
-	p.ptr_ = nullptr;
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-}
-
-template <typename T>
-TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) {
-	free();
-	texobj_ = p.texobj_;
-	ptr_ = p.ptr_;
-	width_ = p.width_;
-	height_ = p.height_;
-	pitch_ = p.pitch_;
-	pitch2_ = pitch_ / sizeof(T);
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	needsfree_ = false;
-	needsdestroy_ = false;
-	return *this;
-}
-
-template <typename T>
-TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) {
-	free();
-	texobj_ = p.texobj_;
-	ptr_ = p.ptr_;
-	width_ = p.width_;
-	height_ = p.height_;
-	pitch_ = p.pitch_;
-	pitch2_ = pitch_ / sizeof(T);
-	needsfree_ = p.needsfree_;
-	needsdestroy_ = p.needsdestroy_;
-	p.texobj_ = 0;
-	p.needsfree_ = false;
-	p.needsdestroy_ = false;
-	p.ptr_ = nullptr;
-	cvType_ = ftl::traits::OpenCVType<T>::value;
-	return *this;
-}
-
-template <typename T>
-TextureObject<T>::~TextureObject() {
-	//if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
-	//if (needsfree_) cudaFree(ptr_);
-	free();
-}
-
 /**
  * Read a texture value using coordinates in the range of `b`, but from the
  * texture `a` which may have a different resolution.
diff --git a/components/common/cpp/include/ftl/cuda_half.hpp b/components/common/cpp/include/ftl/cuda_half.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f02b2d5ba561ef928cab165554d923cdd1f990eb
--- /dev/null
+++ b/components/common/cpp/include/ftl/cuda_half.hpp
@@ -0,0 +1,110 @@
+#ifndef _FTL_CUDA_HALF_HPP_
+#define _FTL_CUDA_HALF_HPP_
+
+#include <cuda_fp16.h>
+
+struct half4 {
+	half2 a;
+	half2 b;
+};
+
+static_assert(sizeof(half4) == 8, "Incorrect half4 size");
+
+template <>
+__host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half4>() {
+	return cudaCreateChannelDesc<short4>();
+}
+
+#ifdef __CUDACC__
+
+// half4 functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ half4 make_half4(half x, half y, half z, half w) {
+	half4 h;
+	h.a.x = x;
+	h.a.y = y;
+	h.b.x = z;
+	h.b.y = w;
+	return h;
+}
+
+inline __device__ half4 make_half4(half2 a, half2 b) {
+	half4 h;
+	h.a = a;
+	h.b = b;
+	return h;
+}
+
+inline __device__ half4 make_half4(float x, float y, float z, float w) {
+	half4 h;
+	h.a = __floats2half2_rn(x,y);
+	h.b = __floats2half2_rn(z,w);
+	return h;
+}
+
+inline __device__ half4 make_half4(const float4 &v) {
+	half4 h;
+	h.a = __floats2half2_rn(v.x,v.y);
+	h.b = __floats2half2_rn(v.z,v.w);
+	return h;
+}
+
+inline __device__ half4 make_half4(const float3 &v, float a) {
+	half4 h;
+	h.a = __floats2half2_rn(v.x,v.y);
+	h.b = __floats2half2_rn(v.z,a);
+	return h;
+}
+
+inline __device__ half4 make_half4(float a)
+{
+    return make_half4(a,a,a,a);
+}
+
+inline __device__ float4 make_float4(const half4 &v)
+{
+	union {
+		struct {
+			float2 a;
+			float2 b;
+		} x;
+		float4 f; 
+	};
+	x.a = __half22float2(v.a);
+	x.b = __half22float2(v.b);
+	return f;
+}
+
+inline __device__ float3 make_float3(const half4 &v)
+{
+	union {
+		struct {
+			float2 a;
+			float b;
+		} x;
+		float3 f; 
+	};
+	x.a = __half22float2(v.a);
+	x.b = __half2float(v.b.x);
+	return f;
+}
+
+
+
+// === Texture overloads =======================================================
+
+template <>
+__device__ inline half4 tex2D<half4>(cudaTextureObject_t o, float x, float y) {
+	union U {
+		__device__ inline U(const short4 &s) : t(s) {}
+		short4 t;
+		half4 h;
+	};
+	return U(tex2D<short4>(o, x, y)).h;
+}
+
+#endif
+
+
+#endif
\ No newline at end of file
diff --git a/components/common/cpp/include/ftl/cuda_texture.hpp b/components/common/cpp/include/ftl/cuda_texture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e103b049750b0a39087fcd4dbd21100ddb7815b5
--- /dev/null
+++ b/components/common/cpp/include/ftl/cuda_texture.hpp
@@ -0,0 +1,361 @@
+#ifndef _FTL_CUDA_TEXTURE_HPP_
+#define _FTL_CUDA_TEXTURE_HPP_
+
+namespace ftl {
+namespace cuda {
+
+template <typename T>
+struct Float;
+
+template <> struct Float<float> { typedef float type; };
+template <> struct Float<int> { typedef float type; };
+template <> struct Float<float4> { typedef float4 type; };
+template <> struct Float<half4> { typedef float4 type; };
+template <> struct Float<uchar4> { typedef float4 type; };
+template <> struct Float<uint8_t> { typedef float type; };
+template <> struct Float<short2> { typedef float2 type; };
+template <> struct Float<short> { typedef float type; };
+
+template <typename T>
+struct ScaleValue;
+
+template <> struct ScaleValue<uchar4> { static constexpr float value = 255.0f; };
+template <> struct ScaleValue<uint8_t> { static constexpr float value = 255.0f; };
+template <> struct ScaleValue<float> { static constexpr float value = 1.0f; };
+template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; };
+template <> struct ScaleValue<half4> { static constexpr float value = 1.0f; };
+template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; };
+
+/**
+ * Represent a CUDA texture object. Instances of this class can be used on both
+ * host and device. A texture object base cannot be constructed directly, it
+ * must be constructed via a template TextureObject class.
+ */
+class TextureObjectBase {
+	public:
+	__host__ __device__ TextureObjectBase()
+			: texobj_(0), pitch_(0), pitch2_(0), width_(0), height_(0),
+			  ptr_(nullptr), needsfree_(false), needsdestroy_(false),
+			  cvType_(-1) {};
+	~TextureObjectBase();
+
+	// Remove ability to copy object directly, instead must use
+	// templated derivative TextureObject.
+	TextureObjectBase(const TextureObjectBase &)=delete;
+	TextureObjectBase &operator=(const TextureObjectBase &)=delete;
+
+	TextureObjectBase(TextureObjectBase &&);
+	TextureObjectBase &operator=(TextureObjectBase &&);
+
+	inline size_t pitch() const { return pitch_; }
+	inline size_t pixelPitch() const { return pitch2_; }
+	inline uchar *devicePtr() const { return ptr_; };
+	__host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; }
+	__host__ __device__ inline int width() const { return width_; }
+	__host__ __device__ inline int height() const { return height_; }
+	__host__ __device__ inline cudaTextureObject_t cudaTexture() const { return texobj_; }
+
+	void upload(const cv::Mat &, cudaStream_t stream=0);
+	void download(cv::Mat &, cudaStream_t stream=0) const;
+	
+	__host__ void free();
+
+	inline int cvType() const { return cvType_; }
+	
+	protected:
+	cudaTextureObject_t texobj_;
+	size_t pitch_;
+	size_t pitch2_; 		// in T units
+	int width_;
+	int height_;
+	uchar *ptr_;			// Device memory pointer
+	bool needsfree_;		// We manage memory, so free it
+	bool needsdestroy_;		// The texture object needs to be destroyed
+	int cvType_;				// Used to validate casting
+};
+
+/**
+ * Create and manage CUDA texture objects with a particular pixel data type.
+ * Note: it is not possible to create texture objects for certain types,
+ * specificially for 3 channel types.
+ */
+template <typename T>
+class TextureObject : public TextureObjectBase {
+	public:
+	typedef T type;
+
+	static_assert((16u % sizeof(T)) == 0, "Channel format must be aligned with 16 bytes");
+
+	__host__ __device__ TextureObject() : TextureObjectBase() {};
+	explicit TextureObject(const cv::cuda::GpuMat &d, bool interpolated=false);
+	explicit TextureObject(const cv::cuda::PtrStepSz<T> &d);
+	TextureObject(T *ptr, int pitch, int width, int height);
+	TextureObject(size_t width, size_t height);
+	TextureObject(const TextureObject<T> &t);
+	__host__ __device__ TextureObject(TextureObject<T> &&);
+	~TextureObject();
+
+	TextureObject<T> &operator=(const TextureObject<T> &);
+	__host__ __device__ TextureObject<T> &operator=(TextureObject<T> &&);
+
+	operator cv::cuda::GpuMat();
+
+	void create(const cv::Size &);
+	void create(int w, int h);
+
+	__host__ __device__ T *devicePtr() const { return (T*)(ptr_); };
+	__host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; }
+
+	#ifdef __CUDACC__
+	__device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); }
+	__device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); }
+	__device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; }
+	#endif
+
+	__host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+	__host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
+
+	/**
+	 * Cast a base texture object to this type of texture object. If the
+	 * underlying pixel types do not match then a bad_cast exception is thrown.
+	 */
+	static TextureObject<T> &cast(TextureObjectBase &);
+};
+
+#ifndef __CUDACC__
+template <typename T>
+TextureObject<T> &TextureObject<T>::cast(TextureObjectBase &b) {
+	if (b.cvType() != ftl::traits::OpenCVType<T>::value) {
+		//LOG(ERROR) << "Bad cast of texture object";
+		throw std::bad_cast();
+	}
+	return reinterpret_cast<TextureObject<T>&>(b);
+}
+
+/**
+ * Create a 2D array texture from an OpenCV GpuMat object.
+ */
+template <typename T>
+TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) {
+	// GpuMat must have correct data type
+	//CHECK(d.type() == ftl::traits::OpenCVType<T>::value);
+
+	cudaResourceDesc resDesc;
+	memset(&resDesc, 0, sizeof(resDesc));
+	resDesc.resType = cudaResourceTypePitch2D;
+	resDesc.res.pitch2D.devPtr = d.data;
+	resDesc.res.pitch2D.pitchInBytes = d.step;
+	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+	resDesc.res.pitch2D.width = d.cols;
+	resDesc.res.pitch2D.height = d.rows;
+
+	cudaTextureDesc texDesc;
+	// cppcheck-suppress memsetClassFloat
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.readMode = (interpolated) ? cudaReadModeNormalizedFloat : cudaReadModeElementType;
+	if (interpolated) texDesc.filterMode = cudaFilterModeLinear;
+
+	cudaTextureObject_t tex = 0;
+	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+	texobj_ = tex;
+	pitch_ = d.step;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = d.data;
+	width_ = d.cols;
+	height_ = d.rows;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsdestroy_ = true;
+}
+
+#endif  // __CUDACC__
+
+/**
+ * Create a 2D array texture from an OpenCV GpuMat object.
+ */
+template <typename T>
+TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
+	cudaResourceDesc resDesc;
+	memset(&resDesc, 0, sizeof(resDesc));
+	resDesc.resType = cudaResourceTypePitch2D;
+	resDesc.res.pitch2D.devPtr = d.data;
+	resDesc.res.pitch2D.pitchInBytes = d.step;
+	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+	resDesc.res.pitch2D.width = d.cols;
+	resDesc.res.pitch2D.height = d.rows;
+
+	cudaTextureDesc texDesc;
+	// cppcheck-suppress memsetClassFloat
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.readMode = cudaReadModeElementType;
+	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
+
+	cudaTextureObject_t tex = 0;
+	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+	texobj_ = tex;
+	pitch_ = d.step;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = d.data;
+	width_ = d.cols;
+	height_ = d.rows;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsdestroy_ = true;
+}
+
+/**
+ * Create a 2D array texture object using a cudaMallocPitch device pointer.
+ * The texture object returned must be destroyed by the caller.
+ */
+template <typename T>
+TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
+	cudaResourceDesc resDesc;
+	memset(&resDesc, 0, sizeof(resDesc));
+	resDesc.resType = cudaResourceTypePitch2D;
+	resDesc.res.pitch2D.devPtr = ptr;
+	resDesc.res.pitch2D.pitchInBytes = pitch;
+	resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+	resDesc.res.pitch2D.width = width;
+	resDesc.res.pitch2D.height = height;
+
+	cudaTextureDesc texDesc;
+	// cppcheck-suppress memsetClassFloat
+	memset(&texDesc, 0, sizeof(texDesc));
+	texDesc.readMode = cudaReadModeElementType;
+	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
+
+	cudaTextureObject_t tex = 0;
+	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+	texobj_ = tex;
+	pitch_ = pitch;
+	pitch2_ = pitch_ / sizeof(T);
+	ptr_ = ptr;
+	width_ = width;
+	height_ = height;
+	needsfree_ = false;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsdestroy_ = true;
+}
+
+template <typename T>
+TextureObject<T>::TextureObject(size_t width, size_t height) {
+	cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height);
+	cudaTextureObject_t tex = 0;
+
+	// Must be an even
+	//if (!(sizeof(T) & 0x1)) {
+		cudaResourceDesc resDesc;
+		memset(&resDesc, 0, sizeof(resDesc));
+		resDesc.resType = cudaResourceTypePitch2D;
+		resDesc.res.pitch2D.devPtr = ptr_;
+		resDesc.res.pitch2D.pitchInBytes = pitch_;
+		resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+		resDesc.res.pitch2D.width = width;
+		resDesc.res.pitch2D.height = height;
+
+		cudaTextureDesc texDesc;
+		// cppcheck-suppress memsetClassFloat
+		memset(&texDesc, 0, sizeof(texDesc));
+		texDesc.readMode = cudaReadModeElementType;
+		//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
+		cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+	//}
+
+	texobj_ = tex;
+	width_ = (int)width;
+	height_ = (int)height;
+	needsfree_ = true;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsdestroy_ = true;
+}
+
+#ifndef __CUDACC__
+template <typename T>
+void TextureObject<T>::create(const cv::Size &s) {
+	create(s.width, s.height);
+}
+
+template <typename T>
+void TextureObject<T>::create(int w, int h) {
+	if (width_ != w || height_ != h) {
+		*this = std::move(TextureObject<T>(w, h));
+	}
+}
+#endif
+
+template <typename T>
+TextureObject<T>::TextureObject(const TextureObject<T> &p) {
+	texobj_ = p.texobj_;
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsfree_ = false;
+	needsdestroy_ = false;
+}
+
+template <typename T>
+TextureObject<T>::TextureObject(TextureObject<T> &&p) {
+	texobj_ = p.texobj_;
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	needsfree_ = p.needsfree_;
+	needsdestroy_ = p.needsdestroy_;
+	p.texobj_ = 0;
+	p.needsfree_ = false;
+	p.needsdestroy_ = false;
+	p.ptr_ = nullptr;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+}
+
+template <typename T>
+TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) {
+	free();
+	texobj_ = p.texobj_;
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	needsfree_ = false;
+	needsdestroy_ = false;
+	return *this;
+}
+
+template <typename T>
+TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) {
+	free();
+	texobj_ = p.texobj_;
+	ptr_ = p.ptr_;
+	width_ = p.width_;
+	height_ = p.height_;
+	pitch_ = p.pitch_;
+	pitch2_ = pitch_ / sizeof(T);
+	needsfree_ = p.needsfree_;
+	needsdestroy_ = p.needsdestroy_;
+	p.texobj_ = 0;
+	p.needsfree_ = false;
+	p.needsdestroy_ = false;
+	p.ptr_ = nullptr;
+	cvType_ = ftl::traits::OpenCVType<T>::value;
+	return *this;
+}
+
+template <typename T>
+TextureObject<T>::~TextureObject() {
+	//if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
+	//if (needsfree_) cudaFree(ptr_);
+	free();
+}
+
+}
+}
+
+#endif
diff --git a/components/common/cpp/include/ftl/traits.hpp b/components/common/cpp/include/ftl/traits.hpp
index 71668359d1b0bfa036c29fb9f6d4f73ee0d2d5a7..8abf07fc01c06e2962721aff95d2a656c136050e 100644
--- a/components/common/cpp/include/ftl/traits.hpp
+++ b/components/common/cpp/include/ftl/traits.hpp
@@ -3,6 +3,7 @@
 
 #include <opencv2/core.hpp>
 #include <ftl/cuda_util.hpp>
+#include <ftl/cuda_half.hpp>
 
 namespace ftl {
 namespace traits {
@@ -37,6 +38,7 @@ template <> struct OpenCVType<float> { static constexpr int value = CV_32FC1; };
 template <> struct OpenCVType<float2> { static constexpr int value = CV_32FC2; };
 template <> struct OpenCVType<float3> { static constexpr int value = CV_32FC3; };
 template <> struct OpenCVType<float4> { static constexpr int value = CV_32FC4; };
+template <> struct OpenCVType<half4> { static constexpr int value = CV_16FC4; };
 
 }
 }
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
index 1fb7b68e5e0a9c3d07250d40dc65c42e4d2a2de5..779b9feb4b5fdecc81f25e30f81a423b1833bb38 100644
--- a/components/common/cpp/src/cuda_common.cpp
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -3,6 +3,7 @@
 #include <ftl/cuda_common.hpp>
 
 using ftl::cuda::TextureObjectBase;
+using ftl::cuda::BufferBase;
 
 static int dev_to_use = 0;
 static int dev_count = 0;
@@ -116,58 +117,54 @@ void TextureObjectBase::download(cv::Mat &m, cudaStream_t stream) const {
 	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * m.elemSize(), m.rows, cudaMemcpyDeviceToHost, stream));
 }
 
-/*template <>
-void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream) {
-    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar4), m.rows, cudaMemcpyHostToDevice, stream));
-}
 
-template <>
-void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream) {
-    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float), m.rows, cudaMemcpyHostToDevice, stream));
-}
 
-template <>
-void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream) {
-    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float2), m.rows, cudaMemcpyHostToDevice, stream));
+BufferBase::~BufferBase() {
+	free();
 }
 
-template <>
-void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream) {
-    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float4), m.rows, cudaMemcpyHostToDevice, stream));
-}
+BufferBase::BufferBase(BufferBase &&o) {
+	needsfree_ = o.needsfree_;
+	ptr_ = o.ptr_;
+	cvType_ = o.cvType_;
+	width_ = o.width_;
+	height_ = o.height_;
+	pitch_ = o.pitch_;
+	pitch2_ = o.pitch2_;
 
-template <>
-void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream) {
-    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar), m.rows, cudaMemcpyHostToDevice, stream));
+	o.ptr_ = nullptr;
+	o.needsfree_ = false;
 }
 
+BufferBase &BufferBase::operator=(BufferBase &&o) {
+	free();
 
-template <>
-void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const {
-	m.create(height(), width(), CV_8UC4);
-	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar4), m.rows, cudaMemcpyDeviceToHost, stream));
-}
+	needsfree_ = o.needsfree_;
+	ptr_ = o.ptr_;
+	cvType_ = o.cvType_;
+	width_ = o.width_;
+	height_ = o.height_;
+	pitch_ = o.pitch_;
+	pitch2_ = o.pitch2_;
 
-template <>
-void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const {
-	m.create(height(), width(), CV_32FC1);
-	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float), m.rows, cudaMemcpyDeviceToHost, stream));
+	o.ptr_ = nullptr;
+	o.needsfree_ = false;
+	return *this;
 }
 
-template <>
-void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const {
-	m.create(height(), width(), CV_32FC2);
-	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float2), m.rows, cudaMemcpyDeviceToHost, stream));
+void BufferBase::free() {
+	if (needsfree_) {
+		if (ptr_) cudaFree(ptr_);
+		ptr_ = nullptr;
+		cvType_ = -1;
+	}
 }
 
-template <>
-void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const {
-	m.create(height(), width(), CV_32FC4);
-	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float4), m.rows, cudaMemcpyDeviceToHost, stream));
+void BufferBase::upload(const cv::Mat &m, cudaStream_t stream) {
+    cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * m.elemSize(), m.rows, cudaMemcpyHostToDevice, stream));
 }
 
-template <>
-void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const {
-	m.create(height(), width(), CV_8UC1);
-	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar), m.rows, cudaMemcpyDeviceToHost, stream));
-}*/
+void BufferBase::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), cvType_);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * m.elemSize(), m.rows, cudaMemcpyDeviceToHost, stream));
+}
\ No newline at end of file
diff --git a/components/operators/include/ftl/operators/mvmls.hpp b/components/operators/include/ftl/operators/mvmls.hpp
index 1c250e98574d1bac9182aa950ec7fbda594c5b2e..824d88e3d8888699f931eb018b15a4741aa00db3 100644
--- a/components/operators/include/ftl/operators/mvmls.hpp
+++ b/components/operators/include/ftl/operators/mvmls.hpp
@@ -18,7 +18,7 @@ class MultiViewMLS : public ftl::operators::Operator {
 	private:
 	std::vector<ftl::cuda::TextureObject<float4>*> centroid_horiz_;
 	std::vector<ftl::cuda::TextureObject<float4>*> centroid_vert_;
-	std::vector<ftl::cuda::TextureObject<float4>*> normals_horiz_;
+	std::vector<ftl::cuda::TextureObject<half4>*> normals_horiz_;
     std::vector<ftl::cuda::TextureObject<float>*> contributions_;
 };
 
diff --git a/components/operators/include/ftl/operators/normals.hpp b/components/operators/include/ftl/operators/normals.hpp
index 494a0d10d5ab63ed47902b0b7311a71cd01d4a87..a5faaa17645612bce3ab9601638f92058fd4ba57 100644
--- a/components/operators/include/ftl/operators/normals.hpp
+++ b/components/operators/include/ftl/operators/normals.hpp
@@ -52,7 +52,7 @@ class SmoothNormals : public ftl::operators::Operator {
     bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override;
 
 	private:
-	ftl::cuda::TextureObject<float4> temp_;
+	ftl::cuda::TextureObject<half4> temp_;
 
 };
 
diff --git a/components/operators/include/ftl/operators/smoothing.hpp b/components/operators/include/ftl/operators/smoothing.hpp
index bb99865954f4553fab1b7e6c862bd4828dcd5b35..0dc463d2eb4c22563e1e83d032f3d56474d095cb 100644
--- a/components/operators/include/ftl/operators/smoothing.hpp
+++ b/components/operators/include/ftl/operators/smoothing.hpp
@@ -123,7 +123,7 @@ class AggreMLS : public ftl::operators::Operator {
 	private:
 	ftl::cuda::TextureObject<float4> centroid_horiz_;
 	ftl::cuda::TextureObject<float4> centroid_vert_;
-	ftl::cuda::TextureObject<float4> normals_horiz_;
+	ftl::cuda::TextureObject<half4> normals_horiz_;
 
 	ftl::rgbd::Frame temp_;
 };
diff --git a/components/operators/src/correspondence.cu b/components/operators/src/correspondence.cu
index edc333c759221e43247af69748e918aa24f0657a..0b5ea46075627051b86ac25e1617258be22335a9 100644
--- a/components/operators/src/correspondence.cu
+++ b/components/operators/src/correspondence.cu
@@ -346,8 +346,8 @@ __device__ inline int unpackCameraID(short2 p) {
  * Identify which source has the best support region for a given pixel.
  */
 __global__ void best_sources_kernel(
-        TextureObject<float4> normals1,
-        TextureObject<float4> normals2,
+        TextureObject<half4> normals1,
+        TextureObject<half4> normals2,
         TextureObject<uchar4> support1,
         TextureObject<uchar4> support2,
         TextureObject<float> depth1,
@@ -414,8 +414,8 @@ __global__ void best_sources_kernel(
 }
 
 void ftl::cuda::best_sources(
-        ftl::cuda::TextureObject<float4> &normals1,
-        ftl::cuda::TextureObject<float4> &normals2,
+        ftl::cuda::TextureObject<half4> &normals1,
+        ftl::cuda::TextureObject<half4> &normals2,
         ftl::cuda::TextureObject<uchar4> &support1,
         ftl::cuda::TextureObject<uchar4> &support2,
         ftl::cuda::TextureObject<float> &depth1,
@@ -439,8 +439,8 @@ void ftl::cuda::best_sources(
  * Identify which source has the best support region for a given pixel.
  */
  __global__ void aggregate_sources_kernel(
-		TextureObject<float4> n1,
-		TextureObject<float4> n2,
+		TextureObject<half4> n1,
+		TextureObject<half4> n2,
 		TextureObject<float4> c1,
 		TextureObject<float4> c2,
 		TextureObject<float> depth1,
@@ -475,7 +475,7 @@ void ftl::cuda::best_sources(
 				if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f && length(cent2-cent1) < 0.04f) {
 					norm1 += norm2;
 					norm1 /= 2.0f;
-					n1(x,y) = make_float4(norm1, 0.0f);
+					n1(x,y) = make_half4(norm1, 0.0f);
 					cent1 += cent2;
 					cent1 /= 2.0f;
 					c1(x,y) = make_float4(cent1, 0.0f);
@@ -490,8 +490,8 @@ void ftl::cuda::best_sources(
 }
 
 void ftl::cuda::aggregate_sources(
-		ftl::cuda::TextureObject<float4> &n1,
-		ftl::cuda::TextureObject<float4> &n2,
+		ftl::cuda::TextureObject<half4> &n1,
+		ftl::cuda::TextureObject<half4> &n2,
 		ftl::cuda::TextureObject<float4> &c1,
 		ftl::cuda::TextureObject<float4> &c2,
 		ftl::cuda::TextureObject<float> &depth1,
@@ -608,7 +608,7 @@ void ftl::cuda::vis_best_sources(
 // ==== Normalise aggregations =================================================
 
 __global__ void normalise_aggregations_kernel(
-        TextureObject<float4> norms,
+        TextureObject<half4> norms,
         TextureObject<float4> cents,
         TextureObject<float> contribs) {
 
@@ -618,20 +618,20 @@ __global__ void normalise_aggregations_kernel(
     if (x < norms.width() && y < norms.height()) {
         const float contrib = contribs.tex2D((int)x,(int)y);
 
-        const auto a = norms.tex2D((int)x,(int)y);
+        const auto a = make_float3(norms.tex2D((int)x,(int)y));
         const auto b = cents.tex2D(x,y);
         //const float4 normal = normals.tex2D((int)x,(int)y);
 
 		//out(x,y) = (contrib == 0.0f) ? make<B>(a) : make<B>(a / contrib);
 
         if (contrib > 0.0f) {
-            norms(x,y) = a / (contrib+1.0f);
+            norms(x,y) = make_half4(a / (contrib+1.0f), 1.0f);
             cents(x,y) = b / (contrib+1.0f);
         }
     }
 }
 
-void ftl::cuda::normalise_aggregations(TextureObject<float4> &norms, TextureObject<float4> &cents, TextureObject<float> &contribs, cudaStream_t stream) {
+void ftl::cuda::normalise_aggregations(TextureObject<half4> &norms, TextureObject<float4> &cents, TextureObject<float> &contribs, cudaStream_t stream) {
     const dim3 gridSize((norms.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norms.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
     const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
diff --git a/components/operators/src/mls.cu b/components/operators/src/mls.cu
index 5d25fc6c0e8648318a242ff08fcec26648086f07..afe15d5592bc396da1abc5703c9f8c0a96b00d49 100644
--- a/components/operators/src/mls.cu
+++ b/components/operators/src/mls.cu
@@ -14,8 +14,8 @@ using ftl::cuda::TextureObject;
  */
  template <int SEARCH_RADIUS>
  __global__ void mls_smooth_kernel(
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
+		TextureObject<half4> normals_in,
+		TextureObject<half4> normals_out,
         TextureObject<float> depth_in,        // Virtual depth map
 		TextureObject<float> depth_out,   // Accumulated output
 		float smoothing,
@@ -69,12 +69,12 @@ using ftl::cuda::TextureObject;
     //    depth_out(screen.x,screen.y) = X.z;
 	//}
 	depth_out(x,y) = X.z;
-	normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+	normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
 }
 
 void ftl::cuda::mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		float smoothing,
@@ -112,8 +112,8 @@ void ftl::cuda::mls_smooth(
  */
  template <int SEARCH_RADIUS>
  __global__ void colour_mls_smooth_kernel(
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
+		TextureObject<half4> normals_in,
+		TextureObject<half4> normals_out,
         TextureObject<float> depth_in,        // Virtual depth map
 		TextureObject<float> depth_out,   // Accumulated output
 		TextureObject<uchar4> colour_in,
@@ -179,12 +179,12 @@ void ftl::cuda::mls_smooth(
     //    depth_out(screen.x,screen.y) = X.z;
 	//}
 	depth_out(x,y) = X.z;
-	normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+	normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
 }
 
 void ftl::cuda::colour_mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -233,8 +233,8 @@ __device__ inline int segmentID(int u, int v) {
  template <bool FILLING, int RADIUS>
  __global__ void colour_mls_smooth_csr_kernel(
 	 	TextureObject<uchar4> region,
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
+		TextureObject<half4> normals_in,
+		TextureObject<half4> normals_out,
         TextureObject<float> depth_in,        // Virtual depth map
 		TextureObject<float> depth_out,   // Accumulated output
 		TextureObject<uchar4> colour_in,
@@ -325,14 +325,14 @@ __device__ inline int segmentID(int u, int v) {
 		//    depth_out(screen.x,screen.y) = X.z;
 		//}
 		depth_out(x,y) = X.z;
-		normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+		normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
 	}
 }
 
 void ftl::cuda::colour_mls_smooth_csr(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -362,6 +362,24 @@ void ftl::cuda::colour_mls_smooth_csr(
 
 // ===== Cross Aggregate MLS ===================================================
 
+__device__ inline float colourAsFloat(uchar4 c) {
+	union {
+		uchar4 col;
+		float f;
+	};
+	col = c;
+	return f;
+}
+
+__device__ inline uchar4 floatAsColour(float pf) {
+	union {
+		uchar4 col;
+		float f;
+	};
+	f = pf;
+	return col;
+}
+
 /*
  * Smooth depth map using Moving Least Squares. This version uses colour
  * similarity weights to adjust the spatial smoothing factor. It also uses
@@ -371,12 +389,16 @@ void ftl::cuda::colour_mls_smooth_csr(
  */
  template <int RADIUS>
  __global__ void mls_aggr_horiz_kernel(
-	 	TextureObject<uchar4> region,
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
-        TextureObject<float> depth_in,        // Virtual depth map
+		const uchar4* __restrict__ region,
+		size_t region_pitch, 
+		const half4* __restrict__ normals_in,
+		size_t normals_in_pitch,
+		TextureObject<half4> normals_out,
+		const float* __restrict__ depth_in,        // Virtual depth map
+		size_t depth_in_pitch,
 		TextureObject<float4> centroid_out,   // Accumulated output
-		TextureObject<uchar4> colour_in,
+		const uchar4* __restrict__ colour_in,
+		size_t colour_in_pitch,
 		float smoothing,
 		float colour_smoothing,
         ftl::rgbd::Camera camera) {
@@ -384,59 +406,65 @@ void ftl::cuda::colour_mls_smooth_csr(
     const int x = blockIdx.x*blockDim.x + threadIdx.x;
     const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
+    if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
 
-	float3 aX = make_float3(0.0f,0.0f,0.0f);
-	float3 nX = make_float3(0.0f,0.0f,0.0f);
-    float contrib = 0.0f;
+		float3 aX = make_float3(0.0f,0.0f,0.0f);
+		float3 nX = make_float3(0.0f,0.0f,0.0f);
+		float contrib = 0.0f;
 
-	float d0 = depth_in.tex2D(x, y);
+		const float d0 = depth_in[y*depth_in_pitch+x];
+		const uchar4 c0 = colour_in[y*colour_in_pitch+x];
 
-	// Note: x and y flipped as output is rotated.
-	centroid_out(y,x) = make_float4(0.0f);
-	normals_out(y,x) = normals_in(x,y);
+		// Note: x and y flipped as output is rotated.
+		centroid_out(y,x) = make_float4(0.0f,0.0f,0.0f,colourAsFloat(c0));
+		normals_out(y,x) = normals_in[y*normals_in_pitch+x];
 
-	if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return;
-	
-	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
-	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
+		if (d0 > camera.minDepth && d0 < camera.maxDepth) {
+		
+			float3 X = camera.screenToCam((int)(x),(int)(y),d0);
 
-    // Cross-Support Neighbourhood
-	uchar4 base = region.tex2D(x,y);
+			// Cross-Support Neighbourhood
+			const uchar4 base = region[y*region_pitch+x];
 
-	#pragma unroll
-	for (int u=-RADIUS; u<=RADIUS; ++u) {
-		const float d = depth_in.tex2D(x+u, y);
+			const half4* __restrict__ nptr = normals_in + y*normals_in_pitch + x - RADIUS;
+			const float* __restrict__ dptr = depth_in + y*depth_in_pitch + x - RADIUS;
+			const uchar4* __restrict__ cptr = colour_in + y*colour_in_pitch + x - RADIUS;
 
-		// If outside of cross support range, set weight to 0 to ignore
-		float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f;
+			#pragma unroll
+			for (int u=-RADIUS; u<=RADIUS; ++u) {
+				const float d = *(dptr++);
 
-		// Point and normal of neighbour
-		const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y),d);
-		const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)));
+				// If outside of cross support range, set weight to 0 to ignore
+				float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f;
 
-		// Bad or missing normals should be ignored
-		if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
+				// Point and normal of neighbour
+				const float3 Xi = camera.screenToCam(x+u,y,d);
+				const float3 Ni = make_float3(*(nptr++));
 
-		// Gauss approx colour weighting.
-		const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y) + 0.5f);
-		w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
+				// Bad or missing normals should be ignored
+				if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
 
-		// Gauss approx weighting function using point distance
-		w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w);
+				// Gauss approx colour weighting.
+				const uchar4 c = *(cptr++);
+				w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
 
-		aX += Xi*w;
-		nX += Ni*w;
-		contrib += w;
-	}
+				// Gauss approx weighting function using point distance
+				w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w);
 
-	if (contrib > 0.0f) {
-		nX /= contrib;  // Weighted average normal
-		aX /= contrib;  // Weighted average point (centroid)
+				aX += Xi*w;
+				nX += Ni*w;
+				contrib += w;
+			}
 
-		// Note: x and y flipped since output is rotated 90 degrees.
-		centroid_out(y,x) = make_float4(aX, 0.0f);
-		normals_out(y,x) = make_float4(nX / length(nX), 0.0f);
+			if (contrib > 0.0f) {
+				nX /= contrib;  // Weighted average normal
+				aX /= contrib;  // Weighted average point (centroid)
+
+				// Note: x and y flipped since output is rotated 90 degrees.
+				centroid_out(y,x) = make_float4(aX, colourAsFloat(c0));
+				normals_out(y,x) = make_half4(nX / length(nX), 0.0f);
+			}
+		}
 	}
 }
 
@@ -446,13 +474,14 @@ void ftl::cuda::colour_mls_smooth_csr(
  */
 template <int RADIUS>
  __global__ void mls_aggr_vert_kernel(
-	 	TextureObject<uchar4> region,
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
-        TextureObject<float4> centroid_in,        // Virtual depth map
+		const uchar4* __restrict__ region,
+		size_t region_pitch, 
+		const half4* __restrict__ normals_in,
+		size_t normals_in_pitch,
+		TextureObject<half4> normals_out,
+		const float4* __restrict__ centroid_in,        // Virtual depth map
+		size_t centroid_in_pitch,
 		TextureObject<float4> centroid_out,   // Accumulated output
-		TextureObject<uchar4> colour_in,
-		TextureObject<float> depth_in,
 		float smoothing,
 		float colour_smoothing,
         ftl::rgbd::Camera camera) {
@@ -460,60 +489,65 @@ template <int RADIUS>
     const int x = blockIdx.x*blockDim.x + threadIdx.x;
     const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
-
-	float3 aX = make_float3(0.0f,0.0f,0.0f);
-	float3 nX = make_float3(0.0f,0.0f,0.0f);
-	float contrib = 0.0f;
-	
-	float d0 = depth_in.tex2D(x, y);
-	if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return;
-	
-	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
-
-	centroid_out(x,y) = make_float4(0.0f);
-	normals_out(x,y) = make_float4(0.0f);
-	
-	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
+    if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
+		float3 aX = make_float3(0.0f,0.0f,0.0f);
+		float3 nX = make_float3(0.0f,0.0f,0.0f);
+		float contrib = 0.0f;
+		
+		const float4 cin = centroid_in[x*centroid_in_pitch+y];
+		const float3 A = make_float3(cin);
+		const uchar4 c0 = floatAsColour(cin.w);
 
-    // Cross-Support Neighbourhood
-	uchar4 base = region.tex2D(x,y);
+		centroid_out(x,y) = make_float4(0.0f);
+		normals_out(x,y) = make_half4(0.0f);
+		
+		if (A.z > camera.minDepth && A.z < camera.maxDepth) {
+			//float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+			
+			//const uchar4 c0 = colour_in[y*colour_in_pitch+x];
 
-	#pragma unroll
-	for (int v=-RADIUS; v<=RADIUS; ++v) {
-		const float d = depth_in.tex2D(x, y+v);
-		const float3 Xi = camera.screenToCam(x,y+v,d);
+			// Cross-Support Neighbourhood
+			const uchar4 base = region[y*region_pitch+x];
 
-		// Note: x and y flipped, input image is rotated.
-		float3 Ai = make_float3(centroid_in.tex2D(y+v, x));
+			const half4* __restrict__ nptr = normals_in + x*normals_in_pitch + y - RADIUS;
+			const float4* __restrict__ ceptr = (centroid_in + x*centroid_in_pitch + y - RADIUS);
 
-		// If outside the cross support range, set weight to 0 to ignore
-		float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f;
+			#pragma unroll
+			for (int v=-RADIUS; v<=RADIUS; ++v) {
+				// Note: x and y flipped, input image is rotated.
+				const float4 cin = *(ceptr++);
+				const float3 Ai = make_float3(cin);
+				const uchar4 c = floatAsColour(cin.w);
 
-		// Note: x and y flipped, input image is rotated.
-		const float3 Ni = make_float3(normals_in.tex2D(y+v, x));
+				// If outside the cross support range, set weight to 0 to ignore
+				float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f;
 
-		// Bad normals should be ignored.
-		if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
+				// Note: x and y flipped, input image is rotated.
+				const float3 Ni = make_float3(*(nptr++));
 
-		// Gauss approx colour weighting.
-		const float4 c = colour_in.tex2D(float(x) + 0.5f, float(y+v) + 0.5f);
-		w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
+				// Bad normals should be ignored.
+				if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
 
-		// Gauss approx weighting function using point distance
-		w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w);
+				// Gauss approx colour weighting.
+				//const uchar4 c = *(cptr+v*colour_in_pitch);
+				w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
 
-		aX += Ai*w;
-		nX += Ni*w;
-		contrib += w;
-	}
+				// Gauss approx weighting function using point distance
+				w = ftl::cuda::spatialWeighting(A,Ai,A.z*smoothing*w);
 
-	// Normalise the summed points and normals
-	if (contrib > 0.0f) {
-		nX /= contrib;  // Weighted average normal
-		aX /= contrib;  // Weighted average point (centroid)
-		centroid_out(x,y) = make_float4(aX, 0.0f);
-		normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+				aX += Ai*w;
+				nX += Ni*w;
+				contrib += w;
+			}
+
+			// Normalise the summed points and normals
+			if (contrib > 0.0f) {
+				nX /= contrib;  // Weighted average normal
+				aX /= contrib;  // Weighted average point (centroid)
+				centroid_out(x,y) = make_float4(aX, 0.0f);
+				normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
+			}
+		}
 	}
 }
 
@@ -522,41 +556,40 @@ template <int RADIUS>
  * field and move the depth value accordingly using the calculated normal.
  */
 __global__ void mls_adjust_depth_kernel(
-		TextureObject<float4> normals_in,
+		TextureObject<half4> normals_in,
 		TextureObject<float4> centroid_in,        // Virtual depth map
-		TextureObject<float> depth_in,
 		TextureObject<float> depth_out,
 		ftl::rgbd::Camera camera) {
 	
 	const int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-	if (x < 0 || y < 0 || x >= depth_out.width() || y >= depth_out.height()) return;
+	if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) {
+		const float3 aX = make_float3(centroid_in(x,y));
+		const float3 nX = make_float3(normals_in(x,y));
 
-	float3 aX = make_float3(centroid_in(x,y));
-	float3 nX = make_float3(normals_in(x,y));
+		//float d0 = depth_in.tex2D(x, y);
+		depth_out(x,y) = aX.z;
 
-	float d0 = depth_in.tex2D(x, y);
-	depth_out(x,y) = d0;
-
-	if (d0 > camera.minDepth && d0 < camera.maxDepth && aX.z > camera.minDepth && aX.z < camera.maxDepth) {
-		float3 X = camera.screenToCam((int)(x),(int)(y),d0);
+		if (aX.z > camera.minDepth && aX.z < camera.maxDepth) {
+			float3 X = camera.screenToCam((int)(x),(int)(y),aX.z);
 
-		// Signed-Distance Field function
-		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
+			// Signed-Distance Field function
+			float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
 
-		// Calculate new point using SDF function to adjust depth (and position)
-		X = X - nX * fX;
-		
-		depth_out(x,y) = X.z;
+			// Calculate new point using SDF function to adjust depth (and position)
+			X = X - nX * fX;
+			
+			depth_out(x,y) = X.z;
+		}
 	}
 }
 
 
 void ftl::cuda::mls_aggr_horiz(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float4> &centroid_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -566,17 +599,20 @@ void ftl::cuda::mls_aggr_horiz(
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream) {
 
-	const dim3 gridSize((normals_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	static constexpr int THREADS_X = 16;
+	static constexpr int THREADS_Y = 16;
+
+	const dim3 gridSize((normals_in.width() + THREADS_X - 1)/THREADS_X, (normals_in.height() + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
 
 	switch(radius) {
-	case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
-	case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break;
+	case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
+	case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
 	default: return;
 	}
 	cudaSafeCall( cudaGetLastError() );
@@ -589,29 +625,30 @@ void ftl::cuda::mls_aggr_horiz(
 
 void ftl::cuda::mls_aggr_vert(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float4> &centroid_in,
 		ftl::cuda::TextureObject<float4> &centroid_out,
-		ftl::cuda::TextureObject<uchar4> &colour_in,
-		ftl::cuda::TextureObject<float> &depth_in,
 		float smoothing,
 		float colour_smoothing,
 		int radius,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream) {
 
-	const dim3 gridSize((normals_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	static constexpr int THREADS_X = 4;
+	static constexpr int THREADS_Y = 32;
+
+	const dim3 gridSize((normals_out.width() + THREADS_X - 1)/THREADS_X, (normals_out.height() + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
 
 	switch(radius) {
-	case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
-	case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break;
+	case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
+	case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
 	default: return;
 	}
 	cudaSafeCall( cudaGetLastError() );
@@ -623,17 +660,19 @@ void ftl::cuda::mls_aggr_vert(
 }
 
 void ftl::cuda::mls_adjust_depth(
-		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_in,
 		ftl::cuda::TextureObject<float4> &centroid_in,
-		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream) {
 
-	const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	static constexpr int THREADS_X = 32;
+	static constexpr int THREADS_Y = 4;
+
+	const dim3 gridSize((depth_out.width() + THREADS_X - 1)/THREADS_X, (depth_out.height() + THREADS_Y - 1)/THREADS_Y);
+	const dim3 blockSize(THREADS_X, THREADS_Y);
 
-	mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_in, depth_out, camera);
+	mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_out, camera);
 	cudaSafeCall( cudaGetLastError() );
 
 
@@ -654,8 +693,8 @@ void ftl::cuda::mls_adjust_depth(
  */
  template <int SEARCH_RADIUS>
  __global__ void adaptive_mls_smooth_kernel(
-		TextureObject<float4> normals_in,
-		TextureObject<float4> normals_out,
+		TextureObject<half4> normals_in,
+		TextureObject<half4> normals_out,
         TextureObject<float> depth_in,        // Virtual depth map
 		TextureObject<float> depth_out,   // Accumulated output
 		TextureObject<float> smoothing,
@@ -717,12 +756,12 @@ void ftl::cuda::mls_adjust_depth(
     //    depth_out(screen.x,screen.y) = X.z;
 	//}
 	depth_out(x,y) = X.z;
-	normals_out(x,y) = make_float4(nX / length(nX), 0.0f);
+	normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
 }
 
 void ftl::cuda::adaptive_mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<float> &smoothing,
diff --git a/components/operators/src/mvmls.cpp b/components/operators/src/mvmls.cpp
index ab150ed459862a453f5dce98660885a65b3e0f0f..8a74b7a95f92103e55d5bce371ae38886353c1f2 100644
--- a/components/operators/src/mvmls.cpp
+++ b/components/operators/src/mvmls.cpp
@@ -48,7 +48,7 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 
     // Make sure we have enough buffers
     while (normals_horiz_.size() < in.frames.size()) {
-        normals_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width));
+        normals_horiz_.push_back(new ftl::cuda::TextureObject<half4>(size.height, size.width));
         centroid_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width));
         centroid_vert_.push_back(new ftl::cuda::TextureObject<float4>(size.width, size.height));
         contributions_.push_back(new ftl::cuda::TextureObject<float>(size.width, size.height));
@@ -221,7 +221,7 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 
             ftl::cuda::mls_aggr_horiz(
                 f.createTexture<uchar4>(Channel::Support2),
-                f.createTexture<float4>(Channel::Normals),
+                f.createTexture<half4>(Channel::Normals),
                 *normals_horiz_[i],
                 f.createTexture<float>(Channel::Depth),
                 *centroid_horiz_[i],
@@ -236,11 +236,9 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
             ftl::cuda::mls_aggr_vert(
                 f.getTexture<uchar4>(Channel::Support2),
                 *normals_horiz_[i],
-                f.getTexture<float4>(Channel::Normals),
+                f.getTexture<half4>(Channel::Normals),
                 *centroid_horiz_[i],
                 *centroid_vert_[i],
-                f.getTexture<uchar4>(Channel::Colour),
-                f.getTexture<float>(Channel::Depth),
                 thresh,
                 col_smooth,
                 radius,
@@ -288,8 +286,8 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 
                     // For the corresponding points, combine normals and centroids
                     ftl::cuda::aggregate_sources(
-                        f1.getTexture<float4>(Channel::Normals),
-                        f2.getTexture<float4>(Channel::Normals),
+                        f1.getTexture<half4>(Channel::Normals),
+                        f2.getTexture<half4>(Channel::Normals),
                         *centroid_vert_[i],
                         *centroid_vert_[j],
 						f1.getTexture<float>(Channel::Depth),
@@ -324,9 +322,8 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
             }*/
 
             ftl::cuda::mls_adjust_depth(
-                f.getTexture<float4>(Channel::Normals),
+                f.getTexture<half4>(Channel::Normals),
                 *centroid_vert_[i],
-                f.getTexture<float>(Channel::Depth),
                 f.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)),
                 f.getLeftCamera(),
                 stream
diff --git a/components/operators/src/mvmls_cuda.hpp b/components/operators/src/mvmls_cuda.hpp
index 1189ca758a4c2765b1a78b2a9f728303ee2b1384..49dd0c1db15e45606f1fe791879c05cf75dfe1f1 100644
--- a/components/operators/src/mvmls_cuda.hpp
+++ b/components/operators/src/mvmls_cuda.hpp
@@ -53,8 +53,8 @@ void zero_confidence(
     cudaStream_t stream);*/
 
 void aggregate_sources(
-		ftl::cuda::TextureObject<float4> &n1,
-		ftl::cuda::TextureObject<float4> &n2,
+		ftl::cuda::TextureObject<half4> &n1,
+		ftl::cuda::TextureObject<half4> &n2,
 		ftl::cuda::TextureObject<float4> &c1,
 		ftl::cuda::TextureObject<float4> &c2,
 		ftl::cuda::TextureObject<float> &depth1,
@@ -66,8 +66,8 @@ void aggregate_sources(
 		cudaStream_t stream);
 
 void best_sources(
-        ftl::cuda::TextureObject<float4> &normals1,
-        ftl::cuda::TextureObject<float4> &normals2,
+        ftl::cuda::TextureObject<half4> &normals1,
+        ftl::cuda::TextureObject<half4> &normals2,
         ftl::cuda::TextureObject<uchar4> &support1,
         ftl::cuda::TextureObject<uchar4> &suppor2,
         ftl::cuda::TextureObject<float> &depth1,
@@ -88,7 +88,7 @@ void vis_best_sources(
         cudaStream_t stream);
 
 void normalise_aggregations(
-    ftl::cuda::TextureObject<float4> &norms,
+    ftl::cuda::TextureObject<half4> &norms,
     ftl::cuda::TextureObject<float4> &cents,
     ftl::cuda::TextureObject<float> &contribs,
     cudaStream_t stream);
diff --git a/components/operators/src/normals.cpp b/components/operators/src/normals.cpp
index 3a4e7f5dbe5b1b02ca16ba5e7aa9d21c0b63dd5c..aefd04623e7ef9a6f89c6cbaa7d3b3984542612e 100644
--- a/components/operators/src/normals.cpp
+++ b/components/operators/src/normals.cpp
@@ -27,7 +27,7 @@ bool Normals::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t st
 	}
 
 	ftl::cuda::normals(
-		out.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+		out.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 		in.createTexture<float>(Channel::Depth),
 		in.getLeftCamera(), stream
 	);
@@ -93,7 +93,7 @@ bool SmoothNormals::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStrea
 	temp_.create(depth.size());
 
     ftl::cuda::normals(
-		out.createTexture<float4>(Channel::Normals, Format<float4>(depth.size())),
+		out.createTexture<half4>(Channel::Normals, Format<half4>(depth.size())),
 		temp_,
 		in.createTexture<float>(Channel::Depth),
 		radius, smoothing,
diff --git a/components/operators/src/smoothing.cpp b/components/operators/src/smoothing.cpp
index c558637085972bdb12f84697131e9617f0e20a9c..49ca3f4b9e76b8f7bebbd28b6b4dfa8fa1b64bb6 100644
--- a/components/operators/src/smoothing.cpp
+++ b/components/operators/src/smoothing.cpp
@@ -159,8 +159,8 @@ bool SimpleMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t
 	// FIXME: Assume in and out are the same frame.
 	for (int i=0; i<iters; ++i) {
 		ftl::cuda::mls_smooth(
-			in.createTexture<float4>(Channel::Normals),
-			temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			in.createTexture<half4>(Channel::Normals),
+			temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 			in.createTexture<float>(Channel::Depth),
 			temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 			thresh,
@@ -204,8 +204,8 @@ bool ColourMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t
 	for (int i=0; i<iters; ++i) {
 		if (!crosssup) {
 			ftl::cuda::colour_mls_smooth(
-				in.createTexture<float4>(Channel::Normals),
-				temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+				in.createTexture<half4>(Channel::Normals),
+				temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<float>(Channel::Depth),
 				temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<uchar4>(Channel::Colour),
@@ -218,8 +218,8 @@ bool ColourMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t
 		} else {
 			ftl::cuda::colour_mls_smooth_csr(
 				in.createTexture<uchar4>(Channel::Support1),
-				in.createTexture<float4>(Channel::Normals),
-				temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+				in.createTexture<half4>(Channel::Normals),
+				temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<float>(Channel::Depth),
 				temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<uchar4>(Channel::Colour),
@@ -278,7 +278,7 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s
 		if (aggre) {
 			ftl::cuda::mls_aggr_horiz(
 				in.createTexture<uchar4>(Channel::Support1),
-				in.createTexture<float4>(Channel::Normals),
+				in.createTexture<half4>(Channel::Normals),
 				normals_horiz_,
 				in.createTexture<float>(Channel::Depth),
 				centroid_horiz_,
@@ -293,11 +293,9 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s
 			ftl::cuda::mls_aggr_vert(
 				in.createTexture<uchar4>(Channel::Support1),
 				normals_horiz_,
-				in.createTexture<float4>(Channel::Normals),
+				in.createTexture<half4>(Channel::Normals),
 				centroid_horiz_,
 				centroid_vert_,
-				in.createTexture<uchar4>(Channel::Colour),
-				in.createTexture<float>(Channel::Depth),
 				thresh,
 				col_smooth,
 				radius,
@@ -306,9 +304,8 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s
 			);
 
 			ftl::cuda::mls_adjust_depth(
-				in.createTexture<float4>(Channel::Normals),
+				in.createTexture<half4>(Channel::Normals),
 				centroid_vert_,
-				in.createTexture<float>(Channel::Depth),
 				temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(size)),
 				in.getLeftCamera(),
 				stream
@@ -321,8 +318,8 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s
 		} else {
 			ftl::cuda::colour_mls_smooth_csr(
 				in.createTexture<uchar4>(Channel::Support1),
-				in.createTexture<float4>(Channel::Normals),
-				temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+				in.createTexture<half4>(Channel::Normals),
+				temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<float>(Channel::Depth),
 				temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 				in.createTexture<uchar4>(Channel::Colour),
@@ -363,8 +360,8 @@ bool AdaptiveMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_
 	// FIXME: Assume in and out are the same frame.
 	for (int i=0; i<iters; ++i) {
 		ftl::cuda::adaptive_mls_smooth(
-			in.createTexture<float4>(Channel::Normals),
-			temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
+			in.createTexture<half4>(Channel::Normals),
+			temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 			in.createTexture<float>(Channel::Depth),
 			temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())),
 			in.createTexture<float>(Channel::Smoothing),
diff --git a/components/operators/src/smoothing_cuda.hpp b/components/operators/src/smoothing_cuda.hpp
index 6681c1a2a0383fdf69c7a62ab0b78346fede5f3a..06c6d713f81ab41e14695bcad21943be7d71e3d4 100644
--- a/components/operators/src/smoothing_cuda.hpp
+++ b/components/operators/src/smoothing_cuda.hpp
@@ -8,8 +8,8 @@ namespace ftl {
 namespace cuda {
 
 void mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		float smoothing,
@@ -18,8 +18,8 @@ void mls_smooth(
 		cudaStream_t stream);
 
 void colour_mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -31,8 +31,8 @@ void colour_mls_smooth(
 
 void colour_mls_smooth_csr(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -43,17 +43,16 @@ void colour_mls_smooth_csr(
 		cudaStream_t stream);
 
 void mls_adjust_depth(
-		ftl::cuda::TextureObject<float4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_in,
 		ftl::cuda::TextureObject<float4> &centroid_in,
-		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		const ftl::rgbd::Camera &camera,
 		cudaStream_t stream);
 
 void mls_aggr_horiz(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float4> &centroid_out,
 		ftl::cuda::TextureObject<uchar4> &colour_in,
@@ -65,12 +64,10 @@ void mls_aggr_horiz(
 
 void mls_aggr_vert(
 		ftl::cuda::TextureObject<uchar4> &region,
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float4> &centroid_in,
 		ftl::cuda::TextureObject<float4> &centroid_out,
-		ftl::cuda::TextureObject<uchar4> &colour_in,
-		ftl::cuda::TextureObject<float> &depth_in,
 		float smoothing,
 		float colour_smoothing,
 		int radius,
@@ -78,8 +75,8 @@ void mls_aggr_vert(
 		cudaStream_t stream);
 
 void adaptive_mls_smooth(
-		ftl::cuda::TextureObject<float4> &normals_in,
-		ftl::cuda::TextureObject<float4> &normals_out,
+		ftl::cuda::TextureObject<half4> &normals_in,
+		ftl::cuda::TextureObject<half4> &normals_out,
 		ftl::cuda::TextureObject<float> &depth_in,
 		ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<float> &smoothing,
diff --git a/components/renderers/cpp/include/ftl/cuda/normals.hpp b/components/renderers/cpp/include/ftl/cuda/normals.hpp
index da07582d3155485f32c13d34b38c65d88ed4da73..0ac3d3713b1cafc7710094e1f74231831ab19959 100644
--- a/components/renderers/cpp/include/ftl/cuda/normals.hpp
+++ b/components/renderers/cpp/include/ftl/cuda/normals.hpp
@@ -8,7 +8,7 @@
 namespace ftl {
 namespace cuda {
 
-void normals(ftl::cuda::TextureObject<float4> &output,
+/*void normals(ftl::cuda::TextureObject<float4> &output,
         ftl::cuda::TextureObject<float4> &temp,
         ftl::cuda::TextureObject<float4> &input,
 		int radius,
@@ -22,17 +22,17 @@ void normals(ftl::cuda::TextureObject<float4> &output,
 		int radius,
 		float smoothing,
         const ftl::rgbd::Camera &camera,
-        const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream);
+        const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream);*/
 
-void normals(ftl::cuda::TextureObject<float4> &output,
-        ftl::cuda::TextureObject<float4> &temp,
+void normals(ftl::cuda::TextureObject<half4> &output,
+        ftl::cuda::TextureObject<half4> &temp,
         ftl::cuda::TextureObject<float> &input,
 		int radius,
 		float smoothing,
         const ftl::rgbd::Camera &camera,
         const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream);
 
-void normals(ftl::cuda::TextureObject<float4> &output,
+void normals(ftl::cuda::TextureObject<half4> &output,
         ftl::cuda::TextureObject<float> &input,  // Integer depth values
         const ftl::rgbd::Camera &camera,
         cudaStream_t stream);
@@ -42,22 +42,22 @@ void normals_dot(ftl::cuda::TextureObject<float> &output,
         const ftl::rgbd::Camera &camera,
         cudaStream_t stream);
 
-void normal_visualise(ftl::cuda::TextureObject<float4> &norm,
+void normal_visualise(ftl::cuda::TextureObject<half4> &norm,
         ftl::cuda::TextureObject<uchar4> &output,
         const float3 &light, const uchar4 &diffuse, const uchar4 &ambient,
         cudaStream_t stream);
 
-void cool_blue(ftl::cuda::TextureObject<float4> &norm,
+void cool_blue(ftl::cuda::TextureObject<half4> &norm,
         ftl::cuda::TextureObject<uchar4> &output,
         const uchar4 &colouring, const float3x3 &pose,
         cudaStream_t stream);
 
-void normal_filter(ftl::cuda::TextureObject<float4> &norm,
+/*void normal_filter(ftl::cuda::TextureObject<half4> &norm,
         ftl::cuda::TextureObject<float4> &points,
         const ftl::rgbd::Camera &camera, const float4x4 &pose,
-        float thresh, cudaStream_t stream);
+        float thresh, cudaStream_t stream);*/
 
-void transform_normals(ftl::cuda::TextureObject<float4> &norm,
+void transform_normals(ftl::cuda::TextureObject<half4> &norm,
         const float3x3 &pose,
         cudaStream_t stream);
 
diff --git a/components/renderers/cpp/include/ftl/cuda/weighting.hpp b/components/renderers/cpp/include/ftl/cuda/weighting.hpp
index 61db736d316494b9c20262eff080aacec710ec26..fb3429dd5a58da419f8c2188386c22b91c340169 100644
--- a/components/renderers/cpp/include/ftl/cuda/weighting.hpp
+++ b/components/renderers/cpp/include/ftl/cuda/weighting.hpp
@@ -44,6 +44,17 @@ __device__ inline float colourDistance(uchar4 a, uchar4 b) {
 	return ch*ch*ch*ch;
 }
 
+/*
+ * Alternative colour distance measure
+ */
+ __device__ inline float colourWeighting2(uchar4 a, uchar4 b, float h) {
+	const float c = float(max(abs(int(a.x)-int(b.x)), max(abs(int(a.y)-int(b.y)), abs(int(a.z)-int(b.z)))));
+	if (c >= h) return 0.0f;
+	float ch = c / h;
+	ch = 1.0f - ch*ch;
+	return ch*ch*ch*ch;
+}
+
 /*
  * Colour weighting as suggested in:
  * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
@@ -71,6 +82,11 @@ __device__ inline float colourDistance(uchar4 a, uchar4 b) {
 	return ch*ch*ch*ch;
 }
 
+ __device__ inline float simpleColourWeighting(uchar4 a, uchar4 b, float h) {
+	const float w = max(fabsf(float(a.x)-float(b.x)), max(fabsf(float(a.y)-float(b.y)), fabsf(float(a.z)-float(b.z))));
+	return 1.0f - min(1.0f, w/h);
+}
+
 }
 }
 
diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp
index 38e244d34c2f1c7428c03d2aaae637dfbc603e3e..c346b1809182aa0815e7140a54efbda6397c385b 100644
--- a/components/renderers/cpp/src/CUDARender.cpp
+++ b/components/renderers/cpp/src/CUDARender.cpp
@@ -199,7 +199,7 @@ void CUDARender::__reprojectChannel(ftl::rgbd::Frame &output, ftl::codecs::Chann
 					f.createTexture<float>(Channel::Depth),
 					output.getTexture<float>(Channel::Depth),
 					f.createTexture<short>(Channel::Weights),
-					(output.hasChannel(Channel::Normals)) ? &output.createTexture<float4>(Channel::Normals) : nullptr,
+					(output.hasChannel(Channel::Normals)) ? &output.createTexture<half4>(Channel::Normals) : nullptr,
 					temp_.createTexture<typename AccumSelector<T>::type>(AccumSelector<T>::channel),
 					temp_.getTexture<int>(Channel::Contribution),
 					params_,
@@ -394,8 +394,8 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre
 	//filters_->filter(out, src, stream);
 
 	// Generate normals for final virtual image
-	ftl::cuda::normals(out.createTexture<float4>(Channel::Normals, Format<float4>(params_.camera.width, params_.camera.height)),
-				temp_.createTexture<float4>(Channel::Normals),
+	ftl::cuda::normals(out.createTexture<half4>(Channel::Normals, Format<half4>(params_.camera.width, params_.camera.height)),
+				temp_.createTexture<half4>(Channel::Normals),
 				out.createTexture<float>(Channel::Depth),
 				value("normal_radius", 1), value("normal_smoothing", 0.02f),
 				params_.camera, pose_.getFloat3x3(), poseInverse_.getFloat3x3(), stream_);
@@ -495,7 +495,7 @@ void CUDARender::_allocateChannels(ftl::rgbd::Frame &out) {
 	temp_.create<GpuMat>(Channel::Contribution, Format<int>(camera.width, camera.height));
 	temp_.create<GpuMat>(Channel::Depth, Format<int>(camera.width, camera.height));
 	temp_.create<GpuMat>(Channel::Depth2, Format<int>(camera.width, camera.height));
-	temp_.create<GpuMat>(Channel::Normals, Format<float4>(camera.width, camera.height));
+	temp_.create<GpuMat>(Channel::Normals, Format<half4>(camera.width, camera.height));
 	temp_.create<GpuMat>(Channel::Weights, Format<float>(camera.width, camera.height));
 	temp_.createTexture<int>(Channel::Depth);
 }
@@ -558,7 +558,7 @@ void CUDARender::_postprocessColours(ftl::rgbd::Frame &out) {
 		auto col = parseCUDAColour(value("cool_effect_colour", std::string("#2222ff")));
 
 		ftl::cuda::cool_blue(
-			out.getTexture<float4>(Channel::Normals),
+			out.getTexture<half4>(Channel::Normals),
 			out.getTexture<uchar4>(Channel::Colour),
 			col, pose,
 			stream_	
@@ -599,7 +599,7 @@ void CUDARender::_renderNormals(ftl::rgbd::Frame &out) {
 	// Visualise normals to RGBA
 	out.create<GpuMat>(Channel::ColourNormals, Format<uchar4>(params_.camera.width, params_.camera.height)).setTo(cv::Scalar(0,0,0,0), cvstream);
 
-	ftl::cuda::normal_visualise(out.getTexture<float4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals),
+	ftl::cuda::normal_visualise(out.getTexture<half4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals),
 			light_pos_,
 			light_diffuse_,
 			light_ambient_, stream_);
diff --git a/components/renderers/cpp/src/normals.cu b/components/renderers/cpp/src/normals.cu
index eb2a8fc892562ac275b0d6dd1ab0229b02c34b12..81ee7638b2df2cb3193dacdcfb679564818dc4bb 100644
--- a/components/renderers/cpp/src/normals.cu
+++ b/components/renderers/cpp/src/normals.cu
@@ -4,7 +4,7 @@
 #define T_PER_BLOCK 16
 #define MINF __int_as_float(0xff800000)
 
-__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
+/*__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
         ftl::cuda::TextureObject<float4> input) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -29,13 +29,13 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
 			}
 		}
 	}
-}
+}*/
 
 __device__ inline bool isValid(const ftl::rgbd::Camera &camera, const float3 &d) {
 	return d.z >= camera.minDepth && d.z <= camera.maxDepth;
 }
 
-__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
+/*__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
 		ftl::cuda::TextureObject<int> input, ftl::rgbd::Camera camera, float3x3 pose) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -61,16 +61,16 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
 			}
 		}
 	}
-}
+}*/
 
-__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
+__global__ void computeNormals_kernel(ftl::cuda::TextureObject<half4> output,
         ftl::cuda::TextureObject<float> input, ftl::rgbd::Camera camera) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
     if(x >= input.width() || y >= input.height()) return;
 
-    output(x,y) = make_float4(0, 0, 0, 0);
+    output(x,y) = make_half4(0, 0, 0, 0);
 
     if(x > 0 && x < input.width()-1 && y > 0 && y < input.height()-1) {
         const float3 CC = camera.screenToCam(x+0, y+0, input.tex2D((int)x+0, (int)y+0));
@@ -85,13 +85,13 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output,
             const float  l = length(n);
 
             if(l > 0.0f) {
-                output(x,y) = make_float4((n/-l), 1.0f);
+                output(x,y) = make_half4((n/-l), 1.0f);
             }
         }
     }
 }
 
-template <int RADIUS>
+/*template <int RADIUS>
 __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
         ftl::cuda::TextureObject<float4> output,
         ftl::cuda::TextureObject<float4> points,
@@ -134,9 +134,9 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
     nsum /= length(nsum);
 
     output(x,y) = (contrib > 0.0f) ? make_float4(nsum, dot(nsum, ray)) : make_float4(0.0f);
-}
+}*/
 
-template <int RADIUS>
+/*template <int RADIUS>
 __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
         ftl::cuda::TextureObject<float4> output,
         ftl::cuda::TextureObject<int> depth,
@@ -179,9 +179,9 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
     nsum /= length(nsum);
 
     output(x,y) = (contrib > 0.0f) ? make_float4(pose*nsum, 1.0f) : make_float4(0.0f);
-}
+}*/
 
-template <>
+/*template <>
 __global__ void smooth_normals_kernel<0>(ftl::cuda::TextureObject<float4> norms,
         ftl::cuda::TextureObject<float4> output,
         ftl::cuda::TextureObject<int> depth,
@@ -206,11 +206,11 @@ __global__ void smooth_normals_kernel<0>(ftl::cuda::TextureObject<float4> norms,
 
     const float4 n = norms.tex2D((int)x,(int)y);
     output(x,y) = n;
-}
+}*/
 
 template <int RADIUS>
-__global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
-        ftl::cuda::TextureObject<float4> output,
+__global__ void smooth_normals_kernel(ftl::cuda::TextureObject<half4> norms,
+        ftl::cuda::TextureObject<half4> output,
         ftl::cuda::TextureObject<float> depth,
         ftl::rgbd::Camera camera, float3x3 pose, float smoothing) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
@@ -222,7 +222,7 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
     float3 nsum = make_float3(0.0f);
     float contrib = 0.0f;
 
-    output(x,y) = make_float4(0.0f,0.0f,0.0f,0.0f);
+    output(x,y) = make_half4(0.0f,0.0f,0.0f,0.0f);
 
     if (p0.z < camera.minDepth || p0.z > camera.maxDepth) return;
 
@@ -234,7 +234,7 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
             //const float s = 1.0f;
 
             //if (s > 0.0f) {
-                const float4 n = norms.tex2D((int)x+u,(int)y+v);
+                const float4 n = make_float4(norms.tex2D((int)x+u,(int)y+v));
                 if (n.w > 0.0f) {
                     nsum += make_float3(n) * s;
                     contrib += s;
@@ -250,10 +250,10 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms,
     nsum /= contrib;
     nsum /= length(nsum);
 
-    output(x,y) = (contrib > 0.0f) ? make_float4(pose*nsum, 1.0f) : make_float4(0.0f);
+    output(x,y) = (contrib > 0.0f) ? make_half4(pose*nsum, 1.0f) : make_half4(0.0f);
 }
 
-void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
+/*void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
         ftl::cuda::TextureObject<float4> &temp,
 		ftl::cuda::TextureObject<float4> &input,
 		int radius,
@@ -309,10 +309,10 @@ void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
 	cudaSafeCall(cudaDeviceSynchronize());
 	//cutilCheckMsg(__FUNCTION__);
 	#endif
-}
+}*/
 
-void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
-		ftl::cuda::TextureObject<float4> &temp,
+void ftl::cuda::normals(ftl::cuda::TextureObject<half4> &output,
+		ftl::cuda::TextureObject<half4> &temp,
 		ftl::cuda::TextureObject<float> &input,
 		int radius,
 		float smoothing,
@@ -339,7 +339,7 @@ void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
 	#endif
 }
 
-void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output,
+void ftl::cuda::normals(ftl::cuda::TextureObject<half4> &output,
         ftl::cuda::TextureObject<float> &input,
         const ftl::rgbd::Camera &camera,
         cudaStream_t stream) {
@@ -409,7 +409,7 @@ void ftl::cuda::normals_dot(ftl::cuda::TextureObject<float> &output,
 
 //==============================================================================
 
-__global__ void vis_normals_kernel(ftl::cuda::TextureObject<float4> norm,
+__global__ void vis_normals_kernel(ftl::cuda::TextureObject<half4> norm,
         ftl::cuda::TextureObject<uchar4> output,
         float3 direction, uchar4 diffuse, uchar4 ambient) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
@@ -432,7 +432,7 @@ __global__ void vis_normals_kernel(ftl::cuda::TextureObject<float4> norm,
 		min(255.0f, diffuse.z*d + ambient.z), 255);
 }
 
-void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<float4> &norm,
+void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<half4> &norm,
         ftl::cuda::TextureObject<uchar4> &output,
         const float3 &light, const uchar4 &diffuse, const uchar4 &ambient,
         cudaStream_t stream) {
@@ -451,7 +451,7 @@ void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<float4> &norm,
 
 //==============================================================================
 
-__global__ void cool_blue_kernel(ftl::cuda::TextureObject<float4> norm,
+__global__ void cool_blue_kernel(ftl::cuda::TextureObject<half4> norm,
         ftl::cuda::TextureObject<uchar4> output,
         uchar4 colouring, float3x3 pose) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
@@ -476,7 +476,7 @@ __global__ void cool_blue_kernel(ftl::cuda::TextureObject<float4> norm,
         min(255.0f, colouring.z*d + original.z), 255);
 }
 
-void ftl::cuda::cool_blue(ftl::cuda::TextureObject<float4> &norm,
+void ftl::cuda::cool_blue(ftl::cuda::TextureObject<half4> &norm,
         ftl::cuda::TextureObject<uchar4> &output,
         const uchar4 &colouring, const float3x3 &pose,
         cudaStream_t stream) {
@@ -495,7 +495,7 @@ void ftl::cuda::cool_blue(ftl::cuda::TextureObject<float4> &norm,
 
 //==============================================================================
 
-__global__ void filter_normals_kernel(ftl::cuda::TextureObject<float4> norm,
+/*__global__ void filter_normals_kernel(ftl::cuda::TextureObject<float4> norm,
         ftl::cuda::TextureObject<float4> output,
         ftl::rgbd::Camera camera, float4x4 pose, float thresh) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
@@ -535,11 +535,11 @@ void ftl::cuda::normal_filter(ftl::cuda::TextureObject<float4> &norm,
     cudaSafeCall(cudaDeviceSynchronize());
     //cutilCheckMsg(__FUNCTION__);
     #endif
-}
+}*/
 
 //==============================================================================
 
-__global__ void transform_normals_kernel(ftl::cuda::TextureObject<float4> norm,
+__global__ void transform_normals_kernel(ftl::cuda::TextureObject<half4> norm,
         float3x3 pose) {
     const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -548,10 +548,10 @@ __global__ void transform_normals_kernel(ftl::cuda::TextureObject<float4> norm,
 
     float3 normal = pose * make_float3(norm.tex2D((int)x,(int)y));
     normal /= length(normal);
-    norm(x,y) = make_float4(normal, 0.0f);
+    norm(x,y) = make_half4(normal, 0.0f);
 }
 
-void ftl::cuda::transform_normals(ftl::cuda::TextureObject<float4> &norm,
+void ftl::cuda::transform_normals(ftl::cuda::TextureObject<half4> &norm,
         const float3x3 &pose,
         cudaStream_t stream) {
 
diff --git a/components/renderers/cpp/src/reprojection.cu b/components/renderers/cpp/src/reprojection.cu
index 2012642a80729dec2c7dd88de9003e409dd6560c..ce0745003c5415cdf07e3941c0e092b60f4b65cc 100644
--- a/components/renderers/cpp/src/reprojection.cu
+++ b/components/renderers/cpp/src/reprojection.cu
@@ -130,7 +130,7 @@ __device__ inline auto getInput(TextureObject<A> &in, const float2 &screen, floa
 	return in.tex2D(screen.x*inSX, screen.y*inSY); 
 }
 
-__device__ float weightByNormal(TextureObject<float4> &normals, int x, int y, const float3x3 &transformR, const float2 &screenPos, const ftl::rgbd::Camera &camera) {
+__device__ float weightByNormal(TextureObject<half4> &normals, int x, int y, const float3x3 &transformR, const float2 &screenPos, const ftl::rgbd::Camera &camera) {
 	// Calculate the dot product of surface normal and camera ray
 	const float3 n = transformR * make_float3(normals.tex2D(x, y));
 	float3 ray = camera.screenToCam(screenPos.x, screenPos.y, 1.0f);
@@ -156,7 +156,7 @@ __global__ void reprojection_kernel(
         TextureObject<float> depth_src,
 		TextureObject<float> depth_in,        // Virtual depth map
 		TextureObject<short> weights,
-		TextureObject<float4> normals,
+		TextureObject<half4> normals,
 		TextureObject<B> out,			// Accumulated output
 		TextureObject<int> contrib,
 		Parameters params,
@@ -258,7 +258,7 @@ void ftl::cuda::reproject(
         TextureObject<float> &depth_src,       // Original 3D points
 		TextureObject<float> &depth_in,        // Virtual depth map
 		TextureObject<short> &weights,
-		TextureObject<float4> *normals,
+		TextureObject<half4> *normals,
 		TextureObject<B> &out,   // Accumulated output
 		TextureObject<int> &contrib,
 		const Parameters &params,
@@ -340,7 +340,7 @@ template void ftl::cuda::reproject(
 	ftl::cuda::TextureObject<float> &depth_src,		// Original 3D points
 	ftl::cuda::TextureObject<float> &depth_in,		// Virtual depth map
 	ftl::cuda::TextureObject<short> &weights,
-	ftl::cuda::TextureObject<float4> *normals,
+	ftl::cuda::TextureObject<half4> *normals,
 	ftl::cuda::TextureObject<float4> &out,	// Accumulated output
 	ftl::cuda::TextureObject<int> &contrib,
 	const ftl::render::Parameters &params,
@@ -353,7 +353,7 @@ template void ftl::cuda::reproject(
 		ftl::cuda::TextureObject<float> &depth_src,		// Original 3D points
 		ftl::cuda::TextureObject<float> &depth_in,		// Virtual depth map
 		ftl::cuda::TextureObject<short> &weights,
-		ftl::cuda::TextureObject<float4> *normals,
+		ftl::cuda::TextureObject<half4> *normals,
 		ftl::cuda::TextureObject<float> &out,	// Accumulated output
 		ftl::cuda::TextureObject<int> &contrib,
 		const ftl::render::Parameters &params,
@@ -366,7 +366,7 @@ template void ftl::cuda::reproject(
 		ftl::cuda::TextureObject<float> &depth_src,		// Original 3D points
 		ftl::cuda::TextureObject<float> &depth_in,		// Virtual depth map
 		ftl::cuda::TextureObject<short> &weights,
-		ftl::cuda::TextureObject<float4> *normals,
+		ftl::cuda::TextureObject<half4> *normals,
 		ftl::cuda::TextureObject<float4> &out,	// Accumulated output
 		ftl::cuda::TextureObject<int> &contrib,
 		const ftl::render::Parameters &params,
diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp
index 285f2e651394913d8b10dba603a04b74090fe52a..da47b6fd3b2ef8ae0809728d27d876338207350f 100644
--- a/components/renderers/cpp/src/splatter_cuda.hpp
+++ b/components/renderers/cpp/src/splatter_cuda.hpp
@@ -48,7 +48,7 @@ namespace cuda {
 		float alpha,
 		cudaStream_t stream);
 	
-	void dibr_merge(
+	/*void dibr_merge(
 		ftl::cuda::TextureObject<float4> &points,
 		ftl::cuda::TextureObject<float4> &normals,
 		ftl::cuda::TextureObject<int> &depth,
@@ -60,7 +60,7 @@ namespace cuda {
 		ftl::cuda::TextureObject<float4> &points,
 		ftl::cuda::TextureObject<int> &depth,
 		ftl::render::Parameters params,
-		cudaStream_t stream);
+		cudaStream_t stream);*/
 
 	void dibr_merge(
 		ftl::cuda::TextureObject<float> &depth,
@@ -77,15 +77,15 @@ namespace cuda {
 		ftl::render::Parameters params,
 		cudaStream_t stream);
 
-	template <typename T>
+	/*template <typename T>
 	void splat(
-        ftl::cuda::TextureObject<float4> &normals,
+        ftl::cuda::TextureObject<half4> &normals,
 		ftl::cuda::TextureObject<float> &density,
 		ftl::cuda::TextureObject<T> &colour_in,
         ftl::cuda::TextureObject<int> &depth_in,        // Virtual depth map
         ftl::cuda::TextureObject<float> &depth_out,
 		ftl::cuda::TextureObject<T> &colour_out,
-        const ftl::render::Parameters &params, cudaStream_t stream);
+        const ftl::render::Parameters &params, cudaStream_t stream);*/
 
 	template <typename A, typename B>
 	void dibr_attribute(
@@ -102,7 +102,7 @@ namespace cuda {
 		ftl::cuda::TextureObject<float> &depth_src,		// Original 3D points
 		ftl::cuda::TextureObject<float> &depth_in,		// Virtual depth map
 		ftl::cuda::TextureObject<short> &weights,
-		ftl::cuda::TextureObject<float4> *normals,
+		ftl::cuda::TextureObject<half4> *normals,
 		ftl::cuda::TextureObject<B> &out,	// Accumulated output
 		ftl::cuda::TextureObject<int> &contrib,
 		const ftl::render::Parameters &params,