From ec7267db063fffa691ef4c27f47511f51cd45132 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Tue, 12 Nov 2019 14:07:54 +0200
Subject: [PATCH] Implements #236 Fast Approximate Anti-Aliasing

---
 applications/gui/src/camera.cpp               |  3 +-
 applications/gui/src/media_panel.cpp          |  2 +-
 .../reconstruct/src/ilw/correspondence.cu     |  8 +-
 applications/reconstruct/src/main.cpp         |  8 +-
 .../codecs/include/ftl/codecs/channels.hpp    |  1 +
 .../common/cpp/include/ftl/cuda_common.hpp    | 33 +++++--
 components/operators/CMakeLists.txt           |  2 +
 .../include/ftl/operators/antialiasing.hpp    | 27 ++++++
 components/operators/src/antialiasing.cpp     | 22 +++++
 components/operators/src/antialiasing.cu      | 87 +++++++++++++++++++
 .../operators/src/antialiasing_cuda.hpp       | 14 +++
 components/operators/src/colours.cpp          |  3 +
 components/operators/src/mls.cu               |  8 +-
 components/operators/src/segmentation.cu      | 25 +++---
 .../cpp/include/ftl/cuda/weighting.hpp        | 14 +++
 components/renderers/cpp/src/reprojection.cu  |  6 +-
 components/renderers/cpp/src/tri_render.cpp   | 15 ++--
 .../rgbd-sources/include/ftl/rgbd/frame.hpp   | 16 ++--
 18 files changed, 251 insertions(+), 43 deletions(-)
 create mode 100644 components/operators/include/ftl/operators/antialiasing.hpp
 create mode 100644 components/operators/src/antialiasing.cpp
 create mode 100644 components/operators/src/antialiasing.cu
 create mode 100644 components/operators/src/antialiasing_cuda.hpp

diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index debb8e73d..ae0d5627a 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -293,6 +293,7 @@ void ftl::gui::Camera::setChannel(Channel c) {
 	case Channel::Flow:
 	case Channel::Confidence:
 	case Channel::Normals:
+	case Channel::ColourNormals:
 	case Channel::Right:
 		src_->setChannel(c);
 		break;
@@ -464,7 +465,7 @@ const GLTexture &ftl::gui::Camera::captureFrame() {
 				break;
 
 		//case Channel::Flow:
-		case Channel::Normals:
+		case Channel::ColourNormals:
 		case Channel::Right:
 				if (im2_.rows == 0 || im2_.type() != CV_8UC3) { break; }
 				texture2_.update(im2_);
diff --git a/applications/gui/src/media_panel.cpp b/applications/gui/src/media_panel.cpp
index b3ea1a6af..800940199 100644
--- a/applications/gui/src/media_panel.cpp
+++ b/applications/gui/src/media_panel.cpp
@@ -192,7 +192,7 @@ MediaPanel::MediaPanel(ftl::gui::Screen *screen) : nanogui::Window(screen, ""),
 	button->setCallback([this]() {
 		ftl::gui::Camera *cam = screen_->activeCamera();
 		if (cam) {
-			cam->setChannel(Channel::Normals);
+			cam->setChannel(Channel::ColourNormals);
 		}
 	});
 
diff --git a/applications/reconstruct/src/ilw/correspondence.cu b/applications/reconstruct/src/ilw/correspondence.cu
index 8fb9d94b7..9fb440ae4 100644
--- a/applications/reconstruct/src/ilw/correspondence.cu
+++ b/applications/reconstruct/src/ilw/correspondence.cu
@@ -65,7 +65,7 @@ __global__ void correspondence_energy_vector_kernel(
 	
 	const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
 
-    const uchar4 colour1 = c1.tex2D(x, y);
+    const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f);
 
 	float bestdepth = 0.0f;
 	float bestweight = 0.0f;
@@ -91,17 +91,17 @@ __global__ void correspondence_energy_vector_kernel(
         // Calculate adjusted depth 3D point in camera 2 space
         const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust));
         const float3 camPos = rayStart_2 + j * rayStep_2; //pose2 * worldPos;
-        const uint2 screen = cam2.camToScreen<uint2>(camPos);
+        const float2 screen = cam2.camToScreen<float2>(camPos);
 
         if (screen.x >= cam2.width || screen.y >= cam2.height) continue;
 
 		// Generate a depth correspondence value
-		const float depth2 = d2.tex2D((int)screen.x, (int)screen.y);
+		const float depth2 = d2.tex2D(int(screen.x+0.5f), int(screen.y+0.5f));
 		const float dweight = ftl::cuda::weighting(fabs(depth2 - camPos.z), params.spatial_smooth);
 		//const float dweight = ftl::cuda::weighting(fabs(depth_adjust - depth1), 2.0f*params.range);
 		
 		// Generate a colour correspondence value
-		const uchar4 colour2 = c2.tex2D((int)screen.x, (int)screen.y);
+		const auto colour2 = c2.tex2D(screen.x, screen.y);
 		const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
 
 		const float weight = weightFunction<FUNCTION>(params, dweight, cweight);
diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp
index 03c21b142..06111688a 100644
--- a/applications/reconstruct/src/main.cpp
+++ b/applications/reconstruct/src/main.cpp
@@ -36,6 +36,7 @@
 #include <ftl/operators/filling.hpp>
 #include <ftl/operators/segmentation.hpp>
 #include <ftl/operators/mask.hpp>
+#include <ftl/operators/antialiasing.hpp>
 
 #include <ftl/cuda/normals.hpp>
 #include <ftl/registration.hpp>
@@ -236,8 +237,12 @@ static void run(ftl::Configurable *root) {
 	int o = root->value("origin_pose", 0) % sources.size();
 	virt->setPose(sources[o]->getPose());
 
+	auto *renderpipe = ftl::config::create<ftl::operators::Graph>(root, "render_pipe");
+	renderpipe->append<ftl::operators::ColourChannels>("colour");  // Generate interpolation texture...
+	renderpipe->append<ftl::operators::FXAA>("antialiasing"); 
+
 	// Generate virtual camera render when requested by streamer
-	virt->onRender([splat,virt,&scene_B,align](ftl::rgbd::Frame &out) {
+	virt->onRender([splat,virt,&scene_B,align,renderpipe](ftl::rgbd::Frame &out) {
 		//virt->setTimestamp(scene_B.timestamp);
 		// Do we need to convert Lab to BGR?
 		if (align->isLabColour()) {
@@ -247,6 +252,7 @@ static void run(ftl::Configurable *root) {
 			}
 		}
 		splat->render(virt, out);
+		renderpipe->apply(out, out, virt, 0);
 	});
 	stream->add(virt);
 
diff --git a/components/codecs/include/ftl/codecs/channels.hpp b/components/codecs/include/ftl/codecs/channels.hpp
index 3a486cb32..6673275fe 100644
--- a/components/codecs/include/ftl/codecs/channels.hpp
+++ b/components/codecs/include/ftl/codecs/channels.hpp
@@ -31,6 +31,7 @@ enum struct Channel : int {
 	Support1		= 13,	// 8UC4 (currently)
 	Support2		= 14,	// 8UC4 (currently)
     Segmentation	= 15,	// 32S?
+    ColourNormals   = 16,   // 8UC4
 
 	AudioLeft		= 32,
 	AudioRight		= 33,
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 116e26ec7..3ce2452dc 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -28,6 +28,22 @@ bool hasCompute(int major, int minor);
 
 int deviceCount();
 
+/** Maps a texel type T to the floating-point type returned by the float-coordinate tex2D() overload. */
+template <typename T> struct Float;
+
+template <> struct Float<float> { using type = float; };
+template <> struct Float<int> { using type = float; };
+template <> struct Float<float4> { using type = float4; };
+template <> struct Float<uchar4> { using type = float4; };
+template <> struct Float<short2> { using type = float2; };
+
+/** Factor the float-coordinate tex2D() overload multiplies its fetched value by, per texel type T. */
+template <typename T> struct ScaleValue;
+// NOTE(review): int and short2 have a Float mapping above but no ScaleValue specialisation here.
+template <> struct ScaleValue<uchar4> { static constexpr float value = 255.0f; };
+template <> struct ScaleValue<float> { static constexpr float value = 1.0f; };
+template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; };
+
 /**
  * Represent a CUDA texture object. Instances of this class can be used on both
  * host and device. A texture object base cannot be constructed directly, it
@@ -89,7 +105,7 @@ class TextureObject : public TextureObjectBase {
 	static_assert((16u % sizeof(T)) == 0, "Channel format must be aligned with 16 bytes");
 
 	__host__ __device__ TextureObject() : TextureObjectBase() {};
-	explicit TextureObject(const cv::cuda::GpuMat &d);
+	explicit TextureObject(const cv::cuda::GpuMat &d, bool interpolated=false);
 	explicit TextureObject(const cv::cuda::PtrStepSz<T> &d);
 	TextureObject(T *ptr, int pitch, int width, int height);
 	TextureObject(size_t width, size_t height);
@@ -110,7 +126,8 @@ class TextureObject : public TextureObjectBase {
 
 	#ifdef __CUDACC__
 	__device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); }
-	__device__ inline T tex2D(float u, float v) const { return ::tex2D<T>(texobj_, u, v); }
+	__device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); }  // unsigned overload: casts the coordinates to int before fetching
+	__device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; }  // float overload: fetches as Float<T>::type and multiplies by ScaleValue<T>::value
 	#endif
 
 	__host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; }
@@ -137,7 +154,7 @@ TextureObject<T> &TextureObject<T>::cast(TextureObjectBase &b) {
  * Create a 2D array texture from an OpenCV GpuMat object.
  */
 template <typename T>
-TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d) {
+TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) {
 	// GpuMat must have correct data type
 	CHECK(d.type() == ftl::traits::OpenCVType<T>::value);
 
@@ -153,7 +170,8 @@ TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d) {
 	cudaTextureDesc texDesc;
 	// cppcheck-suppress memsetClassFloat
 	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.readMode = cudaReadModeElementType;
+	texDesc.readMode = (interpolated) ? cudaReadModeNormalizedFloat : cudaReadModeElementType;
+	if (interpolated) texDesc.filterMode = cudaFilterModeLinear;
 
 	cudaTextureObject_t tex = 0;
 	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
@@ -188,6 +206,7 @@ TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
 	// cppcheck-suppress memsetClassFloat
 	memset(&texDesc, 0, sizeof(texDesc));
 	texDesc.readMode = cudaReadModeElementType;
+	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
 
 	cudaTextureObject_t tex = 0;
 	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
@@ -221,9 +240,10 @@ TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
 	// cppcheck-suppress memsetClassFloat
 	memset(&texDesc, 0, sizeof(texDesc));
 	texDesc.readMode = cudaReadModeElementType;
+	//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
 
 	cudaTextureObject_t tex = 0;
-	cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
 	texobj_ = tex;
 	pitch_ = pitch;
 	pitch2_ = pitch_ / sizeof(T);
@@ -255,7 +275,8 @@ TextureObject<T>::TextureObject(size_t width, size_t height) {
 		// cppcheck-suppress memsetClassFloat
 		memset(&texDesc, 0, sizeof(texDesc));
 		texDesc.readMode = cudaReadModeElementType;
-		cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+		//if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear;
+		cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
 	//}
 
 	texobj_ = tex;
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index 734734256..f177aa591 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -16,6 +16,8 @@ set(OPERSRC
 	src/segmentation.cpp
 	src/mask.cu
 	src/mask.cpp
+	src/antialiasing.cpp
+	src/antialiasing.cu
 )
 
 if (HAVE_OPTFLOW)
diff --git a/components/operators/include/ftl/operators/antialiasing.hpp b/components/operators/include/ftl/operators/antialiasing.hpp
new file mode 100644
index 000000000..302631253
--- /dev/null
+++ b/components/operators/include/ftl/operators/antialiasing.hpp
@@ -0,0 +1,27 @@
+#ifndef _FTL_OPERATORS_ANTIALIASING_HPP_
+#define _FTL_OPERATORS_ANTIALIASING_HPP_
+
+#include <ftl/operators/operator.hpp>
+#include <ftl/cuda_common.hpp>
+
+namespace ftl {
+namespace operators {
+
+/**
+ * Operator applying Fast Approximate Anti-Aliasing (FXAA, NVIDIA 2010) to a frame's colour image.
+ */
+class FXAA : public ftl::operators::Operator {
+	public:
+	explicit FXAA(ftl::Configurable *cfg);
+	~FXAA();
+
+	/** Reports this operator as Operator::Type::OneToOne. */
+	Operator::Type type() const override { return Operator::Type::OneToOne; }
+	/** Runs the filter; the implementation in antialiasing.cpp processes the Colour channel of `in` in place. */
+	bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgbd::Source *src, cudaStream_t stream) override;
+};
+
+}
+}
+
+#endif  // _FTL_OPERATORS_ANTIALIASING_HPP_
diff --git a/components/operators/src/antialiasing.cpp b/components/operators/src/antialiasing.cpp
new file mode 100644
index 000000000..d3c63ce1b
--- /dev/null
+++ b/components/operators/src/antialiasing.cpp
@@ -0,0 +1,22 @@
+#include <ftl/operators/antialiasing.hpp>
+#include "antialiasing_cuda.hpp"
+
+using ftl::operators::FXAA;
+using ftl::codecs::Channel;
+
+FXAA::FXAA(ftl::Configurable *cfg) : ftl::operators::Operator(cfg) {
+	// Nothing to set up here: cfg is simply forwarded to the Operator base class.
+}
+
+FXAA::~FXAA() {
+	// The class declares no members of its own, so there is nothing to release.
+}
+
+bool FXAA::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgbd::Source *s, cudaStream_t stream) {
+	// Anti-aliases the Colour channel of `in` in place on `stream`;
+	// `out` and `s` are not used by this operator.
+	auto &colour = in.getTexture<uchar4>(Channel::Colour);
+	ftl::cuda::fxaa(colour, stream);
+	// Always returns true.
+	return true;
+}
diff --git a/components/operators/src/antialiasing.cu b/components/operators/src/antialiasing.cu
new file mode 100644
index 000000000..f0b64bd89
--- /dev/null
+++ b/components/operators/src/antialiasing.cu
@@ -0,0 +1,87 @@
+#include "antialiasing_cuda.hpp"
+
+#define T_PER_BLOCK 8
+// Converts a normalised [0,1] colour to 8 bits: RGB is saturated and rounded to nearest (plain truncation darkens e.g. 254.99998 to 254); alpha is forced to 255.
+__device__ inline uchar4 toChar(const float4 rgba) {
+    return make_uchar4(__saturatef(rgba.x)*255.0f + 0.5f, __saturatef(rgba.y)*255.0f + 0.5f, __saturatef(rgba.z)*255.0f + 0.5f, 255);
+}
+
+__global__ void filter_fxaa2(ftl::cuda::TextureObject<uchar4> data) {
+    // FXAA pass, one thread per pixel: samples the pixel and its diagonal neighbours, blends along a luma-derived direction and writes the result back to data(x,y).
+    int x = blockIdx.x*blockDim.x + threadIdx.x;
+    int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if(x >= data.width() || y >= data.height())
+    {
+        return;  // thread falls outside the image
+    }
+
+    uchar4 out_color;
+    cudaTextureObject_t texRef = data.cudaTexture();
+
+    const float FXAA_SPAN_MAX = 8.0f;  // bound applied to each component of the sampling direction below
+    const float FXAA_REDUCE_MUL = 1.0f/8.0f;
+    const float FXAA_REDUCE_MIN = (1.0f/128.0f);
+    // (u, v): this thread's pixel position offset by half a texel.
+    float u = x + 0.5f;
+    float v = y + 0.5f;
+    // Diagonal neighbours and centre, fetched as float4 (assumes the texture returns normalised floats — TODO confirm at the call sites).
+    float4 rgbNW = tex2D<float4>( texRef, u-1.0f,v-1.0f);
+    float4 rgbNE = tex2D<float4>( texRef, u+1.0f,v-1.0f);
+    float4 rgbSW = tex2D<float4>( texRef, u-1.0f,v+1.0f);
+    float4 rgbSE = tex2D<float4>( texRef, u+1.0f,v+1.0f);
+    float4 rgbM = tex2D<float4>( texRef, u,v);
+    // Luma is a weighted sum of the x/y/z components; w (alpha) has weight 0.
+    const float4 luma = make_float4(0.299f, 0.587f, 0.114f,0.0f);
+    float lumaNW = dot(rgbNW, luma);
+    float lumaNE = dot(rgbNE, luma);
+    float lumaSW = dot(rgbSW, luma);
+    float lumaSE = dot(rgbSE, luma);
+    float lumaM = dot( rgbM, luma);
+    // NOTE(review): the weights look R,G,B ordered; if the image is stored BGRA the first and last are swapped — confirm.
+    float lumaMin = min(lumaM, min(min(lumaNW, lumaNE), min(lumaSW, lumaSE)));
+    float lumaMax = max(lumaM, max(max(lumaNW, lumaNE), max(lumaSW, lumaSE)));
+    // Sampling direction built from the luma differences between the diagonal neighbours.
+    float2 dir;
+    dir.x = -((lumaNW + lumaNE) - (lumaSW + lumaSE));
+    dir.y = ((lumaNW + lumaSW) - (lumaNE + lumaSE));
+    // dirReduce is at least FXAA_REDUCE_MIN, which keeps the divisor below strictly positive.
+    float dirReduce = max((lumaNW + lumaNE + lumaSW + lumaSE) * (0.25f * FXAA_REDUCE_MUL), FXAA_REDUCE_MIN);
+
+    float rcpDirMin = 1.0f/(min(abs(dir.x), abs(dir.y)) + dirReduce);
+    // Rescale the direction by rcpDirMin, then clamp it to [-FXAA_SPAN_MAX, FXAA_SPAN_MAX].
+
+    float2 test = dir * rcpDirMin;
+    dir = clamp(test,-FXAA_SPAN_MAX,FXAA_SPAN_MAX);
+    // rgbA: mean of two samples at -1/6 and +1/6 of dir. rgbB: half rgbA plus a quarter each of samples at -1/2 and +1/2 of dir.
+    float4 rgbA = (1.0f/2.0f) * (
+                tex2D<float4>( texRef,u+ dir.x * (1.0f/3.0f - 0.5f),v+ dir.y * (1.0f/3.0f - 0.5f))+
+                tex2D<float4>( texRef,u+ dir.x * (2.0f/3.0f - 0.5f),v+ dir.y * (2.0f/3.0f - 0.5f)));
+    float4 rgbB = rgbA * (1.0f/2.0f) + (1.0f/4.0f) * (
+                tex2D<float4>( texRef,u+ dir.x * (0.0f/3.0f - 0.5f),v+ dir.y * (0.0f/3.0f - 0.5f))+
+                tex2D<float4>( texRef,u+ dir.x * (3.0f/3.0f - 0.5f),v+ dir.y * (3.0f/3.0f - 0.5f)));
+    float lumaB = dot(rgbB, luma);
+    // Keep the wider blend rgbB only when its luma stays within [lumaMin, lumaMax]; otherwise fall back to rgbA.
+    if((lumaB < lumaMin) || (lumaB > lumaMax)){
+        out_color=toChar(rgbA);
+    } else {
+        out_color=toChar(rgbB);
+    }
+    // NOTE(review): the result is written into the same object other threads are still sampling through texRef;
+    // if both refer to the same memory, reads may observe already-filtered pixels — confirm whether a separate output is needed.
+    //surf2Dwrite<uchar4>(out_color, surfaceWrite, x*sizeof(uchar4), y);
+
+    data(x,y) = out_color;
+}
+
+void ftl::cuda::fxaa(ftl::cuda::TextureObject<uchar4> &colour, cudaStream_t stream) {
+	// One thread per pixel in T_PER_BLOCK x T_PER_BLOCK tiles; the grid is rounded up so edge pixels are covered.
+	const dim3 threads(T_PER_BLOCK, T_PER_BLOCK);
+	const dim3 blocks((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	filter_fxaa2<<<blocks, threads, 0, stream>>>(colour);
+	cudaSafeCall(cudaGetLastError());  // check the launch itself
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());  // _DEBUG builds only: wait for the device and check the result
+#endif
+}
diff --git a/components/operators/src/antialiasing_cuda.hpp b/components/operators/src/antialiasing_cuda.hpp
new file mode 100644
index 000000000..afe2c5246
--- /dev/null
+++ b/components/operators/src/antialiasing_cuda.hpp
@@ -0,0 +1,14 @@
+#ifndef _FTL_CUDA_ANTIALIASING_HPP_
+#define _FTL_CUDA_ANTIALIASING_HPP_
+
+#include <ftl/cuda_common.hpp>
+
+namespace ftl {
+namespace cuda {
+/** Applies the FXAA filter to `colour` in place, launching the CUDA kernel on `stream`. */
+void fxaa(ftl::cuda::TextureObject<uchar4> &colour, cudaStream_t stream);
+
+}
+}
+
+#endif
diff --git a/components/operators/src/colours.cpp b/components/operators/src/colours.cpp
index 474b783c4..9c6fff8b8 100644
--- a/components/operators/src/colours.cpp
+++ b/components/operators/src/colours.cpp
@@ -24,5 +24,8 @@ bool ColourChannels::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ftl::rgb
 		cv::cuda::cvtColor(temp_,col, cv::COLOR_BGR2BGRA, 0, cvstream);
 	}
 
+	//in.resetTexture(Channel::Colour);
+	in.createTexture<uchar4>(Channel::Colour, true);
+
 	return true;
 }
diff --git a/components/operators/src/mls.cu b/components/operators/src/mls.cu
index d7770f134..70d801f2f 100644
--- a/components/operators/src/mls.cu
+++ b/components/operators/src/mls.cu
@@ -135,7 +135,7 @@ void ftl::cuda::mls_smooth(
 	if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
 	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
 
-	uchar4 c0 = colour_in.tex2D(x, y);
+	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
 
     // Neighbourhood
     for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
@@ -149,7 +149,7 @@ void ftl::cuda::mls_smooth(
 
 		if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
 
-		const uchar4 c = colour_in.tex2D(x+u, y+v);
+		const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
 		const float cw = ftl::cuda::colourWeighting(c0,c,colour_smoothing);
 
 		// Gauss approx weighting function using point distance
@@ -259,7 +259,7 @@ __device__ inline int segmentID(int u, int v) {
 	}
 	float3 X = camera.screenToCam((int)(x),(int)(y),d0);
 
-	uchar4 c0 = colour_in.tex2D(x, y);
+	float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
 
     // Neighbourhood
 	uchar4 base = region.tex2D(x,y);
@@ -281,7 +281,7 @@ __device__ inline int segmentID(int u, int v) {
 
 			if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
 
-			const uchar4 c = colour_in.tex2D(x+u, y+v);
+			const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
 			const float cw = ftl::cuda::colourWeighting(c0,c,colour_smoothing);
 
 			// Allow missing point to borrow z value
diff --git a/components/operators/src/segmentation.cu b/components/operators/src/segmentation.cu
index baeda1023..3bfcbc0f5 100644
--- a/components/operators/src/segmentation.cu
+++ b/components/operators/src/segmentation.cu
@@ -12,6 +12,11 @@ __device__ inline float cross<uchar4>(uchar4 p1, uchar4 p2) {
     return max(max(__sad(p1.x,p2.x,0),__sad(p1.y,p2.y,0)), __sad(p1.z,p2.z,0));
 }
 
+// float4 variant: largest absolute difference over the x, y and z components (w is ignored).
+template <> __device__ inline float cross<float4>(float4 p1, float4 p2) {
+    return fmaxf(fmaxf(fabsf(p1.x - p2.x), fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z));
+}
+
 template <>
 __device__ inline float cross<float>(float p1, float p2) {
     return fabs(p1-p2);
@@ -26,12 +31,12 @@ __device__ uchar4 calculate_support_region(const TextureObject<T> &img, int x, i
 
 	uchar4 result = make_uchar4(0, 0, 0, 0);
 
-	T colour = img.tex2D(x,y);
-	T prev_colour = colour;
+	auto colour = img.tex2D((float)x+0.5f,(float)y+0.5f);
+	auto prev_colour = colour;
 
 	int u;
     for (u=x-1; u >= x_min; --u) {
-		T next_colour = img.tex2D(u,y);
+		auto next_colour = img.tex2D((float)u+0.5f,(float)y+0.5f);
         if (cross(prev_colour, next_colour) > tau) {
             result.x = x - u - 1;
             break;
@@ -42,7 +47,7 @@ __device__ uchar4 calculate_support_region(const TextureObject<T> &img, int x, i
 	
 	prev_colour = colour;
     for (u=x+1; u <= x_max; ++u) {
-		T next_colour = img.tex2D(u,y);
+		auto next_colour = img.tex2D((float)u+0.5f,(float)y+0.5f);
         if (cross(prev_colour, next_colour) > tau) {
             result.y = u - x - 1;
             break;
@@ -54,7 +59,7 @@ __device__ uchar4 calculate_support_region(const TextureObject<T> &img, int x, i
 	int v;
 	prev_colour = colour;
     for (v=y-1; v >= y_min; --v) {
-		T next_colour = img.tex2D(x,v);
+		auto next_colour = img.tex2D((float)x+0.5f,(float)v+0.5f);
         if (cross(prev_colour, next_colour) > tau) {
             result.z = y - v - 1;
             break;
@@ -65,7 +70,7 @@ __device__ uchar4 calculate_support_region(const TextureObject<T> &img, int x, i
 
 	prev_colour = colour;
     for (v=y+1; v <= y_max; ++v) {
-		T next_colour = img.tex2D(x,v);
+		auto next_colour = img.tex2D((float)x+0.5f,(float)v+0.5f);
         if (cross(prev_colour, next_colour) > tau) {
             result.w = v - y - 1;
             break;
@@ -147,10 +152,10 @@ __global__ void vis_support_region_kernel(TextureObject<uchar4> colour, TextureO
 
 		for (int u=-baseY.x; u<=baseY.y; ++u) {
 			if (x+u < 0 || y+v < 0 || x+u >= colour.width() || y+v >= colour.height()) continue;
-			uchar4 col = colour.tex2D(x+u, y+v);
+			auto col = colour.tex2D(float(x+u)+0.5f, float(y+v)+0.5f);
 			colour(x+u, y+v) = (u==0 || v == 0) ?
-					make_uchar4(max(bcolour.x, col.x), max(bcolour.y, col.y), max(bcolour.z, col.z), 0) :
-					make_uchar4(max(acolour.x, col.x), max(acolour.y, col.y), max(acolour.z, col.z), 0);
+					make_uchar4(max(bcolour.x, (unsigned char)col.x), max(bcolour.y, (unsigned char)col.y), max(bcolour.z, (unsigned char)col.z), 0) :
+					make_uchar4(max(acolour.x, (unsigned char)col.x), max(acolour.y, (unsigned char)col.y), max(acolour.z, (unsigned char)col.z), 0);
 		}
 	}
 }
@@ -198,7 +203,7 @@ __global__ void vis_bad_region_kernel(
 
 	uchar4 base = region.tex2D(x,y);
 	uchar4 baseD = dregion.tex2D(x,y);
-	uchar4 col = colour.tex2D(x,y);
+	auto col = colour.tex2D((float)x+0.5f,(float)y+0.5f);
 	float d = depth.tex2D(x,y);
 
 	if (baseD.x > base.x && baseD.y < base.y) {
diff --git a/components/renderers/cpp/include/ftl/cuda/weighting.hpp b/components/renderers/cpp/include/ftl/cuda/weighting.hpp
index bffff673d..b0c3f58b9 100644
--- a/components/renderers/cpp/include/ftl/cuda/weighting.hpp
+++ b/components/renderers/cpp/include/ftl/cuda/weighting.hpp
@@ -44,6 +44,20 @@ __device__ inline float colourDistance(uchar4 a, uchar4 b) {
 	return ch*ch*ch*ch;
 }
 
+/*
+ * Float colour weighting, following C. Kuster et al., "Spatio-Temporal Geometry Fusion for Multiple
+ * Hybrid Cameras using Moving Least Squares Surfaces", 2014. With d = length() of the x/y/z difference
+ * between a and b (w is ignored), returns (1 - (d/h)^2)^4 when d < h and 0 when d >= h.
+ */
+__device__ inline float colourWeighting(const float4 &a, const float4 &b, float h) {
+	const float dist = length(make_float3(a.x - b.x, a.y - b.y, a.z - b.z));
+	if (dist >= h) return 0.0f;
+
+	const float ratio = dist / h;
+	const float falloff = 1.0f - ratio*ratio;
+	return falloff*falloff*falloff*falloff;
+}
+
 }
 }
 
diff --git a/components/renderers/cpp/src/reprojection.cu b/components/renderers/cpp/src/reprojection.cu
index e74b4bb60..f1a9022a5 100644
--- a/components/renderers/cpp/src/reprojection.cu
+++ b/components/renderers/cpp/src/reprojection.cu
@@ -80,7 +80,7 @@ __global__ void reprojection_kernel(
 	const float3 camPos = poseInv * worldPos;
 	if (camPos.z < camera.minDepth) return;
 	if (camPos.z > camera.maxDepth) return;
-	const uint2 screenPos = camera.camToScreen<uint2>(camPos);
+	const float2 screenPos = camera.camToScreen<float2>(camPos);
 
 	// Not on screen so stop now...
 	if (screenPos.x >= depth_src.width() || screenPos.y >= depth_src.height()) return;
@@ -91,8 +91,8 @@ __global__ void reprojection_kernel(
 	ray = ray / length(ray);
 	const float dotproduct = max(dot(ray,n),0.0f);
     
-	const float d2 = depth_src.tex2D((int)screenPos.x, (int)screenPos.y);
-	const A input = in.tex2D((int)screenPos.x, (int)screenPos.y); //generateInput(in.tex2D((int)screenPos.x, (int)screenPos.y), params, worldPos);
+	const float d2 = depth_src.tex2D(int(screenPos.x+0.5f), int(screenPos.y+0.5f));
+	const auto input = in.tex2D(screenPos.x, screenPos.y); //generateInput(in.tex2D((int)screenPos.x, (int)screenPos.y), params, worldPos);
 
 	// TODO: Z checks need to interpolate between neighbors if large triangles are used
 	float weight = ftl::cuda::weighting(fabs(camPos.z - d2), 0.02f);
diff --git a/components/renderers/cpp/src/tri_render.cpp b/components/renderers/cpp/src/tri_render.cpp
index 6d60a5f66..72e4f0546 100644
--- a/components/renderers/cpp/src/tri_render.cpp
+++ b/components/renderers/cpp/src/tri_render.cpp
@@ -411,6 +411,7 @@ bool Triangular::render(ftl::rgbd::VirtualSource *src, ftl::rgbd::Frame &out) {
 	
 	out.create<GpuMat>(Channel::Depth, Format<float>(camera.width, camera.height));
 	out.create<GpuMat>(Channel::Colour, Format<uchar4>(camera.width, camera.height));
+	out.createTexture<uchar4>(Channel::Colour, true);  // Force interpolated colour
 
 
 	if (scene_->frames.size() == 0) return false;
@@ -504,7 +505,7 @@ bool Triangular::render(ftl::rgbd::VirtualSource *src, ftl::rgbd::Frame &out) {
 
 		if (chan == Channel::Normals) {
 			// Convert normal to single float value
-			temp_.create<GpuMat>(Channel::Colour, Format<uchar4>(camera.width, camera.height)).setTo(cv::Scalar(0,0,0,0), cvstream);
+			temp_.create<GpuMat>(Channel::Colour, Format<uchar4>(out.get<GpuMat>(Channel::Colour).size())).setTo(cv::Scalar(0,0,0,0), cvstream);
 			ftl::cuda::normal_visualise(scene_->frames[aligned_source].getTexture<float4>(Channel::Normals), temp_.createTexture<uchar4>(Channel::Colour),
 					light_pos_,
 					light_diffuse_,
@@ -532,15 +533,19 @@ bool Triangular::render(ftl::rgbd::VirtualSource *src, ftl::rgbd::Frame &out) {
 	{
 		// Just convert int depth to float depth
 		//temp_.get<GpuMat>(Channel::Depth2).convertTo(out.get<GpuMat>(Channel::Depth), CV_32F, 1.0f / 100000.0f, cvstream);
-	} else if (chan == Channel::Normals) {
+	} else if (chan == Channel::ColourNormals) {
 		// Visualise normals to RGBA
-		accum_.create<GpuMat>(Channel::Normals, Format<uchar4>(camera.width, camera.height)).setTo(cv::Scalar(0,0,0,0), cvstream);
-		ftl::cuda::normal_visualise(out.getTexture<float4>(Channel::Normals), accum_.createTexture<uchar4>(Channel::Normals),
+		out.create<GpuMat>(Channel::ColourNormals, Format<uchar4>(camera.width, camera.height)).setTo(cv::Scalar(0,0,0,0), cvstream);
+
+		ftl::cuda::normal_visualise(out.getTexture<float4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals),
 				light_pos_,
 				light_diffuse_,
 				light_ambient_, stream_);
 
-		accum_.swapTo(Channels(Channel::Normals), out);
+		//accum_.swapTo(Channels(Channel::Normals), out);
+		//cv::cuda::swap(accum_.get<GpuMat>(Channel::Normals), out.get<GpuMat>(Channel::Normals));
+		//out.resetTexture(Channel::Normals);
+		//accum_.resetTexture(Channel::Normals);
 	}
 	//else if (chan == Channel::Contribution)
 	//{
diff --git a/components/rgbd-sources/include/ftl/rgbd/frame.hpp b/components/rgbd-sources/include/ftl/rgbd/frame.hpp
index 52bbe9022..d7b8292ff 100644
--- a/components/rgbd-sources/include/ftl/rgbd/frame.hpp
+++ b/components/rgbd-sources/include/ftl/rgbd/frame.hpp
@@ -81,14 +81,14 @@ public:
 	 * argument to also create (or recreate) the associated GpuMat.
 	 */
 	template <typename T>
-	ftl::cuda::TextureObject<T> &createTexture(ftl::codecs::Channel c, const ftl::rgbd::Format<T> &f);
+	ftl::cuda::TextureObject<T> &createTexture(ftl::codecs::Channel c, const ftl::rgbd::Format<T> &f, bool interpolated=false);
 
 	/**
 	 * Create a CUDA texture object for a channel. With this version the GpuMat
 	 * must already exist and be of the correct type.
 	 */
 	template <typename T>
-	ftl::cuda::TextureObject<T> &createTexture(ftl::codecs::Channel c);
+	ftl::cuda::TextureObject<T> &createTexture(ftl::codecs::Channel c, bool interpolated=false);
 
 	void resetTexture(ftl::codecs::Channel c);
 
@@ -200,7 +200,7 @@ ftl::cuda::TextureObject<T> &Frame::getTexture(ftl::codecs::Channel c) {
 }
 
 template <typename T>
-ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, const ftl::rgbd::Format<T> &f) {
+ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, const ftl::rgbd::Format<T> &f, bool interpolated) {
 	if (!channels_.has(c)) channels_ += c;
 	if (!gpu_.has(c)) gpu_ += c;
 
@@ -221,18 +221,18 @@ ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, const
 
 	if (m.tex.devicePtr() == nullptr) {
 		//LOG(INFO) << "Creating texture object";
-		m.tex = ftl::cuda::TextureObject<T>(m.gpu);
+		m.tex = ftl::cuda::TextureObject<T>(m.gpu, interpolated);
 	} else if (m.tex.cvType() != ftl::traits::OpenCVType<T>::value || m.tex.width() != m.gpu.cols || m.tex.height() != m.gpu.rows) {
 		LOG(INFO) << "Recreating texture object for '" << ftl::codecs::name(c) << "'";
 		m.tex.free();
-		m.tex = ftl::cuda::TextureObject<T>(m.gpu);
+		m.tex = ftl::cuda::TextureObject<T>(m.gpu, interpolated);
 	}
 
 	return ftl::cuda::TextureObject<T>::cast(m.tex);
 }
 
 template <typename T>
-ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c) {
+ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c, bool interpolated) {
 	if (!channels_.has(c)) throw ftl::exception("createTexture needs a format if the channel does not exist");
 
 	auto &m = _get(c);
@@ -254,11 +254,11 @@ ftl::cuda::TextureObject<T> &Frame::createTexture(ftl::codecs::Channel c) {
 
 	if (m.tex.devicePtr() == nullptr) {
 		//LOG(INFO) << "Creating texture object";
-		m.tex = ftl::cuda::TextureObject<T>(m.gpu);
+		m.tex = ftl::cuda::TextureObject<T>(m.gpu, interpolated);
 	} else if (m.tex.cvType() != ftl::traits::OpenCVType<T>::value || m.tex.width() != m.gpu.cols || m.tex.height() != m.gpu.rows || m.tex.devicePtr() != m.gpu.data) {
 		LOG(INFO) << "Recreating texture object for '" << ftl::codecs::name(c) << "'.";
 		m.tex.free();
-		m.tex = ftl::cuda::TextureObject<T>(m.gpu);
+		m.tex = ftl::cuda::TextureObject<T>(m.gpu, interpolated);
 	}
 
 	return ftl::cuda::TextureObject<T>::cast(m.tex);
-- 
GitLab