From 1848ef73889ff9a47f1666b2b2e8addebf24a31a Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Fri, 27 Mar 2020 11:23:16 +0200
Subject: [PATCH] Resolves #329 optical flow

---
 .../codecs/include/ftl/codecs/channels.hpp    |   3 +-
 components/operators/CMakeLists.txt           |   1 +
 .../operators/include/ftl/cuda/fixed.hpp      |  14 +++
 .../include/ftl/operators/cuda/disparity.hpp  |   2 +
 .../include/ftl/operators/opticalflow.hpp     |   8 +-
 components/operators/src/colours.cpp          |   3 +
 components/operators/src/depth.cpp            |   8 +-
 .../operators/src/disparity/disp2depth.cu     |  29 +++++
 .../operators/src/disparity/fixstars_sgm.cpp  |   7 +-
 .../src/disparity/optflow_smoothing.cu        |   4 +-
 components/operators/src/nvopticalflow.cpp    |  68 +++++++++---
 components/operators/src/opticalflow.cu       | 102 ++++++++++++++++++
 components/operators/src/opticalflow_cuda.hpp |  19 ++++
 .../src/sources/stereovideo/stereovideo.cpp   |   2 +-
 lib/libsgm/src/horizontal_path_aggregation.cu |   2 +-
 15 files changed, 247 insertions(+), 25 deletions(-)
 create mode 100644 components/operators/include/ftl/cuda/fixed.hpp
 create mode 100644 components/operators/src/opticalflow.cu
 create mode 100644 components/operators/src/opticalflow_cuda.hpp

diff --git a/components/codecs/include/ftl/codecs/channels.hpp b/components/codecs/include/ftl/codecs/channels.hpp
index 7cd5827f3..43ef166a2 100644
--- a/components/codecs/include/ftl/codecs/channels.hpp
+++ b/components/codecs/include/ftl/codecs/channels.hpp
@@ -22,7 +22,8 @@ enum struct Channel : int {
 	Confidence		= 7,	// 32F
 	Contribution	= 7,	// 32F
 	EnergyVector	= 8,	// 32FC4
-	Flow			= 9,	// 32F
+	Flow			= 9,	// 16SC2
+	Flow2			= 10,	// 16SC2
 	Energy			= 10,	// 32F
 	Mask			= 11,	// 32U
 	Density			= 12,	// 32F
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index 0fd5466ae..becb9f60b 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -42,6 +42,7 @@ endif (HAVE_LIBSGM)
 if (HAVE_OPTFLOW)
 	list(APPEND OPERSRC
 		src/nvopticalflow.cpp
+		src/opticalflow.cu
 		src/disparity/optflow_smoothing.cu
 		src/disparity/optflow_smoothing.cpp)
 endif()
diff --git a/components/operators/include/ftl/cuda/fixed.hpp b/components/operators/include/ftl/cuda/fixed.hpp
new file mode 100644
index 000000000..efb0c09ca
--- /dev/null
+++ b/components/operators/include/ftl/cuda/fixed.hpp
@@ -0,0 +1,14 @@
+#ifndef _FTL_CUDA_FIXED_HPP_
+#define _FTL_CUDA_FIXED_HPP_
+
+template <int FRAC>  // FRAC: number of fractional bits in the fixed-point input
+__device__ inline float fixed2float(short v) {
+    return float(v) / float(1 << FRAC);  // divide in float: integer division discarded the fractional bits
+}
+
+template <int FRAC>  // FRAC: number of fractional bits in the fixed-point result
+__device__ inline short float2fixed(float v) {
+    return static_cast<short>(v * static_cast<float>(1 << FRAC));  // scale by 2^FRAC, then convert to short
+}
+
+#endif
\ No newline at end of file
diff --git a/components/operators/include/ftl/operators/cuda/disparity.hpp b/components/operators/include/ftl/operators/cuda/disparity.hpp
index 75034d9e1..4f421969c 100644
--- a/components/operators/include/ftl/operators/cuda/disparity.hpp
+++ b/components/operators/include/ftl/operators/cuda/disparity.hpp
@@ -29,6 +29,8 @@ void show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, const cv::cu
 void show_disp_density(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left,
 			float scale, cudaStream_t stream);
 
+void merge_disparities(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &estimate, cudaStream_t stream);
+
 
 void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
 					cv::cuda::GpuMat &history, cv::cuda::GpuMat &support, int n_max, float threshold, bool fill,
diff --git a/components/operators/include/ftl/operators/opticalflow.hpp b/components/operators/include/ftl/operators/opticalflow.hpp
index 43edf6b84..8ee77736d 100644
--- a/components/operators/include/ftl/operators/opticalflow.hpp
+++ b/components/operators/include/ftl/operators/opticalflow.hpp
@@ -13,7 +13,7 @@ namespace operators {
 class NVOpticalFlow : public ftl::operators::Operator {
 	public:
 	explicit NVOpticalFlow(ftl::Configurable*);
-	NVOpticalFlow(ftl::Configurable*, const std::tuple<ftl::codecs::Channel,ftl::codecs::Channel> &channels);
+	NVOpticalFlow(ftl::Configurable*, const std::tuple<ftl::codecs::Channel,ftl::codecs::Channel,ftl::codecs::Channel,ftl::codecs::Channel> &channels);
 	~NVOpticalFlow();
 
 	inline Operator::Type type() const override { return Operator::Type::OneToOne; }
@@ -27,12 +27,14 @@ class NVOpticalFlow : public ftl::operators::Operator {
 	cv::Size size_;
 	
 	// TODO: Colour to Flow always assumed, could also calculate something else?
-	ftl::codecs::Channel channel_in_;
-	ftl::codecs::Channel channel_out_;
+	ftl::codecs::Channel channel_in_[2];
+	ftl::codecs::Channel channel_out_[2];
 
 	cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof_;
 	cv::cuda::GpuMat left_gray_;
 	cv::cuda::GpuMat left_gray_prev_;
+	cv::cuda::GpuMat right_gray_;
+	cv::cuda::GpuMat right_gray_prev_;
 };
 
 }
diff --git a/components/operators/src/colours.cpp b/components/operators/src/colours.cpp
index 2c102482e..8ca2c62bf 100644
--- a/components/operators/src/colours.cpp
+++ b/components/operators/src/colours.cpp
@@ -45,6 +45,9 @@ bool ColourChannels::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStre
 
 	//in.resetTexture(Channel::Colour);
 	in.createTexture<uchar4>(Channel::Colour, true);
+	if (in.hasChannel(Channel::Right)) {
+		in.createTexture<uchar4>(Channel::Right, true);
+	}
 
 	if (in.hasChannel(Channel::Depth)) {
 		auto &depth = in.get<cv::cuda::GpuMat>(Channel::Depth);
diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp
index 5ffb6bbaf..c3749419b 100644
--- a/components/operators/src/depth.cpp
+++ b/components/operators/src/depth.cpp
@@ -140,13 +140,13 @@ void DepthChannel::_createPipeline(size_t size) {
 
 	pipe_->append<ftl::operators::ColourChannels>("colour");  // Convert BGR to BGRA
 	pipe_->append<ftl::operators::CrossSupport>("cross");
+	#ifdef HAVE_OPTFLOW
+	pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow, Channel::Colour2, Channel::Flow2);
+	//if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
+	#endif
 	#ifdef HAVE_LIBSGM
 	pipe_->append<ftl::operators::FixstarsSGM>("algorithm");
 	#endif
-	#ifdef HAVE_OPTFLOW
-	pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow);
-	if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
-	#endif
 	pipe_->append<ftl::operators::DisparityBilateralFilter>("bilateral_filter");
 	//pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
 	pipe_->append<ftl::operators::DisparityToDepth>("calculate_depth");
diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu
index bed3bc3e8..b8f878ac0 100644
--- a/components/operators/src/disparity/disp2depth.cu
+++ b/components/operators/src/disparity/disp2depth.cu
@@ -3,6 +3,7 @@
 #include <opencv2/core/cuda_stream_accessor.hpp>
 #include <ftl/operators/cuda/disparity.hpp>
 #include <ftl/operators/cuda/mask.hpp>
+#include <ftl/cuda/fixed.hpp>
 
 #ifndef PINF
 #define PINF __int_as_float(0x7f800000)
@@ -208,6 +209,34 @@ void ftl::cuda::show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, c
 // =============================================================================
 
 
+__global__ void merge_disp_kernel(cv::cuda::PtrStepSz<short> disp,
+	cv::cuda::PtrStepSz<short> estimate)
+{
+	for (STRIDE_Y(v,disp.rows)) {
+	for (STRIDE_X(u,disp.cols)) {
+		const short raw = disp(v,u);
+		const float est = fixed2float<4>(estimate(v,u));
+		// Raw values of 4096 or more are treated as zero before conversion.
+		const float cur = fixed2float<4>((raw < 4096) ? raw : 0);
+		// Keep the value only when an estimate exists and differs from it by at most 4.0.
+		const bool keep = (est != 0.0f) && !(fabsf(cur-est) > 4.0f);
+		disp(v,u) = float2fixed<4>(keep ? cur : 0.0f);
+	}
+	}
+}
+
+void ftl::cuda::merge_disparities(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &estimate, cudaStream_t stream) {
+	// Launch merge_disp_kernel over disp with 128x4 thread blocks on the given stream.
+	const dim3 block_dim(128, 4, 1);
+	const dim3 grid_dim(cv::cuda::device::divUp(disp.cols, 128),
+		cv::cuda::device::divUp(disp.rows, 4), 1);
+	merge_disp_kernel<<<grid_dim, block_dim, 0, stream>>>(disp, estimate);
+	cudaSafeCall( cudaGetLastError() );  // hand the last CUDA error to cudaSafeCall
+}
+
+// =============================================================================
+
+
 template <int MAX_DISP>
 __global__ void show_disp_density_kernel(cv::cuda::PtrStepSz<short> disp,
 	cv::cuda::PtrStepSz<uchar4> left,
diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index b55a7af6d..0a8bdf0dd 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -155,7 +155,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		if (!init()) { return false; }
 	}
 
-	auto &disp = out.create<GpuMat>(Channel::Disparity, Format<short>(l.size()));
+	bool has_estimate = in.hasChannel(Channel::Disparity);
+	auto &disp = (!has_estimate) ? out.create<GpuMat>(Channel::Disparity, Format<short>(l.size())) : in.get<GpuMat>(Channel::Disparity);
 
 	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
 	cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
@@ -169,6 +170,10 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
 	// left_pixels.setTo(0);
 
+	if (config()->value("merge_estimates", false) && has_estimate) {
+		ftl::cuda::merge_disparities(disp_int_, disp, stream);
+	}
+
 	cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
 
 	if (config()->value("check_reprojection", false)) {
diff --git a/components/operators/src/disparity/optflow_smoothing.cu b/components/operators/src/disparity/optflow_smoothing.cu
index c30302b85..c85c22cd2 100644
--- a/components/operators/src/disparity/optflow_smoothing.cu
+++ b/components/operators/src/disparity/optflow_smoothing.cu
@@ -50,7 +50,7 @@ __global__ void temporal_median_filter_kernel(
 
 		float t = area * threshold + 0.25f;  // 0.25 is the 1/4 pixel accuracy NVIDIA claim
 
-		if (max(abs(flow.x),abs(flow.y)) > makeFixed<5>(t))
+		if (sqrt(float(flow.x*flow.x) + float(flow.y*flow.y)) > makeFixed<5>(t))  // max(abs(flow.x),abs(flow.y))
 		{
 			// TODO: Perhaps rather than discard it could follow the optical flow
 			// This would require the generation of a depth flow also.
@@ -73,7 +73,7 @@ __global__ void temporal_median_filter_kernel(
 
 		const float disparity = disp(y, x);
 
-		if (isValidDisparity(disparity))
+		//if (isValidDisparity(disparity))
 		{
 			history(y, (x + 1) * n_max - 1) += 1.0;
 			count++;
diff --git a/components/operators/src/nvopticalflow.cpp b/components/operators/src/nvopticalflow.cpp
index 422fc3040..2426c1019 100644
--- a/components/operators/src/nvopticalflow.cpp
+++ b/components/operators/src/nvopticalflow.cpp
@@ -1,5 +1,8 @@
 #include <ftl/operators/opticalflow.hpp>
 #include <ftl/exception.hpp>
+#include <ftl/operators/cuda/disparity.hpp>
+
+#include "opticalflow_cuda.hpp"
 
 #include <opencv2/cudaimgproc.hpp>
 
@@ -16,14 +19,17 @@ using cv::Size;
 using cv::cuda::GpuMat;
 
 NVOpticalFlow::NVOpticalFlow(ftl::Configurable* cfg) :
-		ftl::operators::Operator(cfg), channel_in_(ftl::codecs::Channel::Colour), channel_out_(ftl::codecs::Channel::Flow) {
+		ftl::operators::Operator(cfg), channel_in_{ftl::codecs::Channel::Colour,ftl::codecs::Channel::Colour2}, channel_out_{ftl::codecs::Channel::Flow,ftl::codecs::Channel::Flow2} {  // defaults: Colour/Colour2 as inputs, Flow/Flow2 as outputs
 	size_ = Size(0, 0);
 
 }
 
-NVOpticalFlow::NVOpticalFlow(ftl::Configurable*cfg, const std::tuple<ftl::codecs::Channel,ftl::codecs::Channel> &channels) : ftl::operators::Operator(cfg) {
-	channel_in_ = std::get<0>(channels);
-	channel_out_ = std::get<1>(channels);
+NVOpticalFlow::NVOpticalFlow(ftl::Configurable*cfg, const std::tuple<ftl::codecs::Channel,ftl::codecs::Channel,ftl::codecs::Channel,ftl::codecs::Channel> &channels) : ftl::operators::Operator(cfg) {
+	channel_in_[0] = std::get<0>(channels);  // tuple order is: input 0, output 0, input 1, output 1
+	channel_out_[0] = std::get<1>(channels);
+	channel_in_[1] = std::get<2>(channels);
+	channel_out_[1] = std::get<3>(channels);
+	size_ = Size(0, 0);  // same initial size as the single-argument constructor
 }
 
 NVOpticalFlow::~NVOpticalFlow() {
@@ -43,27 +49,65 @@ bool NVOpticalFlow::init() {
 	
 	left_gray_.create(size_, CV_8UC1);
 	left_gray_prev_.create(size_, CV_8UC1);
+	right_gray_.create(size_, CV_8UC1);
+	right_gray_prev_.create(size_, CV_8UC1);
 	return true;
 }
 
 bool NVOpticalFlow::apply(Frame &in, Frame &out, cudaStream_t stream) {
-	if (!in.hasChannel(channel_in_)) return false;
-	if (in.hasChannel(channel_out_)) return true;
+	bool both_channels = config()->value("both_channels", false);  // also compute the reverse flow (second input -> first)
+	float scale = config()->value("viz_scale", 200.0f);  // divisor passed to show_optical_flow
+	// Early exits: a missing input fails, an already-present output means there is nothing to do.
+	if (!in.hasChannel(channel_in_[0])) return false;
+	if (in.hasChannel(channel_out_[0])) return true;
+	// The second input is always read below (cvtColor + calc), so require it even when
+	// both_channels is off rather than fetching a channel that may not exist.
+	if (!in.hasChannel(channel_in_[1])) return false;
+	if (both_channels && in.hasChannel(channel_out_[1])) return true;
 
-	if (in.get<GpuMat>(channel_in_).size() != size_) {
-		size_ = in.get<GpuMat>(channel_in_).size();
+	if (in.get<GpuMat>(channel_in_[0]).size() != size_) {
+		size_ = in.get<GpuMat>(channel_in_[0]).size();
 		if (!init()) return false;
 	}
 	
 	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
-	auto &flow = out.create<GpuMat>(channel_out_);
+	auto &flow1 = out.create<GpuMat>(channel_out_[0]);
 
-	cv::cuda::cvtColor(in.get<GpuMat>(channel_in_), left_gray_, cv::COLOR_BGRA2GRAY, 0, cvstream);
+	cv::cuda::cvtColor(in.get<GpuMat>(channel_in_[0]), left_gray_, cv::COLOR_BGRA2GRAY, 0, cvstream);
+	cv::cuda::cvtColor(in.get<GpuMat>(channel_in_[1]), right_gray_, cv::COLOR_BGRA2GRAY, 0, cvstream);
 
 	// TODO: Use optical flow confidence output, perhaps combined with a
 	// sensitivity adjustment
-	nvof_->calc(left_gray_, left_gray_prev_, flow, cvstream);
-	std::swap(left_gray_, left_gray_prev_);
+	nvof_->calc(left_gray_, right_gray_, flow1, cvstream);  // flow between the two current inputs, not between successive frames
+	//std::swap(left_gray_, left_gray_prev_);
+
+	if (both_channels) {
+		auto &flow2 = out.create<GpuMat>(channel_out_[1]);
+		nvof_->calc(right_gray_, left_gray_, flow2, cvstream);
+		//std::swap(right_gray_, right_gray_prev_);
+	}
+
+	if (config()->value("show_flow", false)) {
+		ftl::cuda::show_optical_flow(flow1, in.getTexture<uchar4>(channel_in_[0]), scale, stream);
+		if (both_channels)
+			ftl::cuda::show_optical_flow(out.get<GpuMat>(channel_out_[1]), in.getTexture<uchar4>(channel_in_[1]), scale, stream);
+	}
+	// disparity_from_flow reads the second flow field, so only run it when that field exists in out.
+	if (config()->value("generate_disparity", false) && out.hasChannel(channel_out_[1])) {
+		if (!in.hasChannel(Channel::Disparity)) {
+			in.create<GpuMat>(Channel::Disparity).create(size_, CV_16SC1);
+		}
+		ftl::cuda::disparity_from_flow(
+			flow1,
+			out.get<GpuMat>(channel_out_[1]),
+			in.createTexture<short>(Channel::Disparity), stream);
+	}
+
+	if (config()->value("check_reprojection", false)) {
+		ftl::cuda::check_reprojection(in.get<cv::cuda::GpuMat>(Channel::Disparity), in.getTexture<uchar4>(Channel::Colour),
+			in.createTexture<uchar4>(Channel::Colour2, true),
+			stream);
+	}
 
 	return true;
 }
\ No newline at end of file
diff --git a/components/operators/src/opticalflow.cu b/components/operators/src/opticalflow.cu
new file mode 100644
index 000000000..0876fe1fa
--- /dev/null
+++ b/components/operators/src/opticalflow.cu
@@ -0,0 +1,102 @@
+#include "opticalflow_cuda.hpp"
+#include <ftl/cuda/fixed.hpp>
+
+#define T_PER_BLOCK 8
+
+using ftl::cuda::TextureObject;
+
+__global__ void show_optflow_kernel(
+        cv::cuda::PtrStepSz<short2> optflow,
+        TextureObject<uchar4> colour, float scale) {
+    // Adds the magnitude of the scaled, clamped flow vector to the x and z channels of each colour pixel.
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+    if (x >= colour.width() || y >= colour.height()) return;
+
+    const short2 vec = optflow(y/4, x/4);  // one flow vector is shared by each 4x4 pixel cell
+
+    // Divide each component by scale and clamp it to [-1, 1].
+    const float nx = max(-1.0f, min(1.0f, fixed2float<5>(vec.x) / scale));
+    const float ny = max(-1.0f, min(1.0f, fixed2float<5>(vec.y) / scale));
+    const float mag = sqrt(nx*nx+ny*ny);
+
+    float4 tinted = colour.tex2D(float(x)+0.5f, float(y)+0.5f);
+    tinted += make_float4(mag*255.0f, 0.0f, mag*255.0f, 0.0f);
+    colour(x,y) = make_uchar4(min(255.0f, tinted.x), min(255.0f, tinted.y), min(255.0f, tinted.z), 0.0f);
+}
+
+void ftl::cuda::show_optical_flow(const cv::cuda::GpuMat &optflow, const TextureObject<uchar4> &colour, float scale, cudaStream_t stream) {
+    // One thread per colour pixel; the grid is rounded up to whole T_PER_BLOCK x T_PER_BLOCK blocks.
+    const dim3 threads(T_PER_BLOCK, T_PER_BLOCK);
+    const dim3 blocks((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    show_optflow_kernel<<<blocks, threads, 0, stream>>>(optflow, colour, scale);
+    cudaSafeCall( cudaGetLastError() );
+}
+
+// ==== Flow difference ========================================================
+
+__global__ void gen_disparity_kernel(
+        cv::cuda::PtrStepSz<short2> optflow1,
+        cv::cuda::PtrStepSz<short2> optflow2,
+        TextureObject<short> disparity) {
+    // Writes the negated horizontal component of optflow1 as the disparity of each pixel.
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+    if (x >= disparity.width() || y >= disparity.height()) return;
+
+    const short2 fwd = optflow1(y/4, x/4);  // one flow vector is shared by each 4x4 pixel cell
+    float d = -fixed2float<5>(fwd.x);
+
+    // Consistency check: look up optflow2 at the column displaced by d and zero
+    // the disparity when its horizontal flow differs from d by more than 1.
+    if (d > 0.0f && x-d-0.5f > 0) {
+        const short2 back = optflow2(y/4, (x-round(d))/4);
+        if (fabsf(d - fixed2float<5>(back.x)) > 1.0f) d = 0.0f;
+    }
+
+    disparity(x,y) = float2fixed<4>(d);
+}
+
+void ftl::cuda::disparity_from_flow(const cv::cuda::GpuMat &optflow1, const cv::cuda::GpuMat &optflow2, const TextureObject<short> &disparity, cudaStream_t stream) {
+    // One thread per disparity pixel; the grid is rounded up to whole T_PER_BLOCK x T_PER_BLOCK blocks.
+    const dim3 threads(T_PER_BLOCK, T_PER_BLOCK);
+    const dim3 blocks((disparity.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (disparity.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    gen_disparity_kernel<<<blocks, threads, 0, stream>>>(optflow1, optflow2, disparity);
+    cudaSafeCall( cudaGetLastError() );
+}
+
+__global__ void show_optflow_diff_kernel(
+        cv::cuda::PtrStepSz<short2> optflow1,
+        cv::cuda::PtrStepSz<short2> optflow2,
+        TextureObject<uchar4> colour, float scale,
+        ftl::rgbd::Camera cam) {
+    // Overlays the sum of the two horizontal flows, relative to optflow1, onto colour. cam is not used.
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+    if (x >= colour.width() || y >= colour.height()) return;
+
+    const short2 fwd = optflow1(y/4, x/4);  // one flow vector is shared by each 4x4 pixel cell
+    const float disp = fixed2float<5>(fwd.x);
+    // Only pixels with positive horizontal flow and a displaced column right of 0.5 are drawn.
+    if (!(disp > 0.0f && x-disp-0.5f > 0)) return;
+
+    const short2 back = optflow2(y/4, (x-round(disp))/4);
+    const float dx = (disp + fixed2float<5>(back.x)) / disp;
+    const float rel = dx / scale;
+    // Positive part goes to z, negative part to x; a saturated value lights y instead.
+    const float pos = max(0.0f, min(1.0f, rel));
+    const float neg = -max(-1.0f, min(0.0f, rel));
+    const float unsat = (pos == 1.0f || neg == 1.0f) ? 0.0f : 1.0f;
+
+    float4 c = colour.tex2D(float(x)+0.5f, float(y)+0.5f);
+    c += make_float4(unsat*neg*255.0f, (1.0f-unsat)*255.0f, unsat*pos*255.0f, 0.0f);
+    colour(x,y) = make_uchar4(min(255.0f, c.x), min(255.0f, c.y), min(255.0f, c.z), 0.0f);
+}
+
+void ftl::cuda::show_optical_flow_diff(const cv::cuda::GpuMat &optflow1, const cv::cuda::GpuMat &optflow2, const TextureObject<uchar4> &colour, float scale, const ftl::rgbd::Camera &cam, cudaStream_t stream) {
+    // One thread per colour pixel; the grid is rounded up to whole T_PER_BLOCK x T_PER_BLOCK blocks.
+    const dim3 threads(T_PER_BLOCK, T_PER_BLOCK);
+    const dim3 blocks((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    show_optflow_diff_kernel<<<blocks, threads, 0, stream>>>(optflow1, optflow2, colour, scale, cam);
+    cudaSafeCall( cudaGetLastError() );
+}
diff --git a/components/operators/src/opticalflow_cuda.hpp b/components/operators/src/opticalflow_cuda.hpp
new file mode 100644
index 000000000..f5b22b75d
--- /dev/null
+++ b/components/operators/src/opticalflow_cuda.hpp
@@ -0,0 +1,19 @@
+#ifndef _FTL_OPTFLOW_CUDA_HPP_
+#define _FTL_OPTFLOW_CUDA_HPP_
+
+#include <ftl/cuda_common.hpp>
+#include <ftl/rgbd/camera.hpp>
+
+namespace ftl {
+namespace cuda {
+// Adds a visualisation of `optflow` to `colour`; flow values are divided by `scale` before drawing.
+void show_optical_flow(const cv::cuda::GpuMat &optflow, const ftl::cuda::TextureObject<uchar4> &colour, float scale, cudaStream_t);
+// Fills `disparity` from the horizontal component of `optflow1`, cross-checked against `optflow2`.
+void disparity_from_flow(const cv::cuda::GpuMat &optflow1, const cv::cuda::GpuMat &optflow2, const ftl::cuda::TextureObject<short> &disparity, cudaStream_t stream);
+// Adds a visualisation comparing the horizontal components of the two flow fields to `colour`.
+void show_optical_flow_diff(const cv::cuda::GpuMat &optflow1, const cv::cuda::GpuMat &optflow2, const ftl::cuda::TextureObject<uchar4> &colour, float scale, const ftl::rgbd::Camera &cam, cudaStream_t stream);
+
+}
+}
+
+#endif
diff --git a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
index 4ddc992b6..8f8297c58 100644
--- a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
+++ b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
@@ -79,7 +79,7 @@ void StereoVideoSource::init(const string &file) {
 
 	pipeline_input_ = ftl::config::create<ftl::operators::Graph>(host_, "input");
 	#ifdef HAVE_OPTFLOW
-	pipeline_input_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow);
+	pipeline_input_->append<ftl::operators::NVOpticalFlow>("optflow");
 	#endif
 	pipeline_input_->append<ftl::operators::ColourChannels>("colour");
 
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index b3772b6f6..860bb3d69 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -21,7 +21,7 @@ limitations under the License.
 namespace sgm {
 namespace path_aggregation {
 
-static constexpr unsigned int DP_BLOCK_SIZE = 8u;
+static constexpr unsigned int DP_BLOCK_SIZE = 16u;
 static constexpr unsigned int DP_BLOCKS_PER_THREAD = 1u;
 
 static constexpr unsigned int WARPS_PER_BLOCK = 4u;
-- 
GitLab