From 113fa4926483d19dbcade34fe7317d162dca9621 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Fri, 17 Apr 2020 09:13:42 +0300
Subject: [PATCH] Libstereo custom SGM algorithms

---
 CMakeLists.txt                                |   8 +-
 components/operators/CMakeLists.txt           |   3 +-
 .../include/ftl/operators/disparity.hpp       |  33 +-
 components/operators/src/depth.cpp            |   2 +
 .../operators/src/disparity/libstereo.cpp     |  68 +++
 lib/libstereo/CMakeLists.txt                  |  72 ++++
 lib/libstereo/include/stereo.hpp              | 164 ++++++++
 lib/libstereo/include/stereo_types.hpp        |  21 +
 lib/libstereo/middlebury/CMakeLists.txt       |   8 +
 lib/libstereo/middlebury/main.cpp             | 159 +++++++
 lib/libstereo/middlebury/middlebury.cpp       | 171 ++++++++
 lib/libstereo/middlebury/middlebury.hpp       |  34 ++
 .../src/aggregations/standard_sgm.hpp         | 120 ++++++
 lib/libstereo/src/array2d.hpp                 | 136 ++++++
 lib/libstereo/src/consistency.hpp             |  36 ++
 lib/libstereo/src/cost_aggregation.hpp        | 120 ++++++
 lib/libstereo/src/costs/absolute_diff.hpp     |  50 +++
 lib/libstereo/src/costs/census.cu             |  48 +++
 lib/libstereo/src/costs/census.hpp            |  70 ++++
 lib/libstereo/src/costs/dual.hpp              |  51 +++
 lib/libstereo/src/costs/gradient.cu           |  28 ++
 lib/libstereo/src/costs/gradient.hpp          |  65 +++
 lib/libstereo/src/costs/mutual_information.cu | 157 +++++++
 .../src/costs/mutual_information.hpp          |  65 +++
 lib/libstereo/src/dsbase.hpp                  |  64 +++
 lib/libstereo/src/dsbase_impl.hpp             |   5 +
 lib/libstereo/src/dsi.hpp                     |  74 ++++
 lib/libstereo/src/dsi_impl.hpp                | 118 ++++++
 lib/libstereo/src/memory.hpp                  |  54 +++
 lib/libstereo/src/stereo_adcensussgm.cu       | 147 +++++++
 lib/libstereo/src/stereo_adsgm.cu             | 138 ++++++
 lib/libstereo/src/stereo_censussgm.cu         | 140 +++++++
 lib/libstereo/src/stereo_gradientstree.cu     | 141 +++++++
 lib/libstereo/src/stereo_misgm.cu             | 152 +++++++
 lib/libstereo/src/stereo_sgmp.cpp             | 394 ++++++++++++++++++
 lib/libstereo/src/util.hpp                    | 135 ++++++
 lib/libstereo/src/util_opencv.hpp             | 153 +++++++
 lib/libstereo/src/wta.hpp                     | 199 +++++++++
 lib/libstereo/test/CMakeLists.txt             |  65 +++
 lib/libstereo/test/aggregation_unit.cpp       | 276 ++++++++++++
 lib/libstereo/test/array2d_unit.cpp           |  11 +
 lib/libstereo/test/dsi_gpu_unit.cu            |  11 +
 lib/libstereo/test/dsi_unit.cpp               |  11 +
 lib/libstereo/test/matching_cost_unit.cpp     |  40 ++
 lib/libstereo/test/tests.cpp                  |   3 +
 lib/libstereo/test/wta_unit.cpp               |  72 ++++
 46 files changed, 4083 insertions(+), 9 deletions(-)
 create mode 100644 components/operators/src/disparity/libstereo.cpp
 create mode 100644 lib/libstereo/CMakeLists.txt
 create mode 100644 lib/libstereo/include/stereo.hpp
 create mode 100644 lib/libstereo/include/stereo_types.hpp
 create mode 100644 lib/libstereo/middlebury/CMakeLists.txt
 create mode 100644 lib/libstereo/middlebury/main.cpp
 create mode 100644 lib/libstereo/middlebury/middlebury.cpp
 create mode 100644 lib/libstereo/middlebury/middlebury.hpp
 create mode 100644 lib/libstereo/src/aggregations/standard_sgm.hpp
 create mode 100644 lib/libstereo/src/array2d.hpp
 create mode 100644 lib/libstereo/src/consistency.hpp
 create mode 100644 lib/libstereo/src/cost_aggregation.hpp
 create mode 100644 lib/libstereo/src/costs/absolute_diff.hpp
 create mode 100644 lib/libstereo/src/costs/census.cu
 create mode 100644 lib/libstereo/src/costs/census.hpp
 create mode 100644 lib/libstereo/src/costs/dual.hpp
 create mode 100644 lib/libstereo/src/costs/gradient.cu
 create mode 100644 lib/libstereo/src/costs/gradient.hpp
 create mode 100644 lib/libstereo/src/costs/mutual_information.cu
 create mode 100644 lib/libstereo/src/costs/mutual_information.hpp
 create mode 100644 lib/libstereo/src/dsbase.hpp
 create mode 100644 lib/libstereo/src/dsbase_impl.hpp
 create mode 100644 lib/libstereo/src/dsi.hpp
 create mode 100644 lib/libstereo/src/dsi_impl.hpp
 create mode 100644 lib/libstereo/src/memory.hpp
 create mode 100644 lib/libstereo/src/stereo_adcensussgm.cu
 create mode 100644 lib/libstereo/src/stereo_adsgm.cu
 create mode 100644 lib/libstereo/src/stereo_censussgm.cu
 create mode 100644 lib/libstereo/src/stereo_gradientstree.cu
 create mode 100644 lib/libstereo/src/stereo_misgm.cu
 create mode 100644 lib/libstereo/src/stereo_sgmp.cpp
 create mode 100644 lib/libstereo/src/util.hpp
 create mode 100644 lib/libstereo/src/util_opencv.hpp
 create mode 100644 lib/libstereo/src/wta.hpp
 create mode 100644 lib/libstereo/test/CMakeLists.txt
 create mode 100644 lib/libstereo/test/aggregation_unit.cpp
 create mode 100644 lib/libstereo/test/array2d_unit.cpp
 create mode 100644 lib/libstereo/test/dsi_gpu_unit.cu
 create mode 100644 lib/libstereo/test/dsi_unit.cpp
 create mode 100644 lib/libstereo/test/matching_cost_unit.cpp
 create mode 100644 lib/libstereo/test/tests.cpp
 create mode 100644 lib/libstereo/test/wta_unit.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1a93b367..95a37c796 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,12 @@ endif()
 
 # ==============================================================================
 
+add_subdirectory(lib/libstereo)
+include_directories(lib/libstereo/include)
+set_property(TARGET libstereo PROPERTY FOLDER "dependencies")
+
+#
+
 if (WITH_FIXSTARS)
 	set(HAVE_LIBSGM true)
 	add_subdirectory(lib/libsgm)
@@ -438,4 +444,4 @@ endif()
 if (WIN32) # TODO(nick) Should do based upon compiler (VS)
 	set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${VS_STARTUP_PROJECT})
 	set_property(TARGET ftl-vision PROPERTY VS_DEBUGGER_WORKING_DIRECTORY ${VS_DEBUG_WORKING_DIRECTORY})
-endif()
\ No newline at end of file
+endif()
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index becb9f60b..dea9b6df6 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -8,6 +8,7 @@ set(OPERSRC
 	src/normals.cpp
 	src/filling.cpp
 	src/filling.cu
+	src/disparity/libstereo.cpp
 	src/disparity/disp2depth.cu
 	src/disparity/disparity_to_depth.cpp
 	src/disparity/bilateral_filter.cpp
@@ -54,6 +55,6 @@ target_include_directories(ftloperators PUBLIC
 	$<INSTALL_INTERFACE:include>
 	PRIVATE src)
 
-target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon sgm Eigen3::Eigen Threads::Threads ${OpenCV_LIBS})
+target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon sgm libstereo Eigen3::Eigen Threads::Threads ${OpenCV_LIBS})
 
 #ADD_SUBDIRECTORY(test)
diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp
index eea830cc4..17cdced1d 100644
--- a/components/operators/include/ftl/operators/disparity.hpp
+++ b/components/operators/include/ftl/operators/disparity.hpp
@@ -17,11 +17,30 @@
 namespace ftl {
 namespace operators {
 
+class StereoDisparity : public ftl::operators::Operator {
+public:
+	explicit StereoDisparity(ftl::Configurable* cfg);
+
+	~StereoDisparity();
+	inline Operator::Type type() const override { return Operator::Type::OneToOne; }
+	bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override;
+
+	bool isMemoryHeavy() const override { return true; }
+
+private:
+	bool init();
+
+	struct Impl;
+	Impl *impl_;
+
+	cv::cuda::GpuMat disp32f_;
+};
+
 #ifdef HAVE_LIBSGM
 /*
  * FixstarsSGM https://github.com/fixstars/libSGM
  *
- * Requires modified version https://gitlab.utu.fi/joseha/libsgm
+ * Requires modified version in lib/libsgm
  */
 class FixstarsSGM : public ftl::operators::Operator {
 	public:
@@ -95,17 +114,17 @@ class DisparityToDepth : public ftl::operators::Operator {
  * disparity to depth steps.
  */
 class DepthChannel : public ftl::operators::Operator {
-    public:
-    explicit DepthChannel(ftl::Configurable *cfg);
-    ~DepthChannel();
+	public:
+	explicit DepthChannel(ftl::Configurable *cfg);
+	~DepthChannel();
 
 	inline Operator::Type type() const override { return Operator::Type::ManyToMany; }
 
-    bool apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) override;
+	bool apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) override;
 	bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override;
 
-    private:
-    ftl::operators::Graph *pipe_;
+	private:
+	ftl::operators::Graph *pipe_;
 	std::vector<cv::cuda::GpuMat> rbuf_;
 	cv::Size depth_size_;
 
diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp
index c3749419b..2c689c2b4 100644
--- a/components/operators/src/depth.cpp
+++ b/components/operators/src/depth.cpp
@@ -146,6 +146,8 @@ void DepthChannel::_createPipeline(size_t size) {
 	#endif
 	#ifdef HAVE_LIBSGM
 	pipe_->append<ftl::operators::FixstarsSGM>("algorithm");
+	#else
+	pipe_->append<ftl::operators::StereoDisparity>("algorithm");
 	#endif
 	pipe_->append<ftl::operators::DisparityBilateralFilter>("bilateral_filter");
 	//pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
diff --git a/components/operators/src/disparity/libstereo.cpp b/components/operators/src/disparity/libstereo.cpp
new file mode 100644
index 000000000..22aaac630
--- /dev/null
+++ b/components/operators/src/disparity/libstereo.cpp
@@ -0,0 +1,68 @@
+#include <loguru.hpp>
+
+#include "ftl/operators/disparity.hpp"
+#include <ftl/operators/cuda/disparity.hpp>
+
+#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/cudaarithm.hpp>
+
+#include <stereo.hpp>
+
+using cv::Size;
+using cv::cuda::GpuMat;
+
+using ftl::rgbd::Format;
+using ftl::codecs::Channel;
+using ftl::rgbd::Frame;
+using ftl::rgbd::Source;
+using ftl::operators::StereoDisparity;
+
+struct StereoDisparity::Impl {
+	StereoCensusSgm sgm;
+};
+
+StereoDisparity::StereoDisparity(ftl::Configurable* cfg) :
+		ftl::operators::Operator(cfg), impl_(nullptr) {
+
+	init();
+}
+
+StereoDisparity::~StereoDisparity() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
+
+bool StereoDisparity::init() {
+	if (impl_) { delete impl_; }
+	impl_ = new Impl();
+	impl_->sgm.params.d_min = 0;
+	impl_->sgm.params.d_max = 256;
+	return true;
+}
+
+bool StereoDisparity::apply(Frame &in, Frame &out, cudaStream_t stream) {
+	if (!in.hasChannel(Channel::Left) || !in.hasChannel(Channel::Right)) {
+		LOG(ERROR) << "Left or Right channel missing for stereo disparity";
+		return false;
+	}
+
+	impl_->sgm.params.P1 = config()->value("P1", 10);
+	impl_->sgm.params.P2 = config()->value("P2", 60);
+
+	const auto &l = in.get<GpuMat>(Channel::Left);
+	const auto &r = in.get<GpuMat>(Channel::Right);
+	disp32f_.create(l.size(), CV_32FC1);
+
+	bool has_estimate = in.hasChannel(Channel::Disparity);
+	auto &disp = (!has_estimate) ? out.create<GpuMat>(Channel::Disparity, Format<short>(l.size())) : in.get<GpuMat>(Channel::Disparity);
+
+	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+
+	cvstream.waitForCompletion();
+	impl_->sgm.compute(l, r, disp32f_);
+
+	disp32f_.convertTo(disp, CV_16SC1, 16.0, 0.0, cvstream);
+	return true;
+}
diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
new file mode 100644
index 000000000..8ebb62f64
--- /dev/null
+++ b/lib/libstereo/CMakeLists.txt
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+
+project(libstereo)
+
+option(BUILD_MIDDLEBURY     "Build Middlebury evaluation" OFF)
+option(BUILD_TESTS          "Build unit tests" ON)
+option(LIBSTEREO_SHARED     "Build a shared library" OFF)
+
+find_package(OpenCV REQUIRED)
+find_package(OpenMP REQUIRED)
+find_package( Threads REQUIRED )
+#find_package(CUDA REQUIRED)
+
+enable_language(CUDA)
+
+set(CMAKE_CXX_CPPCHECK "cppcheck")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_USE_RELATIVE_PATHS ON)
+set(CMAKE_CXX_FLAGS_RELEASE)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+    set(CMAKE_CUDA_FLAGS "--gpu-architecture=compute_61 -std=c++14 -Xcompiler -fPIC -Xcompiler ${OpenMP_CXX_FLAGS}")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+
+if (LIBSTEREO_SHARED)
+    add_library(libstereo SHARED
+                src/stereo_gradientstree.cu
+                src/stereo_misgm.cu
+                src/stereo_censussgm.cu
+                src/stereo_sgmp.cpp
+                src/costs/census.cu
+                src/costs/gradient.cu
+                src/costs/mutual_information.cu
+    )
+set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp)
+
+else()
+    add_library(libstereo
+                src/stereo_gradientstree.cu
+                src/stereo_misgm.cu
+                src/stereo_censussgm.cu
+                src/stereo_adcensussgm.cu
+                src/stereo_adsgm.cu
+                #src/stereo_sgmp.cpp
+                src/costs/census.cu
+                src/costs/gradient.cu
+                src/costs/mutual_information.cu
+    )
+endif()
+
+if (USE_GPU)
+    target_compile_definitions(libstereo PUBLIC USE_GPU)
+endif()
+
+target_include_directories(libstereo PRIVATE src/ include/)
+target_include_directories(libstereo PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(libstereo Threads::Threads ${OpenCV_LIBS} ${CUDA_LIBRARIES})
+
+if (BUILD_MIDDLEBURY)
+    add_subdirectory(middlebury/)
+endif()
+
+if (BUILD_TESTS)
+    add_subdirectory(test)
+endif()
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
new file mode 100644
index 000000000..bd7b38910
--- /dev/null
+++ b/lib/libstereo/include/stereo.hpp
@@ -0,0 +1,164 @@
+#pragma once
+
+#include <opencv2/core/mat.hpp>
+#include <stereo_types.hpp>
+
+class StereoADCensusSgm {
+public:
+	StereoADCensusSgm();
+	~StereoADCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoADSgm {
+public:
+	StereoADSgm();
+	~StereoADSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoCensusSgm {
+public:
+	StereoCensusSgm();
+	~StereoCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoMiSgm {
+public:
+	StereoMiSgm();
+	~StereoMiSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(const cv::Mat &disp);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 2;
+		unsigned short P2 = 8;
+		float uniqueness = std::numeric_limits<short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+
+/**
+ * Census + SGM + prior
+ *
+class StereoCensusSgmP {
+public:
+	StereoCensusSgmP();
+	~StereoCensusSgmP();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(const cv::Mat &prior);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		int range = 10;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};*/
+
+class StereoGradientStree {
+public:
+	StereoGradientStree();
+	~StereoGradientStree();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		float P1 = 5;
+		float P2 = 25;
+		float P3 = 64;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
diff --git a/lib/libstereo/include/stereo_types.hpp b/lib/libstereo/include/stereo_types.hpp
new file mode 100644
index 000000000..c4fa54de4
--- /dev/null
+++ b/lib/libstereo/include/stereo_types.hpp
@@ -0,0 +1,21 @@
+#ifndef _FTL_LIBSTEREO_TYPES_HPP_
+#define _FTL_LIBSTEREO_TYPES_HPP_
+
+enum AggregationDirections {
+	LEFTRIGHT = 1,
+	RIGHTLEFT = 2,
+	HORIZONTAL = 1+2,
+	UPDOWN = 4,
+	DOWNUP = 8,
+	VERTICAL = 4+8,
+	TOPLEFTBOTTOMRIGHT = 16,
+	BOTTOMRIGHTTOPLEFT = 32,
+	DIAGONAL1 = 16+32,
+	BOTTOMLEFTTOPRIGHT = 64,
+	TOPRIGHTBOTTOMLEFT = 128,
+	DIAGONAL2 = 64+128,
+	DIAGONAL = DIAGONAL1+DIAGONAL2,
+	ALL = HORIZONTAL+VERTICAL+DIAGONAL,
+};
+
+#endif
diff --git a/lib/libstereo/middlebury/CMakeLists.txt b/lib/libstereo/middlebury/CMakeLists.txt
new file mode 100644
index 000000000..3f5632c35
--- /dev/null
+++ b/lib/libstereo/middlebury/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable (middlebury
+                main.cpp
+                middlebury.cpp
+)
+
+target_include_directories(middlebury PRIVATE ../include/)
+target_include_directories(middlebury PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(middlebury libstereo pthread dl ${OpenCV_LIBS})
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
new file mode 100644
index 000000000..6821cacef
--- /dev/null
+++ b/lib/libstereo/middlebury/main.cpp
@@ -0,0 +1,159 @@
+#include <opencv2/opencv.hpp>
+
+#include <vector>
+#include <chrono>
+
+#include "middlebury.hpp"
+#include "stereo.hpp"
+
+/**
+ * @param   disp    disparity map
+ * @param   out     output parameter
+ * @param   ndisp   number of disparities
+ *
+ * If ndisp is greater than zero, color mapping is scaled to range [0, ndisp],
+ * otherwise scaling is automatically set to [0, max(disp)].
+ */
+void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
+	double dmin, dmax;
+	cv::minMaxLoc(disp, &dmin, &dmax);
+	cv::Mat dispf, dispc;
+	disp.convertTo(dispf, CV_32FC1);
+	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
+
+	//cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
+	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+using cv::Mat;
+using std::vector;
+
+int main(int argc, char* argv[]) {
+	#if USE_GPU
+	std::cout << "GPU VERSION\n";
+	#else
+	std::cout << "CPU VERSION\n";
+	#endif
+
+	Mat imL;
+	Mat imR;
+	Mat gtL;
+	Mat maskL;
+	MiddEvalCalib calib;
+
+	if (argc < 2) {
+		std::cerr << "usage: middlebury [path]\n";
+		return 1;
+	}
+
+	imL = cv::imread(argv[1] + std::string("im0.png"));
+	imR = cv::imread(argv[1] + std::string("im1.png"));
+	gtL = read_pfm(argv[1] + std::string("disp0.pfm"));
+	if (gtL.empty()) {
+		gtL = read_pfm(argv[1] + std::string("disp0GT.pfm"));
+	}
+	//maskL = cv::imread(argv[1] + std::string("mask0nocc.png"), cv::IMREAD_GRAYSCALE);
+	calib = read_calibration(argv[1] + std::string("calib.txt"));
+
+	maskL.create(imL.size(), CV_8UC1);
+	maskL.setTo(cv::Scalar(255));
+
+	int ndisp = calib.vmax - calib.vmin;
+
+	auto stereo = StereoCensusSgm();
+	stereo.params.P1 = 7;
+	stereo.params.P2 = 60;
+
+	stereo.params.d_min = calib.vmin;
+	stereo.params.d_max = calib.vmax;
+	stereo.params.subpixel = 1;
+	stereo.params.debug = true;
+	//stereo.params.paths = AggregationDirections::ALL;
+	//stereo.params.uniqueness = 200;
+
+	int i_max = 1;
+	float t = 4.0f;
+
+	if (imL.empty() || imR.empty() || gtL.empty()) {
+		std::cerr << "can't load image\n";
+		return 1;
+	}
+
+	if (imL.size() != imR.size()) {
+		std::cerr << "images must be same size\n";
+		return 1;
+	}
+
+	Mat disp(imL.size(), CV_32FC1, cv::Scalar(0.0f));
+	//stereo.setPrior(gtL);
+
+	std::cout << "resolution: " << imL.cols << "x" << imL.rows << ", " << ndisp << " disparities [" << calib.vmin << "," << calib.vmax << "]\n";
+
+	std::vector<cv::Mat> disp_color;
+	std::vector<cv::Mat> err_color;
+
+	for (int i = 0; i < i_max; i++) {
+		auto start = std::chrono::high_resolution_clock::now();
+
+		cv::cuda::GpuMat gpu_imL(imL);
+		cv::cuda::GpuMat gpu_imR(imR);
+		cv::cuda::GpuMat gpu_disp(disp);
+		stereo.compute(gpu_imL, gpu_imR, gpu_disp);
+		gpu_disp.download(disp);
+
+		//stereo.compute(imL, imR, disp);
+		//stereo.setPrior(disp);
+		auto stop = std::chrono::high_resolution_clock::now();
+		std::cout	<< "disparity complete in "
+					<< std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count()
+					<< " ms\n";
+
+		MiddEvalResult res;
+
+		std::cout << "TYPE: " << gtL.type() << std::endl;
+
+		for (const float t : {4.0f, 1.0f, 0.5f, 0.25f}) {
+			res = evaluate(disp, gtL, maskL, t);
+			if (i == 0 || i == i_max-1) {
+				printf("%9.2f %%    correct (err < %.2f)\n", 100.0f * res.err_bad, res.threshold);
+				//printf("%9.2f %%    correct (non-occluded, err < %.2f)\n", 100.0f * res.err_bad_nonoccl, res.threshold);
+				printf("%9.2f px   RMSE (all)\n", res.rms_bad);
+				//printf("%9.2f px   RMSE (non-occluded)\n", res.rms_bad_nonoccl);
+			}
+		}
+
+		if (i == 0 || i == i_max-1) {
+			printf("%9.2f %%    total\n", 100.0f * res.err_total);
+			printf("%9.2f %%    non-occluded\n", 100.0f * res.err_nonoccl);
+		}
+
+		Mat &err_color_ = err_color.emplace_back();
+		Mat &disp_color_ = disp_color.emplace_back();
+
+		// visualize error: set missing values at 0 and scale error to [1.0, t+1.0]
+		// errors > t truncate to t
+		Mat err(gtL.size(), CV_32FC1);
+		cv::absdiff(gtL, disp, err);
+		err.setTo(t, err > t);
+		err += 1.0f;
+		err.setTo(0, disp == 0.0);
+		err.convertTo(err, CV_8UC1, 255.0f/(t+1.0f));
+		cv::applyColorMap(err, err_color_, cv::COLORMAP_PLASMA);
+
+		colorize(disp, disp_color_, calib.ndisp);
+	}
+
+	cv::imshow(std::to_string(0) + ": " + "error (err < " + std::to_string(t) + ")", err_color[0]);
+	cv::imshow(std::to_string(0) + ": " + "disparity", disp_color[0]);
+	cv::imshow(std::to_string(i_max-1) + ": " + "error (err < " + std::to_string(t) + ")", err_color[i_max-1]);
+	cv::imshow(std::to_string(i_max-1) + ": " + "disparity", disp_color[i_max-1]);
+
+	Mat gt_color;
+	colorize(gtL, gt_color, calib.ndisp);
+	cv::imshow("gt", gt_color);
+	cv::waitKey();
+
+	return 0;
+}
diff --git a/lib/libstereo/middlebury/middlebury.cpp b/lib/libstereo/middlebury/middlebury.cpp
new file mode 100644
index 000000000..a54b4f153
--- /dev/null
+++ b/lib/libstereo/middlebury/middlebury.cpp
@@ -0,0 +1,171 @@
+#include "middlebury.hpp"
+
+#include <cmath>
+#include <cstdio>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+cv::Mat read_pfm(const std::string &filename) {
+	cv::Mat im;
+	FILE * fp;
+
+	fp = fopen(filename.c_str(), "rb");
+	char buf[32];
+	int fsize;
+	int width;
+	int height;
+	float scale;
+
+	if (fp == NULL) { return im; }
+
+	if (fscanf(fp, "%31s", buf) == 0 || strcmp(buf, "Pf")) { goto cleanup; }
+
+	if (fscanf(fp, "%31s", buf) == 0) { goto cleanup; }
+	width = atoi(buf);
+
+	if (fscanf(fp, "%31s", buf) == 0) { goto cleanup; }
+	height = atoi(buf);
+
+	if (fscanf(fp, " %31s", buf) == 0) { goto cleanup; }
+	scale = atof(buf);
+
+	im.create(height, width, CV_32FC1);
+
+	fseek(fp, 0, SEEK_END);
+	fseek(fp, ftell(fp)-width*height*sizeof(float), SEEK_SET);
+
+	for (int y = 0; y < height; y++) {
+		float* im_ptr = im.ptr<float>(height-y-1);
+		int nread = 0;
+		do {
+			nread += fread(im_ptr+nread, sizeof(float), width-nread, fp);
+			if (ferror(fp)) { goto cleanup; }
+		}
+		while (nread != width);
+	}
+
+	cleanup:
+	fclose(fp);
+	return im;
+}
+
+static void parse_camera_parameters(const std::string &v, MiddEvalCalib &calib) {
+	std::string mat = std::string(v.substr(1, v.size()-2));
+	std::string row;
+
+	std::vector<float> values;
+
+	for (int _ = 0; _ < 3; _++) {
+		size_t pos = mat.find(";");
+		row = mat.substr(0, pos);
+
+		std::istringstream sstr(row);
+		for(std::string val; sstr >> val;) {
+			values.push_back(atof(val.c_str()));
+		}
+
+		mat.erase(0, pos+1);
+	}
+
+	calib.f = values[0];
+	calib.cx = values[2];
+	calib.cy = values[5];
+}
+
+MiddEvalCalib read_calibration(const std::string &filename) {
+	MiddEvalCalib calib;
+	memset(&calib, 0, sizeof(calib));
+
+	std::ifstream f(filename);
+
+	for(std::string line; std::getline(f, line);) {
+		auto m = line.find("=");
+		if (m == std::string::npos) { continue; }
+		auto k = line.substr(0, m);
+		auto v = line.substr(m+1);
+
+		if (k == "baseline") {
+			calib.baseline = atof(v.c_str());
+		}
+		else if (k == "doffs") {
+			calib.doffs = atof(v.c_str());
+		}
+		else if (k == "ndisp") {
+			calib.ndisp = atoi(v.c_str());
+		}
+		else if (k == "vmin") {
+			calib.vmin = atoi(v.c_str());
+		}
+		else if (k == "vmax") {
+			calib.vmax = atoi(v.c_str());
+		}
+		else if (k == "width") {
+			calib.width = atoi(v.c_str());
+		}
+		else if (k == "height") {
+			calib.height = atoi(v.c_str());
+		}
+		else if (k == "cam0") {
+			parse_camera_parameters(v, calib);
+		}
+	}
+
+	return calib;
+}
+
+MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gt, const cv::Mat &mask, float threshold) {
+	MiddEvalResult result {0.0f, 0.0f, 0.0f};
+	result.threshold = threshold;
+
+	cv::Mat diff1(disp.size(), CV_32FC1);
+	cv::Mat diff2;
+
+	cv::absdiff(disp, gt, diff1);
+	diff1.setTo(0.0f, gt != gt); // NaN
+
+	// ...? where do inf values come from; issue in subpixel interpolation?
+	// doesn't seem to happen when interpolation is enabled
+	diff1.setTo(0.0f, gt == INFINITY);
+
+	float n, n_gt, n_bad, err2, err2_bad;
+
+	// errors incl. occluded areas
+	n = countNonZero(disp);
+	n_gt = countNonZero(gt);
+	cv::pow(diff1, 2, diff2);
+	err2 = cv::sum(diff2)[0];
+
+	result.err_total = n/n_gt;
+	result.rms_total = sqrt(err2/n);
+
+	diff2.setTo(0.0f, diff1 > threshold);
+	n_bad = countNonZero(diff1 <= threshold);
+	err2_bad = cv::sum(diff2)[0];
+	result.err_bad = n_bad/n_gt;
+	result.rms_bad = sqrt(err2_bad/n);
+
+	// errors ignoring occlusions (mask)
+	diff1.setTo(0.0f, mask != 255);
+	cv::pow(diff1, 2, diff2);
+
+	cv::Mat tmp;
+	disp.copyTo(tmp);
+	tmp.setTo(0.0f, mask != 255);
+	n = countNonZero(tmp);
+	n_gt = countNonZero(mask == 255);
+	err2 = cv::sum(diff2)[0];
+
+	result.err_nonoccl = n/n_gt;
+	result.rms_nonoccl = sqrt(err2/n);
+
+	diff2.setTo(0.0f, diff1 > threshold);
+	n_bad = countNonZero(diff1 <= threshold) - countNonZero(mask != 255);
+	err2_bad = cv::sum(diff2)[0];
+	result.err_bad_nonoccl = n_bad/n_gt;
+	result.rms_bad_nonoccl = sqrt(err2_bad/n);
+	return result;
+}
diff --git a/lib/libstereo/middlebury/middlebury.hpp b/lib/libstereo/middlebury/middlebury.hpp
new file mode 100644
index 000000000..e2c18e02b
--- /dev/null
+++ b/lib/libstereo/middlebury/middlebury.hpp
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <opencv2/core/mat.hpp>
+
+struct MiddEvalResult {
+	float err_total;   // all pixels
+	float rms_total;   //
+	float err_nonoccl;   // non-masked pixels
+	float rms_nonoccl;   //
+	float err_bad;     // within threshold disparity from correct value
+	float rms_bad;     // RMS for pixels within threshold disparity from correct value
+	float err_bad_nonoccl;
+	float rms_bad_nonoccl;
+	float threshold;
+};
+
+struct MiddEvalCalib {
+	float f;
+	float cx;
+	float cy;
+	float baseline;
+	float doffs;
+	int width;
+	int height;
+	int ndisp;
+	int vmin;
+	int vmax;
+};
+
+MiddEvalCalib read_calibration(const std::string &filename);
+
+MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gt, const cv::Mat &mask, float threshold=1.0f);
+
+cv::Mat read_pfm(const std::string &filename);
diff --git a/lib/libstereo/src/aggregations/standard_sgm.hpp b/lib/libstereo/src/aggregations/standard_sgm.hpp
new file mode 100644
index 000000000..e05d06f2e
--- /dev/null
+++ b/lib/libstereo/src/aggregations/standard_sgm.hpp
@@ -0,0 +1,120 @@
+#ifndef _FTL_LIBSTEREO_AGGREGATIONS_STANDARD_HPP_
+#define _FTL_LIBSTEREO_AGGREGATIONS_STANDARD_HPP_
+
+#include "../dsi.hpp"
+
+namespace ftl {
+namespace stereo {
+namespace aggregations {
+
+template <typename DSIIN>
+struct StandardSGM {
+	typedef typename DSIIN::Type Type;
+	typedef typename DSIIN::Type costtype_t;
+
+	// Provided externally
+	const DSIIN in;
+	const int P1;
+	const int P2;
+
+	// Provided internally
+	typename DisparitySpaceImage<costtype_t>::DataType out;
+	typename DisparitySpaceImage<costtype_t>::DataType previous;
+	typename DisparitySpaceImage<costtype_t>::DataType updates;
+
+	struct PathData : BasePathData<costtype_t> {
+		// Custom path data goes here...
+		costtype_t previous_cost_min;
+		costtype_t *previous;
+		costtype_t *current;
+	};
+
+	struct DirectionData {
+		DisparitySpaceImage<costtype_t> previous;
+		DisparitySpaceImage<costtype_t> updates;
+	};
+
+	/* Initialise buffers for a new path direction. */
+	void direction(DirectionData &data, DisparitySpaceImage<costtype_t> &buffer) {
+		out = buffer.data();
+		data.previous.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		data.updates.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		previous = data.previous.data();
+		updates = data.updates.data();
+	}
+
+	/* Reset buffers to start a new path */
+	__host__ __device__ inline void startPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		data.previous = &previous(0,data.pathix,previous.disp_min);
+		data.current = &updates(0,data.pathix,updates.disp_min);
+
+		for (int d=thread; d<=previous.disp_min; d+=stride) {
+			data.previous[d] = 0;
+		}
+
+		// To ensure all threads have finished clearing the buffer
+		#ifdef __CUDA_ARCH__
+		__syncwarp();
+		#endif
+	}
+
+	__host__ __device__ inline void endPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {}
+
+	/* Main SGM cost aggregation function */
+	__host__ __device__ inline costtype_t calculateCost(ushort2 pixel, int d, costtype_t *previous, int size, costtype_t previous_cost_min) {
+		const costtype_t L_min =
+			min(previous[d],
+				min(costtype_t(previous_cost_min + P2),
+						min(costtype_t(previous[min(d+1,size)] + P1),
+									costtype_t(previous[max(d-1,0)] + P1))
+			)
+		);
+
+		// Note: This clamping to min 0 does make a tiny difference
+		auto cost = L_min + in(pixel.y,pixel.x,d+in.disp_min);
+		return (cost > previous_cost_min) ? cost - previous_cost_min : 0;
+	}
+
+	/* Stride over each disparity and calculate minimum cost */
+	__host__ __device__ inline void operator()(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		#ifndef __CUDA_ARCH__
+		using std::min;
+		using std::max;
+		#endif
+
+		const int d_stop = int(out.disp_max)-int(out.disp_min);
+
+		costtype_t min_cost = 255;
+
+		// For each disparity (striding the threads)
+		for (int d=thread; d<=d_stop; d+=stride) {
+			auto c = calculateCost(pixel, d, data.previous, d_stop, data.previous_cost_min);
+
+			out(pixel.y,pixel.x,d+in.disp_min) += c;
+			data.current[d] = c;
+
+			// Record the smallest disparity cost for this pixel
+			min_cost = (c < min_cost) ? c : min_cost;
+		}
+
+		// WARP Aggregate on GPU only (assumes stride = 1 on CPU)
+		// Min cost from each thread must be combined for overall minimum
+		// Each thread then obtains thread global minimum
+		#ifdef __CUDA_ARCH__
+		min_cost = warpMin(min_cost);
+		#endif
+
+		data.previous_cost_min = min_cost;
+
+		// Swap current and previous cost buffers
+		costtype_t *tmp_ptr = const_cast<costtype_t *>(data.previous);
+		data.previous = data.current;
+		data.current = tmp_ptr;
+	}
+};
+
+}
+}
+}
+
+#endif
diff --git a/lib/libstereo/src/array2d.hpp b/lib/libstereo/src/array2d.hpp
new file mode 100644
index 000000000..f8385ca2a
--- /dev/null
+++ b/lib/libstereo/src/array2d.hpp
@@ -0,0 +1,136 @@
+#ifndef _FTL_LIBSTEREO_ARRAY2D_HPP_
+#define _FTL_LIBSTEREO_ARRAY2D_HPP_
+
+#include "memory.hpp"
+
+template<typename T>
+class Array2D {
+public:
+	Array2D() : width(0), height(0), needs_free_(false) {
+		data_.data = nullptr;
+	}
+
+	Array2D(int w, int h) : width(w), height(h), needs_free_(true) {
+		data_.data = allocateMemory2D<T>(w, h, data_.pitch);
+	}
+
+	explicit Array2D(cv::Mat &m) : needs_free_(false) {
+		#ifdef USE_GPU
+		create(m.cols, m.rows);
+		cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyHostToDevice));
+		#else
+		needs_free_ = false;
+		data_.data = (T*)m.data;
+		data_.pitch = m.step / sizeof(T);
+		width = m.cols;
+		height = m.rows;
+		#endif
+	}
+
+	explicit Array2D(cv::cuda::GpuMat &m) : needs_free_(false) {
+		#ifdef USE_GPU
+		needs_free_ = false;
+		data_.data = (T*)m.data;
+		data_.pitch = m.step / sizeof(T);
+		width = m.cols;
+		height = m.rows;
+		#else
+		create(m.cols, m.rows);
+		cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyDeviceToHost));
+		#endif
+	}
+
+	~Array2D() {
+		free();
+	}
+
+	void free() {
+		if (needs_free_ && data_.data) freeMemory(data_.data);
+	}
+
+	struct Data {
+		__host__ __device__ inline T& operator() (const int y, const int x) {
+			return data[x + y*pitch];
+		}
+
+		__host__ __device__ inline const T& operator() (const int y, const int x) const {
+			return data[x + y*pitch];
+		}
+
+		__host__ __device__ inline T *ptr(const int y) { return &data[y*pitch]; }
+		__host__ __device__ inline const T *ptr(const int y) const { return &data[y*pitch]; }
+
+		uint pitch;
+		T *data;
+	};
+
+	void create(int w, int h) {
+		if (w == width && h == height) return;
+		width = w;
+		height = h;
+		free();
+		needs_free_ = true;
+		data_.data = allocateMemory2D<T>(w, h, data_.pitch);
+	}
+
+	inline Data &data() { return data_; }
+	inline const Data &data() const { return data_; }
+
+	void toMat(cv::Mat &m) {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		gm.download(m);
+		#else
+		m = cv::Mat(height, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	cv::Mat toMat() {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(height, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	const cv::Mat toMat() const {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm(height, width, cv::traits::Type<T>::value, (void*)data_.data, size_t(size_t(data_.pitch)*sizeof(T)));
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(height, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	void toGpuMat(cv::cuda::GpuMat &m) {
+		#ifdef USE_GPU
+		m = cv::cuda::GpuMat(height, width, cv::traits::Type<T>::value, (void*)data_.data, size_t(size_t(data_.pitch)*sizeof(T)));
+		#else
+		// TODO
+		#endif
+	}
+
+	cv::cuda::GpuMat toGpuMat() {
+		#ifdef USE_GPU
+		return cv::cuda::GpuMat(height, width, cv::traits::Type<T>::value, (void*)data_.data, size_t(size_t(data_.pitch)*sizeof(T)));
+		#else
+		return cv::cuda::GpuMat(height, width, cv::traits::Type<T>::value);
+		#endif
+	}
+
+	int width;
+	int height;
+
+private:
+	Data data_;
+	bool needs_free_;
+};
+
+#endif
diff --git a/lib/libstereo/src/consistency.hpp b/lib/libstereo/src/consistency.hpp
new file mode 100644
index 000000000..aa6208908
--- /dev/null
+++ b/lib/libstereo/src/consistency.hpp
@@ -0,0 +1,36 @@
+#ifndef _FTL_LIBSTEREO_CONSISTENCY_HPP_
+#define _FTL_LIBSTEREO_CONSISTENCY_HPP_
+
+#include "util.hpp"
+#include "array2d.hpp"
+
+namespace algorithms {
+
+	/**
+	 * Left/right disparity consistency check.
+	 */
+	template <typename T>
+	struct ConsistencyCheck {
+		typename Array2D<T>::Data left;
+		const typename Array2D<T>::Data right;
+		const float threshold=1.0f;
+		const float value=0.0f;
+
+		__host__ __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y; y < size.y; y+=stride.y) {
+				T* ptr_l = left.ptr(y);
+				const T* ptr_r = right.ptr(y);
+
+				for (int x = thread.x; x < size.x; x+=stride.x) {
+					const int d = round(ptr_l[x]);
+
+					if ((x-d) < 0 || abs(ptr_l[x] - ptr_r[x-d]) > threshold) {
+						ptr_l[x] = value;
+					}
+				}
+			}
+		}
+	};
+}
+
+#endif
diff --git a/lib/libstereo/src/cost_aggregation.hpp b/lib/libstereo/src/cost_aggregation.hpp
new file mode 100644
index 000000000..d59613a9e
--- /dev/null
+++ b/lib/libstereo/src/cost_aggregation.hpp
@@ -0,0 +1,120 @@
+#pragma once
+
+#include "dsi.hpp"
+#include "util.hpp"
+
+#include <stereo_types.hpp>
+
+template <typename T>
+struct BasePathData {
+	short2 direction;
+	int pathix;
+};
+
+namespace algorithms {
+	template <typename FUNCTOR, int DX, int DY>
+	struct Aggregator {
+		static_assert(DX != 0 || DY != 0, "DX or DY must be non-zero");
+		static_assert(std::is_convertible<typename FUNCTOR::PathData, BasePathData<typename FUNCTOR::Type>>::value, "Path data must derive from BasePathData<T>");
+
+		FUNCTOR f;
+
+		__cuda__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			#ifndef __CUDA_ARCH__
+			using std::min;
+			using std::max;
+			#endif
+
+			// TODO: Use shared memory?
+			typename FUNCTOR::PathData data;
+			data.direction.x = DX;
+			data.direction.y = DY;
+
+			for (int i = thread.y; i < size.y; i+=stride.y) {
+				data.pathix = i;
+				ushort2 pixel;
+
+				if (DY == 0) {
+					pixel.y = i;
+					if (DX > 0) { pixel.x = 0; }
+					else { pixel.x = f.in.width - 1; }
+				}
+				else if (DX == 0) {
+					pixel.x = i;
+					if (DY > 0) { pixel.y = 0; }
+					else { pixel.y = f.in.height - 1; }
+				}
+				else if (DX == DY) {
+					if (DX > 0) {
+						pixel.x = max(0, i - f.in.height);
+						pixel.y = max(0, f.in.height - i);
+					}
+					else {
+						pixel.x = min(f.in.width - 1, f.in.width + f.in.height - i - 2);
+						pixel.y = min(f.in.height - 1, i);
+					}
+				}
+				else {
+					if (DX > 0) {
+						pixel.x = max(0, i - f.in.height + 1);
+						pixel.y = min(f.in.height - 1, i);
+					}
+					else {
+						pixel.x = min(i, f.in.width - 1);
+						pixel.y = max(0, i - f.in.width + 1);
+					}
+				}
+
+				f.startPath(pixel, thread.x, stride.x, data);
+
+				while (pixel.x < f.in.width && pixel.y < f.in.height) {
+					f(pixel, thread.x, stride.x, data);
+					pixel.x += DX;
+					pixel.y += DY;
+				}
+
+				f.endPath(pixel, thread.x, stride.x, data);
+			}
+		}
+	};
+}
+
+template <typename F>
+class PathAggregator {
+	public:
+
+	void queuePath(int id, F &f) {
+		F f1 = f;
+		f1.direction(path_data[id], out);
+
+		switch (id) {
+		case 0	:	parallel1DWarp<algorithms::Aggregator<F, 1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
+		case 1	:	parallel1DWarp<algorithms::Aggregator<F,-1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
+		case 2	:	parallel1DWarp<algorithms::Aggregator<F, 0, 1>>({f1}, f.in.width, f.in.disparityRange()); break;
+		case 3	:	parallel1DWarp<algorithms::Aggregator<F, 0,-1>>({f1}, f.in.width, f.in.disparityRange()); break;
+		case 4	:	parallel1DWarp<algorithms::Aggregator<F, 1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 5	:	parallel1DWarp<algorithms::Aggregator<F,-1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 6	:	parallel1DWarp<algorithms::Aggregator<F, 1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 7	:	parallel1DWarp<algorithms::Aggregator<F,-1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		}
+	}
+
+	DisparitySpaceImage<typename F::Type> &operator()(F &f, int paths) {
+		out.create(f.in.width, f.in.height, f.in.disp_min, f.in.disp_max);
+		out.clear();
+
+		for (int i=0; i<8; ++i) {
+			if (paths & (1 << i)) {
+				queuePath(i, f);
+				//out.add(buffers[i]);
+			}
+		}
+
+		return out;
+	}
+
+	private:
+	DisparitySpaceImage<typename F::Type> out;
+	typename F::DirectionData path_data[8];
+	DisparitySpaceImage<typename F::Type> buffers[8];
+};
diff --git a/lib/libstereo/src/costs/absolute_diff.hpp b/lib/libstereo/src/costs/absolute_diff.hpp
new file mode 100644
index 000000000..460eb42e5
--- /dev/null
+++ b/lib/libstereo/src/costs/absolute_diff.hpp
@@ -0,0 +1,50 @@
+#ifndef _FTL_LIBSTEREO_COSTS_ADBT_HPP_
+#define _FTL_LIBSTEREO_COSTS_ADBT_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	// Basic data class for use on GPU and CPU
+	struct AbsDiffBT : DSImplBase<unsigned short> {
+		typedef unsigned short Type;
+
+		AbsDiffBT(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return 255; }
+			int l_val = l(y,x);
+			int l_val_p = (x > 0) ? l(y,x-1) : l_val;
+			int l_val_n = (x < width-1) ? l(y,x+1) : l_val;
+			int r_val = r(y,x-d);
+
+			// Pseudo BT?
+			return ((r_val > min(l_val_n,l_val) && r_val < max(l_val_n,l_val)) || (r_val > min(l_val_p,l_val) && r_val < max(l_val_p,l_val))) ? 0 : abs(l_val-r_val);
+		}
+
+		Array2D<uchar>::Data l;
+		Array2D<uchar>::Data r;
+	};
+}
+
+class AbsDiffBT : public DSBase<impl::AbsDiffBT> {
+public:
+	typedef impl::AbsDiffBT DataType;
+	typedef unsigned short Type;
+
+	AbsDiffBT() : DSBase<DataType>(0, 0, 0, 0) {};
+	AbsDiffBT(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {};
+
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r) {
+		data().l = l.data();
+		data().r = r.data();
+	}
+
+	static constexpr short COST_MAX = 255;
+};
+
+#endif
diff --git a/lib/libstereo/src/costs/census.cu b/lib/libstereo/src/costs/census.cu
new file mode 100644
index 000000000..05d61c3b7
--- /dev/null
+++ b/lib/libstereo/src/costs/census.cu
@@ -0,0 +1,48 @@
+#include "census.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+namespace algorithms {
+	template<int WINX, int WINY>
+	struct CensusTransform {
+		static_assert(WINX*WINY <= 64, "Census window is too large");
+
+		__host__ __device__ inline uint64_t ct_window(const int y, const int x) {
+			uint64_t ct = 0;
+			uchar center = im(y, x);
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int y_ = y + wy;
+					const int x_ = x + wx;
+					ct = ct << 1;
+					ct |= (center < (im(y_,x_)) ? 1 : 0);
+				}
+			}
+			return ct;
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			//for (int y = WINY/2+1; y < size.y - (WINY/2+1); y++) {
+			//	for (int x = WINX/2+1; x < size.x - (WINX/2+1); x++) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					out(y, x) = ct_window(y, x);
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+	};
+}
+
+void CensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::CensusTransform<9,7>>({l.data(), ct_l_.data()}, l.width, l.height);
+	parallel2D<algorithms::CensusTransform<9,7>>({r.data(), ct_r_.data()}, r.width, r.height);
+}
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
new file mode 100644
index 000000000..bc20efd31
--- /dev/null
+++ b/lib/libstereo/src/costs/census.hpp
@@ -0,0 +1,70 @@
+#ifndef _FTL_LIBSTEREO_COSTS_CENSUS_HPP_
+#define _FTL_LIBSTEREO_COSTS_CENSUS_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	struct CensusMatchingCost : DSImplBase<unsigned short> {
+		typedef unsigned short Type;
+
+		CensusMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return 255; } // TODO: use window size
+			const uint64_t bits =
+				ct_l(y, x) ^ ct_r(y, x-d);
+			#if defined(__CUDA_ARCH__)
+				return __popcll(bits);
+			#elif defined(_MSC_VER)
+				return __popcnt64(bits);
+			#elif defined(__GNUC__)
+				return __builtin_popcountl(bits);
+			#else
+				static_assert(false, "unsupported compiler (popcount intrinsic)");
+			#endif
+		}
+
+		Array2D<uint64_t>::Data ct_l;
+		Array2D<uint64_t>::Data ct_r;
+	};
+}
+
+class CensusMatchingCost : public DSBase<impl::CensusMatchingCost> {
+public:
+	typedef impl::CensusMatchingCost DataType;
+	typedef unsigned short Type;
+
+	CensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	CensusMatchingCost(int width, int height, int disp_min, int disp_max, int wwidth, int wheight)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width, height), ct_r_(width,height),
+			//cost_max(wwidth * wheight),
+			ww_(wwidth), wh_(wheight)
+		{
+			if (wwidth != 9 || wheight != 7) {
+				// TODO: window size paramters (hardcoded) in matching_cost.cpp
+				throw std::exception();
+			}
+			if (wwidth*wheight > 64) {
+				throw std::exception(); // todo: dynamic size
+			}
+
+			data().ct_l = ct_l_.data();
+			data().ct_r = ct_r_.data();
+		}
+
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	static constexpr short COST_MAX = 9*7; // TODO: window size paramters (hardcoded) in matching_cost.cpp
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	int ww_;
+	int wh_;
+};
+
+#endif
diff --git a/lib/libstereo/src/costs/dual.hpp b/lib/libstereo/src/costs/dual.hpp
new file mode 100644
index 000000000..6ee5d15d2
--- /dev/null
+++ b/lib/libstereo/src/costs/dual.hpp
@@ -0,0 +1,51 @@
+#ifndef _FTL_LIBSTEREO_COSTS_DUAL_HPP_
+#define _FTL_LIBSTEREO_COSTS_DUAL_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	template <typename A, typename B>
+	struct DualCosts : DSImplBase<typename A::Type> {
+		static_assert(std::is_same<typename A::Type, typename B::Type>::value, "Cost types must be the same");
+		typedef unsigned short Type;
+
+		DualCosts(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}),
+			cost_a(w,h,dmin,dmax), cost_b(w,h,dmin,dmax) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			return (cost_a(y, x, d) + cost_b(y, x, d));
+		}
+
+		A cost_a;
+		B cost_b;
+	};
+}
+
+template <typename A, typename B>
+class DualCosts : public DSBase<impl::DualCosts<typename A::DataType,typename B::DataType>> {
+public:
+	typedef impl::DualCosts<typename A::DataType,typename B::DataType> DataType;
+	typedef unsigned short Type;
+
+	//DualCosts() : DSBase<DataType>(0, 0, 0, 0) {};
+	DualCosts(int width, int height, int disp_min, int disp_max, A &a, B &b)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost_a(a), cost_b(b) {};
+
+	void set() {
+		this->data().cost_a = cost_a.data();
+		this->data().cost_b = cost_b.data();
+	}
+
+	static constexpr short COST_MAX = A::COST_MAX + B::COST_MAX;
+
+protected:
+	A &cost_a;
+	B &cost_b;
+};
+
+#endif
diff --git a/lib/libstereo/src/costs/gradient.cu b/lib/libstereo/src/costs/gradient.cu
new file mode 100644
index 000000000..05aef3f95
--- /dev/null
+++ b/lib/libstereo/src/costs/gradient.cu
@@ -0,0 +1,28 @@
+#include "gradient.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+void GradientMatchingCostL2::set(const Array2D<uchar>& l, const Array2D<uchar>& r) {
+	// todo: size check
+	#if USE_GPU
+	cv::Mat m;
+	cv::Scharr(l.toMat(), m, CV_16S, 1, 0);
+	l_dx_.toGpuMat().upload(m);
+	cv::Scharr(l.toMat(), m, CV_16S, 0, 1);
+	l_dy_.toGpuMat().upload(m);
+	cv::Scharr(r.toMat(), m, CV_16S, 1, 0);
+	r_dx_.toGpuMat().upload(m);
+	cv::Scharr(r.toMat(), m, CV_16S, 0, 1);
+	l_dy_.toGpuMat().upload(m);
+	#else
+	cv::Scharr(l.toMat(), l_dx_.toMat(), CV_16S, 1, 0);
+	cv::Scharr(l.toMat(), l_dy_.toMat(), CV_16S, 0, 1);
+	cv::Scharr(r.toMat(), r_dx_.toMat(), CV_16S, 1, 0);
+	cv::Scharr(r.toMat(), r_dy_.toMat(), CV_16S, 0, 1);
+	#endif
+}
diff --git a/lib/libstereo/src/costs/gradient.hpp b/lib/libstereo/src/costs/gradient.hpp
new file mode 100644
index 000000000..4e6ea23d8
--- /dev/null
+++ b/lib/libstereo/src/costs/gradient.hpp
@@ -0,0 +1,65 @@
+#ifndef _FTL_LIBSTEREO_COSTS_GRADIENT_HPP_
+#define _FTL_LIBSTEREO_COSTS_GRADIENT_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	// Basic data class for use on GPU and CPU
+	struct GradientMatchingCostL2 : DSImplBase<unsigned short> {
+		typedef unsigned short Type;
+
+		GradientMatchingCostL2(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return 255; }
+			const short a = l_dx(y,x) - r_dx(y,x-d);
+			const short b = l_dy(y,x) - r_dy(y,x-d);
+			#ifdef USE_GPU
+			return sqrtf(a*a + b*b);
+			#else
+			return sqrtf(a*a + b*b);
+			#endif
+		}
+
+		Array2D<short>::Data l_dx;
+		Array2D<short>::Data r_dx;
+		Array2D<short>::Data l_dy;
+		Array2D<short>::Data r_dy;
+	};
+}
+
+class GradientMatchingCostL2 : public DSBase<impl::GradientMatchingCostL2> {
+public:
+	typedef impl::GradientMatchingCostL2 DataType;
+	typedef unsigned short Type;
+
+	GradientMatchingCostL2() : DSBase<DataType>(0, 0, 0, 0) {};
+	GradientMatchingCostL2(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			l_dx_(width, height), l_dy_(width, height), r_dx_(width, height),
+			r_dy_(width, height)
+	{
+		// Copy the pointers etc to the data object
+		data().l_dx = l_dx_.data();
+		data().l_dy = l_dy_.data();
+		data().r_dx = r_dx_.data();
+		data().r_dy = r_dy_.data();
+	};
+
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+
+	// Maximum value calculated from Scharr kernel weights
+	static constexpr short COST_MAX = 255*3 + 255*10 + 255*3;
+
+protected:
+	Array2D<short> l_dx_;
+	Array2D<short> r_dx_;
+	Array2D<short> l_dy_;
+	Array2D<short> r_dy_;
+};
+
+#endif
\ No newline at end of file
diff --git a/lib/libstereo/src/costs/mutual_information.cu b/lib/libstereo/src/costs/mutual_information.cu
new file mode 100644
index 000000000..af32f0c53
--- /dev/null
+++ b/lib/libstereo/src/costs/mutual_information.cu
@@ -0,0 +1,157 @@
+#include "mutual_information.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+/**
+ * Mutual Information matching cost.
+ *
+ * Hirschmüller, H. (2008). Stereo processing by Semiglobal Matching and Mutual
+ * Information. IEEE Transactions on Pattern Analysis and Machine Intelligence,
+ * 30(2), 328–341. https://doi.org/10.1109/TPAMI.2007.1166
+ *
+ * Kim, J., Kolmogorov, V., & Zabih, R. (2003). Visual correspondence using
+ * energy minimization and mutual information. Proceedings of the IEEE
+ * International Conference on Computer Vision.
+ * https://doi.org/10.1109/iccv.2003.1238463
+ */
+
+/** Calculates intensity distribution. Assumes input in 8 bit grayscale. */
+void calculate_intensity_distribution(const cv::Mat &im1, const cv::Mat &im2, cv::Mat &out, const cv::Mat &mask) {
+
+	if (im1.size() != im2.size()) { throw std::exception(); }
+	if (im1.type() != im2.type()) { throw std::exception(); }
+	if (im1.type() != CV_8UC1) { throw std::exception(); }
+
+	if (!mask.empty() && im1.size() != mask.size()) { throw std::exception(); }
+	if (!mask.empty() && mask.type() != CV_32FC1) { throw std::exception(); }
+
+	const int n = std::numeric_limits<uchar>::max() + 1;
+	out.create(cv::Size(n, n), CV_32FC1);
+	out.setTo(0.0f);
+
+	for (int y = 0; y < im1.rows; y++) {
+		for (int x = 0; x < im1.cols; x++) {
+			if (!mask.empty() && mask.at<float>(y,x) == 0.0f) { continue; }
+
+			const uchar i1 = im1.at<uchar>(y,x);
+			const uchar i2 = im2.at<uchar>(y,x);
+			out.at<float>(i1,i2) += 1.0f;
+			out.at<float>(i2,i1) += 1.0f;
+		}
+	}
+
+	out /= float(im1.total());
+}
+
+/** Sum over columns (data term h1) **/
+void calculate_h1(const cv::Mat &P, cv::Mat &out) {
+	out.create(cv::Size(P.cols, 1), CV_32FC1);
+	for (int x = 0; x < P.cols; x++) {
+		float &sum = out.at<float>(x);
+		sum = 0.0;
+		for (int y = 0; y < P.rows; y++) { sum += P.at<float>(y,x); }
+	}
+}
+
+/** Sum over rows (data term h2) **/
+void calculate_h2(const cv::Mat &P, cv::Mat &out) {
+	out.create(cv::Size(P.rows, 1), CV_32FC1);
+	for (int y = 0; y < P.rows; y++) {
+		float &sum = out.at<float>(y);
+		sum = 0.0;
+		for (int x = 0; x < P.cols; x++) { sum += P.at<float>(y,x); }
+	}
+}
+
+/** Data term h12 */
+void calculate_h12(const cv::Mat &P, cv::Mat &out, int ksize=7, double sigma=1.0) {
+	if (P.empty() || P.type() != CV_32FC1) { throw std::exception(); }
+
+	// possible numerical issues? should also be less than 1/(width*height) of
+	// original image.
+	static float e = 0.00000001;
+	cv::Mat tmp;
+
+	P.copyTo(out);
+	out.setTo(e, P == 0.0f);
+
+	cv::GaussianBlur(out, tmp, cv::Size(ksize,ksize), sigma);
+	cv::log(tmp, tmp);
+	tmp *= -1.0f;
+
+	cv::GaussianBlur(tmp, out, cv::Size(ksize,ksize), sigma);
+}
+
+/** Warp (remap) image using disparity map.
+ *
+ * @param	disparity		disparity map (CV_32FC1)
+ * @param	im				input image
+ * @param	out				result, same type as im
+ * @param	mapx			map for x
+ * @param	mapy			map for y, re-calculates if not provided (mapy.empty() == true)
+ * @param	interpolation	interpolation method, see cv::remap (default linear)
+ * @param	sign			disparity sign
+ */
+void remap_disparity(const cv::Mat &disparity, const cv::Mat &im, cv::Mat &out,
+		cv::Mat &mapx, cv::Mat &mapy, int interpolation=1, int sign=-1) {
+
+	if (mapy.size() != im.size() || mapy.type() != CV_32FC1) {
+		mapy.create(disparity.size(), CV_32FC1);
+		for (int y = 0; y < disparity.rows; y++) {
+			float *ptr = mapy.ptr<float>(y);
+			for (int x = 0; x < disparity.cols; x++) { ptr[x] = float(y); }
+		}
+	}
+
+	mapx.create(disparity.size(), CV_32FC1);
+	for (int y = 0; y < disparity.rows; y++) {
+		float *ptr = mapx.ptr<float>(y);
+		const float *d = disparity.ptr<float>(y);
+		for (int x = 0; x < disparity.cols; x++) {
+			if (d[x] == 0.0f) {
+				ptr[x] = NAN;
+			}
+			else {
+				ptr[x] = float(x) + d[x]*float(sign);
+			}
+		}
+	}
+
+	cv::remap(im, out, mapx, mapy, interpolation, cv::BORDER_CONSTANT);
+}
+
+void MutualInformationMatchingCost::set(const cv::Mat &l, const cv::Mat &r, const cv::Mat &disparity) {
+	cv::Mat P;
+
+	if (l.type() != r.type()) { throw std::exception(); }
+	remap_disparity(disparity, r, r_warp_, mapx_, mapy_);
+
+	mat2gray(l, l_);
+	mat2gray(r, r_);
+	cv::cvtColor(r_warp_, r_warp_, cv::COLOR_BGR2GRAY);
+
+	calculate_intensity_distribution(l_.toMat(), r_warp_, P, disparity);
+
+	cv::Mat h12;
+	cv::Mat h1;
+	cv::Mat h2;
+
+	calculate_h12(P, h12);
+	calculate_h1(P, h1);
+	calculate_h2(P, h2);
+
+	#if USE_GPU
+	h12_.toGpuMat().upload(h12);
+	h1_.toGpuMat().upload(h1);
+	h2_.toGpuMat().upload(h2);
+	#else
+	h12.copyTo(h12_.toMat());
+	h1.copyTo(h1_.toMat());
+	h2.copyTo(h2_.toMat());
+	#endif
+}
diff --git a/lib/libstereo/src/costs/mutual_information.hpp b/lib/libstereo/src/costs/mutual_information.hpp
new file mode 100644
index 000000000..722692ce5
--- /dev/null
+++ b/lib/libstereo/src/costs/mutual_information.hpp
@@ -0,0 +1,65 @@
+#ifndef _FTL_LIBSTEREO_COSTS_MI_HPP_
+#define _FTL_LIBSTEREP_COSTS_MI_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	struct MutualInformationMatchingCost : DSImplBase<unsigned short> {
+		typedef unsigned short Type;
+
+		MutualInformationMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return 24; }
+			const int I1 = l(y,x);
+			const int I2 = r(y,x-d);
+			const float H1 = h1(0,I1);
+			const float H2 = h2(0,I2);
+			const float H12 = h12(I1,I2);
+			return -(H1+H2-H12);
+		}
+
+		Array2D<unsigned char>::Data l;
+		Array2D<unsigned char>::Data r;
+		Array2D<float>::Data h1;
+		Array2D<float>::Data h2;
+		Array2D<float>::Data h12;
+	};
+}
+
+class MutualInformationMatchingCost : public DSBase<impl::MutualInformationMatchingCost>{
+public:
+	typedef impl::MutualInformationMatchingCost DataType;
+	typedef unsigned short Type;
+
+	MutualInformationMatchingCost(int width, int height, int disp_min, int disp_max) :
+		DSBase<DataType>(width, height, disp_min, disp_max),
+		l_(width, height), r_(width, height),
+		h1_(256, 1), h2_(256, 1), h12_(256, 256)
+		{
+			data().l = l_.data();
+			data().r = r_.data();
+			data().h1 = h1_.data();
+			data().h2 = h2_.data();
+			data().h12 = h12_.data();
+		}
+
+	void set(const cv::Mat &left, const cv::Mat &right, const cv::Mat &disparity);
+
+protected:
+	Array2D<unsigned char> l_;
+	Array2D<unsigned char> r_;
+	Array2D<float> h1_;
+	Array2D<float> h2_;
+	Array2D<float> h12_;
+
+	cv::Mat mapx_;
+	cv::Mat mapy_;
+	cv::Mat r_warp_;
+};
+
+#endif
\ No newline at end of file
diff --git a/lib/libstereo/src/dsbase.hpp b/lib/libstereo/src/dsbase.hpp
new file mode 100644
index 000000000..d5f99241d
--- /dev/null
+++ b/lib/libstereo/src/dsbase.hpp
@@ -0,0 +1,64 @@
+#ifndef _FTL_LIBSTEREO_DSBASE_HPP_
+#define _FTL_LIBSTEREO_DSBASE_HPP_
+
+#include <cuda_runtime.h>
+#include <type_traits>
+
+namespace impl {
+	template <typename T>
+	struct DSImplBase {
+		typedef T Type;
+
+		static_assert(std::is_arithmetic<T>::value, "Cost type is not numeric");
+
+		//DSImplBase(int w, int h, int dmin, int dmax) : width(w), height(h), disp_min(dmin), disp_max(dmax) {}
+
+		uint16_t width;
+		uint16_t height;
+		uint16_t disp_min;
+		uint16_t disp_max;
+
+		static constexpr T COST_MAX = std::numeric_limits<T>::max();
+
+		__host__ __device__ inline uint16_t disparityRange() const { return disp_max-disp_min+1; }
+		__host__ __device__ inline uint32_t size() const { return width * disparityRange() * height; }
+		__host__ __device__ inline uint32_t bytes() const { return size() * sizeof(Type); }
+	};
+}
+
+/**
+ * Base class representing a 3D disparity space structure. This may
+ * implement the 3D data structure as a stored memory object or through
+ * dynamic calculation using other data.
+ */
+template<typename T>
+class DSBase {
+	public:
+	typedef T DataType;
+	typedef typename T::Type Type;
+
+#if __cplusplus >= 201703L
+	static_assert(std::is_invocable_r<Type,T,int,int,int>::value, "Incorrect disparity space operator");
+#endif
+	static_assert(std::is_convertible<T, impl::DSImplBase<Type>>::value, "Data type is not derived from DSImplBase");
+
+	DSBase(int w, int h, int dmin, int dmax) : data_(w,h,dmin,dmax) {}
+	~DSBase() {};
+
+	DataType &data() { return data_; };
+	const DataType &data() const { return data_; };
+
+	int width() const { return data_.width; }
+	int height() const { return data_.height; }
+	int numDisparities() const { return data_.disparityRange(); }
+	int maxDisparity() const { return data_.disp_max; }
+	int minDisparity() const { return data_.disp_min; }
+	Type maxCost() const { return DataType::COST_MAX; }
+
+	protected:
+	DataType data_;
+};
+
+//#include "dsbase_impl.hpp"
+
+#endif
diff --git a/lib/libstereo/src/dsbase_impl.hpp b/lib/libstereo/src/dsbase_impl.hpp
new file mode 100644
index 000000000..853d4d700
--- /dev/null
+++ b/lib/libstereo/src/dsbase_impl.hpp
@@ -0,0 +1,5 @@
+template <typename T>
+DSBase::DSBase(int w, int h, int dmin, int dmax)
+	: data_{w,h,dmin,dmax}
+{
+}
diff --git a/lib/libstereo/src/dsi.hpp b/lib/libstereo/src/dsi.hpp
new file mode 100644
index 000000000..62b2754c9
--- /dev/null
+++ b/lib/libstereo/src/dsi.hpp
@@ -0,0 +1,74 @@
+#ifndef _FTL_LIBSTEREO_DSI_HPP_
+#define _FTL_LIBSTEREO_DSI_HPP_
+
+#include "dsbase.hpp"
+#include "memory.hpp"
+#include <opencv2/core/mat.hpp>
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+namespace impl {
+	// Basic data class for use on GPU and CPU
+	template <typename T>
+	struct DSI : DSImplBase<T> {
+		typedef T Type;
+
+		DSI() : data(nullptr) {}
+		DSI(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<T>({w,h,dmin,dmax}), data(nullptr) {}
+
+		__host__ __device__ inline T& operator() (const int y, const int x, const int d) {
+			return data[d-this->disp_min + x*this->disparityRange() + y*this->width*this->disparityRange()];
+		}
+
+		__host__ __device__ inline const T& operator() (const int y, const int x, const int d) const {
+			return data[d-this->disp_min + x*this->disparityRange() + y*this->width*this->disparityRange()];
+		}
+
+		uint pitch;
+		T *data;
+	};
+}
+
+template<typename T>
+class DisparitySpaceImage : public DSBase<impl::DSI<T>> {
+public:
+	typedef impl::DSI<T> DataType;
+	typedef T Type;
+
+	DisparitySpaceImage();
+	DisparitySpaceImage(int width, int height, int disp_min, int disp_max);
+	~DisparitySpaceImage();
+
+	DisparitySpaceImage(const DisparitySpaceImage<T> &)=delete;
+
+	T operator()(int y, int x, int d) const;
+
+	void create(int w, int h, int dmin, int dmax);
+
+	void clear();
+
+	void set(const T& v);
+
+	template<typename A>
+	void add(const A &other, double scale=1.0);
+
+	void mul(double a);
+
+	cv::Mat Mat();
+
+private:
+	//DataType data_;
+	bool needs_free_;
+};
+
+#include "dsi_impl.hpp"
+
+// Common types
+typedef DisparitySpaceImage<unsigned char> DSImage8U;
+typedef DisparitySpaceImage<unsigned short> DSImage16U;
+typedef DisparitySpaceImage<unsigned int> DSImage32U;
+typedef DisparitySpaceImage<float> DSImageFloat;
+
+#endif
diff --git a/lib/libstereo/src/dsi_impl.hpp b/lib/libstereo/src/dsi_impl.hpp
new file mode 100644
index 000000000..5fcd38147
--- /dev/null
+++ b/lib/libstereo/src/dsi_impl.hpp
@@ -0,0 +1,118 @@
+#include <thrust/device_ptr.h>
+#include <thrust/fill.h>
+
+template <typename T>
+DisparitySpaceImage<T>::DisparitySpaceImage() : DSBase<DisparitySpaceImage<T>::DataType>(0,0,0,0) {
+	this->data_.data = nullptr;
+	needs_free_ = false;
+}
+
+template <typename T>
+DisparitySpaceImage<T>::DisparitySpaceImage(int width, int height, int disp_min, int disp_max)
+	: DSBase<DisparitySpaceImage<T>::DataType>(width, height, disp_min, disp_max)
+{
+	this->data_.data = allocateMemory2D<T>(width*(disp_max-disp_min+1), height, this->data_.pitch);
+	needs_free_ = true;
+}
+
+template <typename T>
+DisparitySpaceImage<T>::~DisparitySpaceImage()
+{
+	if (needs_free_ && this->data_.data) freeMemory(this->data_.data);
+}
+
+template <typename T>
+void DisparitySpaceImage<T>::create(int w, int h, int dmin, int dmax) {
+	if (this->data_.data && this->data_.width == w && this->data_.height == h) return;
+	if (needs_free_) freeMemory(this->data_.data);
+	needs_free_ = true;
+	this->data_.width = w;
+	this->data_.height = h;
+	this->data_.disp_min = dmin;
+	this->data_.disp_max = dmax;
+	this->data_.data = allocateMemory2D<T>(w*(dmax-dmin+1), h, this->data_.pitch);
+}
+
+template <typename T>
+void DisparitySpaceImage<T>::clear() {
+#ifdef USE_GPU
+	//if (this->width() == 1 || this->height() == 1) {
+		cudaSafeCall(cudaMemset(this->data_.data, 0, this->data_.pitch*sizeof(T)*this->data_.height));
+	//} else {
+	//	cudaSafeCall(cudaMemset2D(this->data_.data, this->data_.pitch*sizeof(T), 0, this->data_.width*this->numDisparities()*sizeof(T), this->data_.height));
+	//}
+#else
+	memset(this->data_.data, 0, this->data_.pitch*sizeof(T)*this->data_.height);
+#endif
+}
+
+template <typename T>
+T DisparitySpaceImage<T>::operator()(int y, int x, int d) const {
+#ifdef USE_GPU
+	T t;
+	cudaMemcpy(&t, &this->data_(y,x,d), sizeof(T), cudaMemcpyDeviceToHost);
+	return t;
+#else
+	return this->data_(y,x,d);
+#endif
+}
+
+template <typename T>
+void DisparitySpaceImage<T>::set(const T& v) {
+#ifdef USE_GPU
+	thrust::device_ptr<T> dev_ptr(this->data_.data);
+	thrust::fill(dev_ptr, dev_ptr + this->data_.size(), v);
+#else
+	std::fill(this->data_.data, this->data_.data + this->data_.size(), v);
+#endif
+}
+
+template <typename T>
+template<typename A>
+void DisparitySpaceImage<T>::add(const A &other, double scale) {
+#ifdef USE_GPU
+
+#else
+	#pragma omp parallel for
+	for (int y = 0; y < this->data_.height; y++)
+	for (int x = 0; x < this->data_.width; x++)
+	for (int d = this->data_.disp_min; d <= this->data_.disp_max; d++) {{{
+		this->data_(y,x,d) += other(y,x,d)*scale;
+	}}}
+#endif
+}
+
+template <typename T>
+void DisparitySpaceImage<T>::mul(double a) {
+	std::transform(this->data_.begin(), this->data_.end(), this->data_.begin(), [&a](T &elem) { return elem *= a; });
+}
+
+template <typename T>
+cv::Mat DisparitySpaceImage<T>::Mat() {
+	int cvtype;
+
+	if (std::is_same<char, T>::value) {
+		cvtype = CV_8SC1;
+	}
+	else if (std::is_same<short, T>::value) {
+		cvtype = CV_16SC1;
+	}
+	else if (std::is_same<int, T>::value) {
+		cvtype = CV_32SC1;
+	}
+	else if (std::is_same<float, T>::value) {
+		cvtype = CV_32FC1;
+	}
+	else if (std::is_same<double, T>::value) {
+		cvtype = CV_64FC1;
+	}
+	else {
+		return cv::Mat();
+	}
+
+#ifdef USE_GPU
+
+#else
+	return cv::Mat(this->data_.width*this->data_.height*this->data_.disparityRange(), 1, cvtype, this->data_.data);
+#endif
+}
diff --git a/lib/libstereo/src/memory.hpp b/lib/libstereo/src/memory.hpp
new file mode 100644
index 000000000..924b4bb83
--- /dev/null
+++ b/lib/libstereo/src/memory.hpp
@@ -0,0 +1,54 @@
+#ifndef _FTL_LIBSTEREO_MEMORY_HPP_
+#define _FTL_LIBSTEREO_MEMORY_HPP_
+
+#include <cuda_runtime.h>
+#include <opencv2/core/mat.hpp>
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#ifdef USE_GPU
+using OpenCVMat = cv::cuda::GpuMat;
+#else
+using OpenCVMat = cv::Mat;
+#endif
+
+template <typename T>
+T *allocateMemory(size_t size) {
+#ifdef USE_GPU
+	T *ptr;
+	cudaSafeCall(cudaMalloc(&ptr, size*sizeof(T)));
+	return ptr;
+#else
+	return new T[size];
+#endif
+}
+
+template <typename T>
+T *allocateMemory2D(size_t width, size_t height, uint &pitch) {
+	if (width == 1 || height == 1) {
+		pitch = width;
+		return allocateMemory<T>((width > height) ? width: height);
+	} else {
+	#ifdef USE_GPU
+		T *ptr;
+		size_t ptmp;
+		cudaSafeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height));
+		pitch = ptmp/sizeof(T);
+		return ptr;
+	#else
+		pitch = width; //*sizeof(T);
+		return new T[width*height];
+	#endif
+	}
+}
+
+template <typename T>
+void freeMemory(T *ptr) {
+#ifdef USE_GPU
+	cudaSafeCall(cudaFree(ptr));
+#else
+	delete [] ptr;
+#endif
+}
+
+#endif
diff --git a/lib/libstereo/src/stereo_adcensussgm.cu b/lib/libstereo/src/stereo_adcensussgm.cu
new file mode 100644
index 000000000..9cd9838cf
--- /dev/null
+++ b/lib/libstereo/src/stereo_adcensussgm.cu
@@ -0,0 +1,147 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/absolute_diff.hpp"
+#include "costs/census.hpp"
+#include "costs/dual.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+static int ct_windows_w = 9;
+static int ct_windows_h = 7;
+
+struct StereoADCensusSgm::Impl {
+	DisparitySpaceImage<unsigned short> dsi;
+	AbsDiffBT ad_cost;
+	CensusMatchingCost census_cost;
+	DualCosts<AbsDiffBT,CensusMatchingCost> cost;
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	PathAggregator<StandardSGM<typename DualCosts<AbsDiffBT,CensusMatchingCost>::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		dsi(width, height, min_disp, max_disp),
+		ad_cost(width, height, min_disp, max_disp),
+		census_cost(width, height, min_disp, max_disp, 9, 7),
+		cost(width, height, min_disp, max_disp, ad_cost, census_cost),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoADCensusSgm::StereoADCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoADCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->dsi.height() || r.cols() != impl_->dsi.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	impl_->dsi.clear();
+	impl_->uncertainty.toMat().setTo(0);
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	timer_set();
+
+	// CT
+	impl_->census_cost.set(impl_->l, impl_->r);
+	impl_->ad_cost.set(impl_->l, impl_->r);
+	impl_->cost.set();
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("census transform"); }
+
+	// cost aggregation
+	StandardSGM<DualCosts<AbsDiffBT,CensusMatchingCost>::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+	if (disparity.isGpuMat()) {
+		impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef());
+	}
+	else {
+		cv::Mat &disparity_ = disparity.getMatRef();
+		impl_->wta.disparity.toMat(disparity_);
+		cv::medianBlur(disparity_, disparity_, 3);
+	}
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	//cv::Mat uncertainty;
+	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
+	// confidence threshold
+	// TODO: estimate confidence from uncertainty and plot ROC curve.
+	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+}
+
+StereoADCensusSgm::~StereoADCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_adsgm.cu b/lib/libstereo/src/stereo_adsgm.cu
new file mode 100644
index 000000000..a79abcf34
--- /dev/null
+++ b/lib/libstereo/src/stereo_adsgm.cu
@@ -0,0 +1,138 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/absolute_diff.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+static int ct_windows_w = 9;
+static int ct_windows_h = 7;
+
+struct StereoADSgm::Impl {
+	DisparitySpaceImage<unsigned short> dsi;
+	AbsDiffBT cost;
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	PathAggregator<StandardSGM<typename AbsDiffBT::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		dsi(width, height, min_disp, max_disp),
+		cost(width, height, min_disp, max_disp),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoADSgm::StereoADSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoADSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->dsi.height() || r.cols() != impl_->dsi.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	impl_->dsi.clear();
+	impl_->uncertainty.toMat().setTo(0);
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	timer_set();
+
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("census transform"); }
+
+	// cost aggregation
+	StandardSGM<AbsDiffBT::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+	if (disparity.isGpuMat()) {
+		impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef());
+	}
+	else {
+		cv::Mat &disparity_ = disparity.getMatRef();
+		impl_->wta.disparity.toMat(disparity_);
+		cv::medianBlur(disparity_, disparity_, 3);
+	}
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	//cv::Mat uncertainty;
+	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
+	// confidence threshold
+	// TODO: estimate confidence from uncertainty and plot ROC curve.
+	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+}
+
+StereoADSgm::~StereoADSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_censussgm.cu b/lib/libstereo/src/stereo_censussgm.cu
new file mode 100644
index 000000000..7cb8f635e
--- /dev/null
+++ b/lib/libstereo/src/stereo_censussgm.cu
@@ -0,0 +1,140 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/census.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+static int ct_windows_w = 9;
+static int ct_windows_h = 7;
+
+struct StereoCensusSgm::Impl {
+	//DisparitySpaceImage<unsigned short> dsi;
+	CensusMatchingCost cost;
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	PathAggregator<StandardSGM<CensusMatchingCost::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		//dsi(width, height, min_disp, max_disp),
+		cost(width, height, min_disp, max_disp, ct_windows_w, ct_windows_h),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	//impl_->dsi.clear();
+	impl_->uncertainty.toMat().setTo(0);
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	timer_set();
+
+	// CT
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("census transform"); }
+
+	// cost aggregation
+	StandardSGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	if (disparity.isGpuMat()) {
+		impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef());
+	}
+	else {
+		cv::Mat &disparity_ = disparity.getMatRef();
+		impl_->wta.disparity.toMat(disparity_);
+		cv::medianBlur(disparity_, disparity_, 3);
+	}
+
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	//cv::Mat uncertainty;
+	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
+	// confidence threshold
+	// TODO: estimate confidence from uncertainty and plot ROC curve.
+	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+}
+
+StereoCensusSgm::~StereoCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_gradientstree.cu b/lib/libstereo/src/stereo_gradientstree.cu
new file mode 100644
index 000000000..74ec61ed8
--- /dev/null
+++ b/lib/libstereo/src/stereo_gradientstree.cu
@@ -0,0 +1,141 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/gradient.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+static int ct_windows_w = 9;
+static int ct_windows_h = 7;
+
+struct StereoGradientStree::Impl {
+	GradientMatchingCostL2 cost;
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	PathAggregator<StandardSGM<GradientMatchingCostL2::DataType>> aggr1;
+	PathAggregator<StandardSGM<DisparitySpaceImage<unsigned short>::DataType>> aggr2;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoGradientStree::StereoGradientStree() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoGradientStree::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	impl_->uncertainty.toMat().setTo(0);
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	timer_set();
+
+	impl_->cost.set(impl_->l, impl_->r);
+	cudaSafeCall(cudaDeviceSynchronize());
+
+	//AggregationParameters aggr_params = {impl_->cost_min_paths.data(), params};
+	StandardSGM<GradientMatchingCostL2::DataType> func1 = {impl_->cost.data(), params.P1, params.P2};
+	auto &out1 = impl_->aggr1(func1, AggregationDirections::HORIZONTAL);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation 1"); }
+
+	StandardSGM<DisparitySpaceImage<unsigned short>::DataType> func2 = { out1.data(), params.P1, params.P2};
+	auto &out2 = impl_->aggr2(func2, AggregationDirections::VERTICAL);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation 2"); }
+
+	impl_->wta(out2, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	if (disparity.isGpuMat()) {
+		impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef());
+	}
+	else {
+		cv::Mat &disparity_ = disparity.getMatRef();
+		impl_->wta.disparity.toMat(disparity_);
+		cv::medianBlur(disparity_, disparity_, 3);
+	}
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	//cv::Mat uncertainty;
+	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
+	// confidence threshold
+	// TODO: estimate confidence from uncertainty and plot ROC curve.
+	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+}
+
+StereoGradientStree::~StereoGradientStree() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_misgm.cu b/lib/libstereo/src/stereo_misgm.cu
new file mode 100644
index 000000000..aebc09a65
--- /dev/null
+++ b/lib/libstereo/src/stereo_misgm.cu
@@ -0,0 +1,152 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/mutual_information.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+struct StereoMiSgm::Impl {
+	MutualInformationMatchingCost cost;
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Mat prior; // used only to calculate MI
+
+	PathAggregator<StandardSGM<MutualInformationMatchingCost::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoMiSgm::StereoMiSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoMiSgm::setPrior(const cv::Mat &prior) {
+	prior.copyTo(impl_->prior);
+}
+
+
+void StereoMiSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	//todo prior
+	if (impl_->prior.empty() || impl_->prior.size() != l.size()) {
+		impl_->prior.create(l.rows(), l.cols(), CV_32FC1);
+		cv::randu(impl_->prior, params.d_min, params.d_max);
+	}
+
+	impl_->uncertainty.toMat().setTo(0);
+
+	timer_set();
+
+	// CT
+	if (l.isMat()) {
+		impl_->cost.set(l.getMat(), r.getMat(), impl_->prior);
+	}
+	else if (l.isGpuMat()) {
+		Mat l_;
+		Mat r_;
+		l.getGpuMat().download(l_);
+		r.getGpuMat().download(r_);
+		impl_->cost.set(l_, r_, impl_->prior);
+	}
+	if (params.debug) { timer_print("mutual information"); }
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	// cost aggregation
+	//AggregationParameters aggr_params = {impl_->cost_min_paths.data(), params};
+	StandardSGM<MutualInformationMatchingCost::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, AggregationDirections::ALL);  // params.paths
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+	if (disparity.isGpuMat()) {
+		impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef());
+	}
+	else {
+		cv::Mat &disparity_ = disparity.getMatRef();
+		impl_->wta.disparity.toMat(disparity_);
+		cv::medianBlur(disparity_, disparity_, 3);
+	}
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	//cv::Mat uncertainty;
+	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
+	// confidence threshold
+	// TODO: estimate confidence from uncertainty and plot ROC curve.
+	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+}
+
+StereoMiSgm::~StereoMiSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_sgmp.cpp b/lib/libstereo/src/stereo_sgmp.cpp
new file mode 100644
index 000000000..22c6082b1
--- /dev/null
+++ b/lib/libstereo/src/stereo_sgmp.cpp
@@ -0,0 +1,394 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "stereo.hpp"
+#include "matching_cost.hpp"
+#include "stereo_common.hpp"
+#include "dsi.hpp"
+
+#include "cost_aggregation.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+/*
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+*/
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+
+static int ct_windows_w = 9;
+static int ct_windows_h = 7;
+
+struct StereoCensusSgmP::Impl {
+	DisparitySpaceImage<unsigned short> dsi;
+	CensusMatchingCost cost;
+	Mat cost_min;
+	Mat cost_min_paths;
+	Mat uncertainty;
+	Mat confidence;
+	Mat disparity_r;
+	Mat prior_disparity;
+	Mat l;
+	Mat r;
+
+	Mat prior;
+	Mat search;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		dsi(width, height, min_disp, max_disp, ct_windows_w*ct_windows_h),
+		cost(width, height, min_disp, max_disp, ct_windows_w, ct_windows_h),
+		cost_min(height, width, CV_16UC1),
+		cost_min_paths(height, width, CV_16UC1),
+		uncertainty(height, width, CV_16UC1),
+		confidence(height, width, CV_32FC1),
+		disparity_r(height, width, CV_32FC1),
+		prior_disparity(height, width, CV_32FC1)
+		{}
+
+	void cvtColor(const cv::Mat &iml, const cv::Mat &imr) {
+		switch (iml.channels()) {
+			case 4:
+				cv::cvtColor(iml, l, cv::COLOR_BGRA2GRAY);
+				break;
+
+			case 3:
+				cv::cvtColor(iml, l, cv::COLOR_BGR2GRAY);
+				break;
+
+			case 1:
+				l = iml;
+				break;
+
+			default:
+				throw std::exception();
+		}
+
+		switch (imr.channels()) {
+			case 4:
+				cv::cvtColor(imr, r, cv::COLOR_BGRA2GRAY);
+				break;
+
+			case 3:
+				cv::cvtColor(imr, r, cv::COLOR_BGR2GRAY);
+				break;
+
+			case 1:
+				r = imr;
+				break;
+
+			default:
+				throw std::exception();
+		}
+	}
+};
+
+static void compute_P2(const cv::Mat &prior, cv::Mat &P2) {
+
+}
+
+// prior	CV32_FC1
+// out		CV8_UC2
+//
+// TODO: range could be in depth units to get more meningful bounds
+static void compute_search_range(const cv::Mat &prior, cv::Mat &out, int dmin, int dmax, int range) {
+	out.create(prior.size(), CV_8UC2);
+	out.setTo(0);
+
+	for (int y = 0; y < out.rows; y++) {
+		for (int x = 0; x < out.cols; x ++) {
+			int d = round(prior.at<float>(y,x));
+			auto &v = out.at<cv::Vec2b>(y,x);
+			if ((d != 0) && (d >= dmin) && (d <= dmax)) {
+				v[0] = std::max(dmin, d-range);
+				v[1] = std::min(dmax, d+range);
+			}
+			else {
+				v[0] = dmin;
+				v[1] = dmax;
+			}
+		}
+	}
+}
+
+
+/**
+ * Compute 2D offset image. SGM sweeping direction set with dx and dy.
+ *
+ * Scharstein, D., Taniai, T., & Sinha, S. N. (2018). Semi-global stereo
+ * matching with surface orientation priors. Proceedings - 2017 International
+ * Conference on 3D Vision, 3DV 2017. https://doi.org/10.1109/3DV.2017.00033
+ */
+static void compute_offset_image(const cv::Mat &prior, cv::Mat &out,
+		const int dx, const int dy) {
+
+	if (prior.empty()) { return; }
+
+	out.create(prior.size(), CV_16SC1);
+	out.setTo(0);
+
+	int y_start;
+	int y_stop;
+	int x_start;
+	int x_stop;
+
+	if (dy < 0) {
+		y_start = -dy;
+		y_stop = prior.rows;
+	}
+	else {
+		y_start = 0;
+		y_stop = prior.rows - dy;
+	}
+
+	if (dx < 0) {
+		x_start = -dx;
+		x_stop = prior.cols;
+	}
+	else {
+		x_start = 0;
+		x_stop = prior.cols - dx;
+	}
+
+	for (int y = y_start; y < y_stop; y++) {
+		const float *ptr_prior = prior.ptr<float>(y);
+		const float *ptr_prior_r = prior.ptr<float>(y+dy);
+		short *ptr_out = out.ptr<short>(y);
+
+		for (int x = x_start; x < x_stop; x++) {
+			// TODO types (assumes signed ptr_out and floating point ptr_prior)
+			if (ptr_prior[x] != 0.0 && ptr_prior_r[x+dx] != 0.0) {
+				ptr_out[x] = round(ptr_prior[x] - ptr_prior_r[x+dx]);
+			}
+		}
+	}
+}
+
+struct AggregationParameters {
+	Mat &prior;		//
+	Mat &search;	//	search range for each pixel CV_8UC2
+	Mat &min_cost;
+	const StereoCensusSgmP::Parameters &params;
+};
+
+inline int get_jump(const int y, const int x, const int d, const int dy, const int dx, const cv::Mat &prior) {
+	if (prior.empty()) {
+		return 0;
+	}
+	else if (dx == 1 && dy == 0) {
+		return prior.at<short>(y, x);
+	}
+	else if (dx == -1 && dy == 0) {
+		if (x == prior.cols - 1) { return 0; }
+		return -prior.at<short>(y, x+1);
+	}
+	else if (dx == 0 && dy == 1) {
+		return prior.at<short>(y, x);
+	}
+	else if (dx == 0 && dy == -1) {
+		if (y == prior.rows - 1) { return 0; }
+		return -prior.at<short>(y+1, x);
+	}
+	else if (dx == 1 && dy == 1) {
+		return prior.at<short>(y, x);
+	}
+	else if (dx == -1 && dy == -1) {
+		if (y == prior.rows - 1 || x == prior.cols - 1) { return 0; }
+		return -prior.at<short>(y+1, x+1);
+	}
+	else if (dx == 1 && dy == -1) {
+		return prior.at<short>(y, x);
+	}
+	else if (dx == -1 && dy == 1) {
+		if (y == prior.rows - 1 || x == 0) { return 0; }
+		return -prior.at<short>(y+1, x-1);
+	}
+}
+
+template<typename T=unsigned short>
+inline void aggregate(
+		AggregationData<CensusMatchingCost, DisparitySpaceImage<T>, T> &data,
+		AggregationParameters &params) {
+
+	auto &previous_cost_min = data.previous_cost_min;
+	auto &previous = data.previous;
+	auto &updates = data.updates;
+	auto &out = data.out;
+	const auto &in = data.in;
+	const auto &x = data.x;
+	const auto &y = data.y;
+	const auto &i = data.i;
+	const T P1 = params.params.P1;
+	const T P2 = params.params.P2;
+
+	T cost_min = in.cost_max;
+
+	//int d_start = params.search.at<cv::Vec2b>(y,x)[0];
+	//int d_stop = params.search.at<cv::Vec2b>(y,x)[1];
+	int d_start = in.disp_min;
+	int d_stop =  in.disp_max;
+
+	for (int d = d_start; d <= d_stop; d++) {
+		const int j = get_jump(y, x, d, data.dy, data.dx, params.prior);
+		const int d_j = std::max(std::min(d+j, previous.disp_max), previous.disp_min);
+
+		const T L_min =
+			std::min<T>(previous(0,i,d_j),
+				std::min<T>(T(previous_cost_min + P2),
+						std::min<T>(T(previous(0,i,std::min(d_j+1, previous.disp_max)) + P1),
+									T(previous(0,i,std::max(d_j-1, previous.disp_min)) + P1))
+			)
+		);
+
+		T C = in(y,x,d);
+		T cost_update = L_min + C - previous_cost_min;
+
+		// stop if close to overflow
+		if (cost_update > (std::numeric_limits<T>::max() - T(in.cost_max))) { throw std::exception(); }
+
+		updates(0,i,d) = cost_update;
+		cost_min = cost_update * (cost_update < cost_min) + cost_min * (cost_update >= cost_min);
+	}
+
+	/*const T update_skipped = params.params.P3;
+	for (int d = out.disp_min; d < d_start; d++) {
+		previous(0,i,d) = update_skipped;
+		out(y,x,d) += update_skipped;
+	}*/
+	for (int d = d_start; d <= d_stop; d++) {
+		previous(0,i,d) = updates(0,i,d);
+		out(y,x,d) += updates(0,i,d);
+	}
+	/*for (int d = d_stop+1; d <= out.disp_max; d++) {
+		previous(0,i,d) = update_skipped;
+		out(y,x,d) += update_skipped;
+	}*/
+
+	params.min_cost.at<T>(y, x) = cost_min;
+	previous_cost_min = cost_min;
+}
+
+StereoCensusSgmP::StereoCensusSgmP() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoCensusSgmP::setPrior(const cv::Mat &prior) {
+	prior.copyTo(impl_->prior_disparity);
+}
+
+void StereoCensusSgmP::compute(const cv::Mat &l, const cv::Mat &r, cv::Mat disparity) {
+	impl_->dsi.clear();
+	impl_->uncertainty.setTo(0);
+
+	if (l.rows != impl_->dsi.height || r.cols != impl_->dsi.width) {
+		Mat prior = impl_->prior_disparity;
+
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols, l.rows, params.d_min, params.d_max);
+
+		if (prior.size() == l.size()) {
+			impl_->prior_disparity = prior;
+		}
+	}
+
+	impl_->cvtColor(l, r);
+
+	timer_set();
+
+	// CT
+	impl_->cost.setLeft(impl_->l);
+	impl_->cost.setRight(impl_->r);
+
+	if (params.debug) { timer_print("census transform"); }
+
+	// cost aggregation
+	AggregationParameters aggr_params = {impl_->prior, impl_->search, impl_->cost_min_paths, params};
+	compute_search_range(impl_->prior_disparity, impl_->search, params.d_min, params.d_max, params.range);
+
+	if (params.paths & AggregationDirections::HORIZONTAL) {
+		compute_offset_image(impl_->prior_disparity, impl_->prior, 1, 0);
+		aggregate_horizontal_all<unsigned short>(
+			aggregate<unsigned short>, impl_->cost, impl_->dsi, aggr_params);
+		if (params.debug) { timer_print("horizontal aggregation"); }
+	}
+
+	if (params.paths & AggregationDirections::VERTICAL) {
+		compute_offset_image(impl_->prior_disparity, impl_->prior, 0, 1);
+		aggregate_vertical_all<unsigned short>(
+			aggregate<unsigned short>, impl_->cost, impl_->dsi, aggr_params);
+		if (params.debug) { timer_print("vertical aggregation"); }
+	}
+
+	if (params.paths & AggregationDirections::DIAGONAL1) {
+		compute_offset_image(impl_->prior_disparity, impl_->prior, 1, 1);
+		aggregate_diagonal_upper_all<unsigned short>(
+			aggregate<unsigned short>, impl_->cost, impl_->dsi, aggr_params);
+		if (params.debug) { timer_print("upper diagonal aggregation"); }
+	}
+
+	if (params.paths & AggregationDirections::DIAGONAL2) {
+		compute_offset_image(impl_->prior_disparity, impl_->prior, 1, -1);
+		aggregate_diagonal_lower_all<unsigned short>(
+			aggregate<unsigned short>, impl_->cost, impl_->dsi, aggr_params);
+		if (params.debug) { timer_print("lower diagonal aggregation"); }
+	}
+
+	if (!(params.paths & AggregationDirections::ALL)) {
+		throw std::exception();
+	}
+
+	// wta + consistency
+	wta(impl_->dsi, disparity, impl_->cost_min, params.subpixel);
+	wta_diagonal(impl_->disparity_r, impl_->dsi);
+	consistency_check(disparity, impl_->disparity_r);
+
+	// confidence estimate
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	//Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+	impl_->uncertainty = impl_->cost_min - impl_->cost_min_paths;
+
+	// instead of difference, use ratio
+	cv::divide(impl_->cost_min, impl_->cost_min_paths, impl_->confidence, CV_32FC1);
+
+	// confidence threshold
+	disparity.setTo(0.0f, impl_->uncertainty > params.uniqueness);
+
+	cv::medianBlur(disparity, disparity, 3);
+}
+
+StereoCensusSgmP::~StereoCensusSgmP() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/util.hpp b/lib/libstereo/src/util.hpp
new file mode 100644
index 000000000..4c50f082b
--- /dev/null
+++ b/lib/libstereo/src/util.hpp
@@ -0,0 +1,135 @@
+#ifndef _FTL_LIBSTEREO_UTIL_HPP_
+#define _FTL_LIBSTEREO_UTIL_HPP_
+
+#include <cuda_runtime.h>
+
+#ifdef USE_GPU
+#define __cuda__ __host__ __device__
+#else
+#define __cuda__
+#endif
+
+static constexpr int WARP_SIZE = 32;
+static constexpr unsigned int FULL_MASK = 0xFFFFFFFF;
+
+template <typename T>
+__device__ inline T warpMin(T e) {
+	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e = min(e, other);
+	}
+	return e;
+}
+
+#ifdef USE_GPU
+
+template <typename FUNCTOR>
+__global__ void kernel2D(FUNCTOR f, ushort2 size) {
+	const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)};
+	const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)};
+	f(thread, stride, size);
+}
+
+template <typename FUNCTOR>
+__global__ void kernel1D(FUNCTOR f, ushort size) {
+	const ushort thread = threadIdx.x+blockIdx.x*blockDim.x;
+	const ushort stride = blockDim.x * gridDim.x;
+	f(thread, stride, size);
+}
+
+#endif
+
+/**
+ * Wrapper to initiate a parallel processing job using a given functor. The
+ * width and height parameters are used to determine the number of threads.
+ */
+template <typename FUNCTOR>
+void parallel1D(const FUNCTOR &f, int size) {
+#if __cplusplus >= 201703L
+	static_assert(std::is_invocable<FUNCTOR,int,int,int>::value, "Parallel1D requires a valid functor: void()(int,int,int)");
+#endif
+
+#ifdef USE_GPU
+	cudaSafeCall(cudaGetLastError());
+	kernel1D<<<64, 256, 0, 0>>>(f,size);
+	cudaSafeCall(cudaGetLastError());
+	//cudaSafeCall(cudaDeviceSynchronize());
+#else
+	static constexpr int MAX_THREADS=32;
+	int stride = MAX_THREADS;
+	FUNCTOR f2 = f;  // Make a copy
+	#pragma omp parallel for
+	for (int i=0; i<MAX_THREADS; ++i) {
+		f2(i, stride, size);
+	}
+#endif
+}
+
+
+/**
+ * Wrapper to initiate a parallel processing job using a given functor. The
+ * width and height parameters are used to determine the number of threads.
+ */
+template <typename FUNCTOR>
+void parallel1DWarp(const FUNCTOR &f, int size, int size2) {
+#if __cplusplus >= 201703L
+	//static_assert(std::is_invocable<FUNCTOR,int,int,int>::value, "Parallel1D requires a valid functor: void()(int,int,int)");
+	// Is this static_assert correct?
+	static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2>::value, "Parallel1D requires a valid functor: void()(ushort2,ushort2,ushort2)");
+#endif
+
+#ifdef USE_GPU
+	cudaSafeCall(cudaGetLastError());
+	const dim3 gridSize(1, (size+8-1) / 8);
+	const dim3 blockSize(32, 8);
+	ushort2 tsize{ushort(size2), ushort(size)};
+	kernel2D<<<gridSize, blockSize, 0, 0>>>(f,tsize);
+	cudaSafeCall(cudaGetLastError());
+	//cudaSafeCall(cudaDeviceSynchronize());
+#else
+	static constexpr int MAX_THREADS=32;
+	FUNCTOR f2 = f;  // Make a copy
+	#pragma omp parallel for
+	for (int i=0; i<MAX_THREADS; ++i) {
+		ushort2 thread{0,ushort(i)};
+		ushort2 stride{1, MAX_THREADS}; //(height+MAX_THREADS-1)/MAX_THREADS};
+		ushort2 tsize{ushort(size2), ushort(size)};
+		f2(thread, stride, tsize);
+	}
+#endif
+}
+
+/**
+ * Wrapper to initiate a parallel processing job using a given functor. The
+ * width and height parameters are used to determine the number of threads.
+ */
+template <typename FUNCTOR>
+void parallel2D(const FUNCTOR &f, int width, int height) {
+	//static_assert(std::is_function<FUNCTOR>::value, "Parallel2D requires a valid functor");
+#if __cplusplus >= 201703L
+	static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2>::value, "Parallel2D requires a valid functor: void()(ushort2,ushort2,ushort2)");
+#endif
+	assert(width > 0 && width <= 4096);
+	assert(height > 0 && height <= 4096);
+
+#ifdef USE_GPU
+	static constexpr uint T_PER_BLOCK=8;
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	ushort2 size{ushort(width), ushort(height)};
+	kernel2D<<<gridSize, blockSize, 0, 0>>>(f,size);
+	cudaSafeCall(cudaGetLastError());
+#else
+	static constexpr int MAX_THREADS=32;
+	FUNCTOR f2 = f;  // Make a copy
+	#pragma omp parallel for
+	for (int i=0; i<MAX_THREADS; ++i) {
+		ushort2 thread{0,ushort(i)};
+		ushort2 stride{1, MAX_THREADS}; //(height+MAX_THREADS-1)/MAX_THREADS};
+		ushort2 size{ushort(width), ushort(height)};
+		f2(thread, stride, size);
+	}
+#endif
+}
+
+#endif
diff --git a/lib/libstereo/src/util_opencv.hpp b/lib/libstereo/src/util_opencv.hpp
new file mode 100644
index 000000000..94e9db6fe
--- /dev/null
+++ b/lib/libstereo/src/util_opencv.hpp
@@ -0,0 +1,153 @@
+#pragma once
+
+#include "array2d.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#if USE_GPU
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/cudaimgproc.hpp>
+
+static void mat2gray(const cv::cuda::GpuMat &in, Array2D<unsigned char> &out) {
+	if (in.depth() != CV_8U) { throw std::exception(); }
+	if (out.width != in.cols || out.height != in.rows) { throw std::exception(); }
+
+	switch (in.channels()) {
+		case 4:
+			cv::cuda::cvtColor(in, out.toGpuMat(), cv::COLOR_BGRA2GRAY);
+			break;
+
+		case 3:
+			cv::cuda::cvtColor(in, out.toGpuMat(), cv::COLOR_BGR2GRAY);
+			break;
+
+		case 1:
+			in.copyTo(out.toGpuMat());
+			break;
+
+		default:
+			throw std::exception();
+	}
+}
+#endif
+
+static void mat2gray(const cv::Mat &in, Array2D<unsigned char> &out) {
+	if (in.depth() != CV_8U) { throw std::exception(); }
+	if (out.width != in.cols || out.height != in.rows) { throw std::exception(); }
+
+#ifndef USE_GPU
+	switch (in.channels()) {
+		case 4:
+			cv::cvtColor(in, out.toMat(), cv::COLOR_BGRA2GRAY);
+			break;
+
+		case 3:
+			cv::cvtColor(in, out.toMat(), cv::COLOR_BGR2GRAY);
+			break;
+
+		case 1:
+			in.copyTo(out.toMat());
+			break;
+
+		default:
+			throw std::exception();
+	}
+#else
+	cv::Mat tmp;
+	switch (in.channels()) {
+		case 4:
+			cv::cvtColor(in, tmp, cv::COLOR_BGRA2GRAY);
+			break;
+
+		case 3:
+			cv::cvtColor(in, tmp, cv::COLOR_BGR2GRAY);
+			break;
+
+		case 1:
+			in.copyTo(tmp);
+			break;
+
+		default:
+			throw std::exception();
+	}
+
+	out.toGpuMat().upload(tmp);
+#endif
+}
+
+static void mat2gray(cv::InputArray in, Array2D<unsigned char> &out) {
+	if (in.isGpuMat()) {
+		#if USE_GPU
+		mat2gray(in.getGpuMat(), out);
+		#endif
+	}
+	else if (in.isMat()) {
+		mat2gray(in.getMat(), out);
+	}
+	else {
+		throw std::exception();
+	}
+}
+
+/*
+template<typename T, int channels>
+constexpr int cpp2cvtype() {
+	static_assert(channels > 0 && channels <= 4, "unsupported number of channels");
+
+	if (std::is_same<T, char>::value) {
+		switch(channels) {
+			case 1: return CV_8SC1;
+			case 2: return CV_8SC2;
+			case 3: return CV_8SC3;
+			case 4: return CV_8SC4;
+		}
+	}
+	else if (std::is_same<T, unsigned char>::value) {
+		switch(channels) {
+			case 1: return CV_8SC1;
+			case 2: return CV_8UC2;
+			case 3: return CV_8UC3;
+			case 4: return CV_8UC4;
+		}
+	}
+	else if (std::is_same<T, short>::value) {
+		switch(channels) {
+			case 1: return CV_16SC1;
+			case 2: return CV_16SC2;
+			case 3: return CV_16SC3;
+			case 4: return CV_16SC4;
+		}
+	}
+	else if (std::is_same<T, unsigned short>::value) {
+		switch(channels) {
+			case 1: return CV_16UC1;
+			case 2: return CV_16UC2;
+			case 3: return CV_16UC3;
+			case 4: return CV_16UC4;
+		}
+	}
+
+	else if (std::is_same<T, float>::value) {
+		switch(channels) {
+			case 1: return CV_32FC1;
+			case 2: return CV_32FC2;
+			case 3: return CV_32FC3;
+			case 4: return CV_32FC4;
+		}
+	}
+	else if (std::is_same<T, double>::value) {
+		switch(channels) {
+			case 1: return CV_64FC1;
+			case 2: return CV_64FC2;
+			case 3: return CV_64FC3;
+			case 4: return CV_64FC4;
+		}
+	}
+	else {
+		// ideally should be compile time error
+		throw std::logic_error("no matching OpenCV type");
+	}
+
+	return -1;
+}*/
diff --git a/lib/libstereo/src/wta.hpp b/lib/libstereo/src/wta.hpp
new file mode 100644
index 000000000..679638c9e
--- /dev/null
+++ b/lib/libstereo/src/wta.hpp
@@ -0,0 +1,199 @@
+#pragma once
+#include "array2d.hpp"
+#include "util.hpp"
+#include "consistency.hpp"
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+
+namespace algorithms {
+	template<typename T>
+	__cuda__ inline float subpixel_estimate_parabola(const T &dsi, const int x, const int y, const int d) {
+		if (d == dsi.disp_max || d == dsi.disp_min) { return d; }
+
+		const float val_l = dsi(y,x,d-1);
+		const float val_r = dsi(y,x,d+1);
+		const float val = dsi(y,x,d);
+		const float a = val_r - val_l;
+		const float b = 2.0f * val - val_l - val_r;
+		if (b != 0.0f) { return d + 0.5f *a/b; }
+		else { return d; }
+	}
+	template<typename T>
+	__cuda__ inline float subpixel_estimate_equiangular(const T &dsi, const int x, const int y, const int d) {
+		#ifndef __CUDA_ARCH__
+		using std::max;
+		#endif
+
+		if (d == dsi.disp_max || d == dsi.disp_min) { return d; }
+
+		const float val_l = dsi(y,x,d-1);
+		const float val_r = dsi(y,x,d+1);
+		const float val = dsi(y,x,d);
+		const float a = val_r - val_l;
+		const float b = 2.0f * (val - max(val_l, val_r));
+		if (b != 0.0f) { return d + a/b; }
+		else { return d; }
+	}
+
+	template<typename T>
+	__cuda__ inline float subpixel_estimate_sortsgm(const T &dsi, const int x, const int y, const int d) {
+		if (d == dsi.disp_max || d == dsi.disp_min) { return d; }
+
+		/**
+		* Eq. 20 in
+		*
+		* Pantilie, C. D., & Nedevschi, S. (2012). SORT-SGM: Subpixel optimized
+		* real-time semiglobal matching for intelligent vehicles. IEEE Transactions
+		* on Vehicular Technology. https://doi.org/10.1109/TVT.2012.2186836
+		*/
+
+		const float C_l = dsi(y,x,d-1);
+		const float C_r = dsi(y,x,d+1);
+		const float C = dsi(y,x,d);
+
+		const float delta_l = C_l - C;
+		const float delta_r = C_r - C;
+
+		const float rho = (delta_l <= delta_r) ? delta_l/delta_r : delta_r/delta_l;
+		const float g = (sin((M_PI/2.0f)*(rho - 1.0f)) + 1.0f)/2.0f;
+
+		if (delta_l <= delta_r)	{ return float(d) - 0.5f + g; }
+		else					{ return float(d) + 0.5f - g; }
+	}
+
+	template <typename DSI, typename TDisp, int SUBPIXEL>
+	struct WTA {
+		const typename DSI::DataType dsi;
+		typename Array2D<TDisp>::Data disparity;
+		typename Array2D<typename DSI::Type>::Data min_cost;
+
+		__cuda__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			using TCost = typename DSI::Type;
+
+			for (int y = thread.y; y < size.y; y+=stride.y) {
+				auto* ptr_disparity = disparity.ptr(y);
+				auto* ptr_min_cost = min_cost.ptr(y);
+
+				for (int x = 0; x < dsi.width; ++x) {
+					TCost min1 = TCost(-1); //std::numeric_limits<TCost>::max();
+					//TCost min2 = TCost(-1); //std::numeric_limits<TCost>::max();
+					TDisp d = TDisp(0);
+
+					for (int d_ = thread.x + dsi.disp_min; d_ <= dsi.disp_max; d_+=stride.x) {
+						const TCost val = dsi(y,x,d_);
+
+						if (val < min1) {
+							//min2 = min1;
+							min1 = val;
+							d = d_;
+						}
+
+						// next best local minima
+						/*if ((d_ != dsi.disp_min && d != dsi.disp_max) &&
+							(dsi(y,x,d_-1) > val && val < dsi(y,x,d_+1)) &&
+							(d != d_) && (val < min2)) {
+
+							min2 = val;
+						}*/
+
+					}
+
+					#ifdef __CUDA_ARCH__
+					TCost mincost = warpMin(min1);
+					if (min1 != mincost) d = -1;
+					// FIXME: Possible non-determinism, need to choose
+					#endif
+
+					/*if (dsi(y, x, dsi.disp_max) < min2) {
+						min2 = dsi(y, x, dsi.disp_max);
+					}*/
+
+					if (d >= 0) {
+						if (SUBPIXEL == 1) {
+							d = subpixel_estimate_parabola(dsi, x, y, d);
+						}
+						else if (SUBPIXEL == 2) {
+							d = subpixel_estimate_equiangular(dsi, x, y, d);
+						}
+						else if (SUBPIXEL == 3) {
+							d = subpixel_estimate_sortsgm(dsi, x, y, d);
+						}
+
+						ptr_disparity[x] = d;
+						ptr_min_cost[x] =  min1;
+					}
+				}
+			}
+		}
+	};
+
+	/**
+	 * Diagonal WTA (right disparity). No subpixel estimation.
+	 */
+	template <typename DSI, typename TDisp>
+	struct WTADiagonal {
+		const typename DSI::DataType dsi;
+		typename Array2D<TDisp>::Data disparity;
+
+		__cuda__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			using TCost = typename DSI::Type;
+			#ifndef __CUDA_ARCH__
+			using std::min;
+			#endif
+
+			for (int y = thread.y; y < size.y; y+=stride.y) {
+				TDisp* ptr_disparity = disparity.ptr(y);
+
+				for (int x = thread.x; x < size.x; x+=stride.x) {
+					TCost min1 = TCost(-1); //std::numeric_limits<TCost>::max();
+					TDisp d = TDisp(0);
+
+					for (int d_ = dsi.disp_min; d_ < min(dsi.disp_max+1, dsi.width-x); d_++) {
+						const TCost val = dsi(y,x+d_,d_);
+
+						if (val < min1) {
+							min1 = val;
+							d = d_;
+						}
+					}
+
+					ptr_disparity[x] = d;
+				}
+			}
+		}
+	};
+}
+
+template<typename DSI, typename TDisp=float>
+struct WinnerTakesAll {
+	Array2D<TDisp> disparity;
+	Array2D<TDisp> disparity_right;
+	Array2D<typename DSI::Type> min_cost;
+
+	Array2D<TDisp> &operator()(const DSI& dsi, const int subpixel=0) {
+		disparity.create(dsi.width(), dsi.height());
+		disparity_right.create(dsi.width(), dsi.height());
+		min_cost.create(dsi.width(), dsi.height());
+
+		if (subpixel == 0) {
+			algorithms::WTA<DSI,float,0> wta = {dsi.data(), disparity.data(), min_cost.data()};
+			parallel1DWarp(wta, dsi.height(), dsi.numDisparities());
+		} else if (subpixel == 1) {
+			algorithms::WTA<DSI,float,1> wta = {dsi.data(), disparity.data(), min_cost.data()};
+			parallel1DWarp(wta, dsi.height(), dsi.numDisparities());
+		} else if (subpixel == 2) {
+			algorithms::WTA<DSI,float,2> wta = {dsi.data(), disparity.data(), min_cost.data()};
+			parallel1DWarp(wta, dsi.height(), dsi.numDisparities());
+		} else if (subpixel == 3) {
+			algorithms::WTA<DSI,float,3> wta = {dsi.data(), disparity.data(), min_cost.data()};
+			parallel1DWarp(wta, dsi.height(), dsi.numDisparities());
+		}
+
+		algorithms::WTADiagonal<DSI,float> wtadiag = {dsi.data(), disparity_right.data()};
+		parallel2D(wtadiag, dsi.width(), dsi.height());
+
+		parallel2D<algorithms::ConsistencyCheck<float>>({disparity.data(), disparity_right.data()}, dsi.width(), dsi.height());
+		return disparity;
+	}
+};
diff --git a/lib/libstereo/test/CMakeLists.txt b/lib/libstereo/test/CMakeLists.txt
new file mode 100644
index 000000000..ca8165583
--- /dev/null
+++ b/lib/libstereo/test/CMakeLists.txt
@@ -0,0 +1,65 @@
+#add_library(CatchTest OBJECT ./tests.cpp)
+
+add_executable(dsi_cpu_unit
+$<TARGET_OBJECTS:CatchTest>
+	./dsi_unit.cpp
+)
+target_include_directories(dsi_cpu_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
+target_include_directories(dsi_cpu_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(dsi_cpu_unit pthread dl ${OpenCV_LIBS})
+
+add_test(DSICPUUnitTest dsi_cpu_unit)
+
+add_executable(dsi_gpu_unit
+$<TARGET_OBJECTS:CatchTest>
+	./dsi_gpu_unit.cu
+)
+target_include_directories(dsi_gpu_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
+target_include_directories(dsi_gpu_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_compile_definitions(dsi_gpu_unit PUBLIC USE_GPU)
+target_link_libraries(dsi_gpu_unit pthread dl ${OpenCV_LIBS})
+
+add_test(DSIGPUUnitTest dsi_gpu_unit)
+
+add_executable(array2d_unit
+$<TARGET_OBJECTS:CatchTest>
+	./array2d_unit.cpp
+)
+target_include_directories(array2d_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
+target_include_directories(array2d_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(array2d_unit pthread dl ${OpenCV_LIBS})
+
+add_test(Array2DUnitTest array2d_unit)
+
+add_executable(matching_cost_unit
+$<TARGET_OBJECTS:CatchTest>
+	../src/costs/census.cu
+	../src/costs/gradient.cu
+	../src/costs/mutual_information.cu
+	./matching_cost_unit.cpp
+)
+target_include_directories(matching_cost_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include" "${CMAKE_CURRENT_SOURCE_DIR}/../src")
+target_include_directories(matching_cost_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(matching_cost_unit pthread dl ${OpenCV_LIBS})
+
+add_test(MatchingCostUnitTest matching_cost_unit)
+
+add_executable(aggregation_unit
+$<TARGET_OBJECTS:CatchTest>
+	./aggregation_unit.cpp
+)
+target_include_directories(aggregation_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
+target_include_directories(aggregation_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(aggregation_unit pthread dl ${OpenCV_LIBS})
+
+add_test(AggregationUnitTest aggregation_unit)
+
+add_executable(wta_unit
+$<TARGET_OBJECTS:CatchTest>
+	./wta_unit.cpp
+)
+target_include_directories(wta_unit PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
+target_include_directories(wta_unit PUBLIC ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(wta_unit pthread dl ${OpenCV_LIBS})
+
+add_test(WTAUnitTest wta_unit)
diff --git a/lib/libstereo/test/aggregation_unit.cpp b/lib/libstereo/test/aggregation_unit.cpp
new file mode 100644
index 000000000..b589a3367
--- /dev/null
+++ b/lib/libstereo/test/aggregation_unit.cpp
@@ -0,0 +1,276 @@
+#include "catch.hpp"
+#include "../src/dsbase.hpp"
+#include "../src/cost_aggregation.hpp"
+
+#include <atomic>
+#include <iostream>
+
+template <typename DSIIN>
+struct TestAggregateIncrement {
+	typedef typename DSIIN::Type Type;
+	typedef typename DSIIN::Type costtype_t;
+
+	const DSIIN in;
+	typename DisparitySpaceImage<costtype_t>::DataType out;
+
+	struct PathData : BasePathData<costtype_t>  {
+		// No extra data...
+	};
+
+	struct DirectionData {
+
+	};
+
+	void direction(DirectionData &data, DisparitySpaceImage<costtype_t> &buffer) {
+		out = buffer.data();
+	}
+
+	__host__ __device__ inline void startPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {}
+	__host__ __device__ inline void endPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {}
+
+	__host__ __device__ inline void operator()(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		int x_prev = pixel.x - data.direction.x;
+		int y_prev = pixel.y - data.direction.y;
+		costtype_t c = 0;
+
+		if (x_prev >= 0 && y_prev >= 0 && x_prev < out.width && y_prev < out.height) {
+			c = out(y_prev,x_prev,in.disp_min) + 1;
+		}
+
+		const int d_stop = int(out.disp_max)-int(out.disp_min);
+
+		for (int d=thread; d<=d_stop; d+=stride) {
+			// for tests, each value updated exactly once (each path separately)
+			// NOT reliable (threading)
+			if (out(pixel.y,pixel.x,d+in.disp_min) != 0) { throw std::exception(); }
+
+			out(pixel.y,pixel.x,d+in.disp_min) += c;
+		}
+	}
+
+	/* __host__ __device__ void operator()(int x, int y, int i, const int dx, const int dy, PathData &data) {
+		int x_prev = x - dx;
+		int y_prev = y - dy;
+		int c = 0;
+
+		if (x_prev >= 0 && y_prev >= 0 && x_prev < out.width && y_prev < out.height) {
+			c = out(y_prev,x_prev,in.disp_min) + 1;
+		}
+		for (int d=in.disp_min; d <= in.disp_max; d++) {
+			if (out(y,x,d) != 0) { throw std::exception(); }
+			out(y,x,d) += c;
+		}
+	}*/
+};
+
+struct DummyCostImpl : impl::DSImplBase<short> {
+	typedef short Type;
+
+	DummyCostImpl(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<short>({w,h,dmin,dmax}) {}
+
+	__host__ __device__ inline short operator()(const int y, const int x, const int d) const {
+		return 0;
+	}
+};
+
+class DummyCost : public DSBase<DummyCostImpl> {
+public:
+	typedef DummyCostImpl DataType;
+	typedef short Type;
+
+	DummyCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	DummyCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {}
+};
+
+static void test_aggregation_increment(const int w, const int h, const int dmin, const int dmax) {
+	const int size = w*h*(dmax-dmin+1);
+
+	DummyCost dsi(w,h,dmin,dmax);
+	TestAggregateIncrement<DummyCost::DataType> func = {dsi.data()};
+	PathAggregator<TestAggregateIncrement<DummyCost::DataType>> aggr;
+
+	SECTION("Left to right") {
+		auto &out = aggr(func, AggregationDirections::LEFTRIGHT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			auto val = out(y,x,d);
+			valid += (out(y,x,d) == x) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Right to left") {
+		auto &out = aggr(func, AggregationDirections::RIGHTLEFT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == w-1-x) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+
+	SECTION("Top to bottom") {
+		auto &out = aggr(func, AggregationDirections::UPDOWN);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == y) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Bottom to top") {
+		auto &out = aggr(func, AggregationDirections::DOWNUP);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == h-y-1) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Top left to bottom right") {
+		auto &out = aggr(func, AggregationDirections::TOPLEFTBOTTOMRIGHT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == std::min(x,y)) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Bottom right to top left") {
+		auto &out = aggr(func, AggregationDirections::BOTTOMRIGHTTOPLEFT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == std::min(w-1-x,h-1-y)) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Bottom left to top right") {
+		auto &out = aggr(func, AggregationDirections::BOTTOMLEFTTOPRIGHT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == std::min(x,h-1-y)) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+
+	SECTION("Top right to bottom left") {
+		auto &out = aggr(func, AggregationDirections::TOPRIGHTBOTTOMLEFT);
+		int valid = 0;
+
+		for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+		for (int d = dmin; d <= dmax; d++) {
+			valid += (out(y,x,d) == std::min(w-1-x,y)) ? 1 : 0;
+		}}}
+
+		REQUIRE(valid == size);
+	}
+}
+
+TEST_CASE("Increment paths [100x100]", "") {
+	test_aggregation_increment(100, 100, 10, 20);
+}
+
+TEST_CASE("Increment paths [100x200]", "") {
+	test_aggregation_increment(100, 200, 10, 20);
+}
+
+TEST_CASE("Increment paths [200x100]", "") {
+	test_aggregation_increment(200, 100, 10, 20);
+}
+
+/*TEST_CASE("Vertical aggregation", "") {
+	SECTION("Perform an aggregation") {
+		CensusMatchingCost dsi(100,100,10,20,9,7);
+
+		cv::Mat left(100,100, CV_8UC1);
+		cv::Mat right(100,100, CV_8UC1);
+		left.setTo(cv::Scalar(0));
+		right.setTo(cv::Scalar(0));
+		dsi.setLeft(left);
+		dsi.setRight(right);
+
+		DisparitySpaceImage<unsigned short> out(100,100,10,20);
+		out.set(1);
+		DisparitySpaceImage<unsigned short> previous(100,1,10,20);
+
+		Aggregate<CensusMatchingCost, unsigned short> aggr = {0,0, out.data(), previous.data(), dsi.data()};
+		aggregate_vertical_all(aggr);
+
+		REQUIRE( out.data()(50,50,12) == 1 );
+	}
+}
+
+TEST_CASE("Diagonal upper aggregation", "") {
+	SECTION("Perform an aggregation") {
+		CensusMatchingCost dsi(100,100,10,20,9,7);
+
+		cv::Mat left(100,100, CV_8UC1);
+		cv::Mat right(100,100, CV_8UC1);
+		left.setTo(cv::Scalar(0));
+		right.setTo(cv::Scalar(0));
+		dsi.setLeft(left);
+		dsi.setRight(right);
+
+		DisparitySpaceImage<unsigned short> out(100,100,10,20);
+		out.set(1);
+		DisparitySpaceImage<unsigned short> previous(100+100,1,10,20);
+
+		Aggregate<CensusMatchingCost, unsigned short> aggr = {0,0, out.data(), previous.data(), dsi.data()};
+		aggregate_diagonal_upper_all(aggr);
+
+		REQUIRE( out.data()(50,50,12) == 1 );
+	}
+}
+
+TEST_CASE("Diagonal lower aggregation", "") {
+	SECTION("Perform an aggregation") {
+		CensusMatchingCost dsi(100,100,10,20,9,7);
+
+		cv::Mat left(100,100, CV_8UC1);
+		cv::Mat right(100,100, CV_8UC1);
+		left.setTo(cv::Scalar(0));
+		right.setTo(cv::Scalar(0));
+		dsi.setLeft(left);
+		dsi.setRight(right);
+
+		DisparitySpaceImage<unsigned short> out(100,100,10,20);
+		out.set(1);
+		DisparitySpaceImage<unsigned short> previous(100+100,1,10,20);
+
+		Aggregate<CensusMatchingCost, unsigned short> aggr = {0,0, out.data(), previous.data(), dsi.data()};
+		aggregate_diagonal_lower_all(aggr);
+
+		REQUIRE( out.data()(50,50,12) == 1 );
+	}
+}*/
diff --git a/lib/libstereo/test/array2d_unit.cpp b/lib/libstereo/test/array2d_unit.cpp
new file mode 100644
index 000000000..8cc2c385c
--- /dev/null
+++ b/lib/libstereo/test/array2d_unit.cpp
@@ -0,0 +1,11 @@
+#include "catch.hpp"
+#include "../src/array2d.hpp"
+
+TEST_CASE("Array2D", "") {
+	SECTION("Construct a float array2d") {
+		Array2D<float> array(100,100);
+
+		array.data()(5,5) = 677;
+		REQUIRE( array.data()(5,5) == 677 );
+	}
+}
diff --git a/lib/libstereo/test/dsi_gpu_unit.cu b/lib/libstereo/test/dsi_gpu_unit.cu
new file mode 100644
index 000000000..96f42cccb
--- /dev/null
+++ b/lib/libstereo/test/dsi_gpu_unit.cu
@@ -0,0 +1,11 @@
+#include "catch.hpp"
+#include "../src/dsi.hpp"
+
+TEST_CASE("DisparitySpaceImage", "") {
+	SECTION("Construct a ushort DSI") {
+		DisparitySpaceImage<unsigned short> dsi(100,100,10,20);
+
+		dsi.set(677);
+		REQUIRE( dsi(5,5,2) == 677 );
+	}
+}
diff --git a/lib/libstereo/test/dsi_unit.cpp b/lib/libstereo/test/dsi_unit.cpp
new file mode 100644
index 000000000..9ea3dac80
--- /dev/null
+++ b/lib/libstereo/test/dsi_unit.cpp
@@ -0,0 +1,11 @@
+#include "catch.hpp"
+#include "../src/dsi.hpp"
+
+TEST_CASE("DisparitySpaceImage", "") {
+	SECTION("Construct a ushort DSI") {
+		DisparitySpaceImage<unsigned short> dsi(100,100,10,20);
+
+		dsi.data()(5,5,2) = 677;
+		REQUIRE( dsi.data()(5,5,2) == 677 );
+	}
+}
diff --git a/lib/libstereo/test/matching_cost_unit.cpp b/lib/libstereo/test/matching_cost_unit.cpp
new file mode 100644
index 000000000..883d584e0
--- /dev/null
+++ b/lib/libstereo/test/matching_cost_unit.cpp
@@ -0,0 +1,40 @@
+#include "catch.hpp"
+#include "costs/census.hpp"
+#include "costs/gradient.hpp"
+#include "util_opencv.hpp"
+
+TEST_CASE("Gradient Matching Cost", "") {
+	SECTION("Construct a Gradient matching cost") {
+		GradientMatchingCostL2 dsi(100,100,10,20);
+
+		cv::Mat left(100,100, CV_8UC1);
+		cv::Mat right(100,100, CV_8UC1);
+
+		left.setTo(cv::Scalar(0));
+		right.setTo(cv::Scalar(0));
+
+		Array2D<uchar> l(left);
+		Array2D<uchar> r(right);
+		dsi.set(l, r);
+
+		REQUIRE( dsi.data()(5,5,2) == 0 );
+	}
+}
+
+TEST_CASE("Census Matching Cost", "") {
+	SECTION("Construct a Census matching cost") {
+		CensusMatchingCost dsi(100,100,10,20,9,7);
+
+		cv::Mat left(100,100, CV_8UC1);
+		cv::Mat right(100,100, CV_8UC1);
+
+		left.setTo(cv::Scalar(0));
+		right.setTo(cv::Scalar(0));
+
+		Array2D<uchar> l(left);
+		Array2D<uchar> r(right);
+		dsi.set(l, r);
+
+		REQUIRE( dsi.data()(5,5,2) == 0 );
+	}
+}
diff --git a/lib/libstereo/test/tests.cpp b/lib/libstereo/test/tests.cpp
new file mode 100644
index 000000000..178916eab
--- /dev/null
+++ b/lib/libstereo/test/tests.cpp
@@ -0,0 +1,3 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+
diff --git a/lib/libstereo/test/wta_unit.cpp b/lib/libstereo/test/wta_unit.cpp
new file mode 100644
index 000000000..ebb845f68
--- /dev/null
+++ b/lib/libstereo/test/wta_unit.cpp
@@ -0,0 +1,72 @@
+#include "catch.hpp"
+#include "../src/dsi.hpp"
+#include "../src/wta.hpp"
+#include "../src/util.hpp"
+#include "../src/consistency.hpp"
+
+using algorithms::WTA;
+using algorithms::WTADiagonal;
+using algorithms::ConsistencyCheck;
+
+TEST_CASE("Winner Takes All", "") {
+	const int w = 100;
+	const int h = 100;
+	const int dmin = 10;
+	const int dmax = 20;
+	const int d_true = 12;
+
+	DSImage16U dsi(w, h, dmin, dmax);
+	Array2D<float> disparity(w, h);
+	Array2D<unsigned short> min_cost(w, h);
+
+	// d(y,x,d) == 0 iff d == d_true, otherwise 100
+	for (int y = 0; y < h; y++) {
+		for (int x = 0; x < w; x++) {
+			for (int d = dmin; d <= dmax; d++) {
+				dsi.data()(y,x,d) = (d == d_true) ? 0 : 100;
+			}
+		}
+	}
+
+	SECTION("WTA from a ushort DSI") {
+		WTA<DSImage16U, float, 0> wta = {dsi.data(), disparity.data(), min_cost.data()};
+		parallel2D(wta, dsi.width(), dsi.height());
+		int valid = 0;
+		for (int y = 0; y < h; y++) {
+			for (int x = 0; x < w; x++) {
+				valid += (float(d_true) == disparity.data()(y,x)) ? 1 : 0;
+			}
+		}
+		REQUIRE(valid == w*h);
+	}
+
+	SECTION("WTA diagonal from a ushort DSI") {
+		WTADiagonal<DSImage16U,float> wta = {dsi.data(), disparity.data()};
+		parallel2D(wta, dsi.width(), dsi.height());
+
+		int valid = 0;
+		for (int y = 0; y < h; y++) {
+			for (int x = 0; x < w; x++) {
+				valid += (float(d_true) == disparity.data()(y,x)) ? 1 : 0;
+			}
+		}
+	}
+
+	SECTION("WTA with consistency check") {
+		WTA<DSImage16U, float, 0> wta = {dsi.data(), disparity.data(), min_cost.data()};
+		parallel2D(wta, dsi.width(), dsi.height());
+
+		Array2D<float> disparityr(100,100);
+
+		WTADiagonal<DSImage16U, float> wtadiag = {dsi.data(), disparityr.data()};
+		parallel2D(wtadiag, dsi.width(), dsi.height());
+
+		parallel2D<ConsistencyCheck<float>>({disparity.data(), disparityr.data()}, dsi.width(), dsi.height());
+	}
+}
+
+TEST_CASE("WTA Wrapper") {
+	WinnerTakesAll<DSImage16U, float> wta;
+	DSImage16U dsi(100,100,10,20);
+	wta(dsi);
+}
-- 
GitLab