diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index 78f367ad5b0e5d7965bf679c704d43ad4b68cccd..64e19cd84f2ec398fad46ab9c1c64e751bbeab17 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -36,34 +36,72 @@ include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 if (LIBSTEREO_SHARED)
     add_library(libstereo SHARED
                 src/stereo_gradientstree.cu
+                src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
-                src/stereo_misgm2.cu
-                src/stereo_censussgm.cu
-                src/stereo_sgmp.cpp
+                src/algorithms/censussgm.cu
+                src/algorithms/excensussgm.cu
+                src/algorithms/stablesgm.cu
+                src/algorithms/tcensussgm.cu
+                src/algorithms/hcensussgm.cu
+                src/algorithms/hwcensussgm.cu
+                src/algorithms/brefcensussgm.cu
+                src/algorithms/meancensussgm.cu
+                src/algorithms/hstablesgm.cu
+                #src/stereo_hier_census.cu
+                src/stereo_wcensussgm.cu
+                src/stereo_census_adaptive.cu
+                src/stereo_cp_censussgm.cu
+                src/stereo_adcensussgm.cu
+                src/stereo_varcensussgm.cu
+                src/stereo_adsgm.cu
+                src/stereo_sad.cu
+                #src/stereo_sgmp.cpp
+                src/stereo_wadcensus.cu
                 src/costs/census.cu
+                src/costs/tcensus.cu
                 src/costs/gradient.cu
+                src/costs/sad.cu
+                src/costs/stable.cu
                 src/costs/mutual_information.cu
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
+                src/dsi_tools.cu
     )
-set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp)
+    set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp)
 
 else()
     add_library(libstereo
                 src/stereo_gradientstree.cu
                 src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
-                src/stereo_misgm2.cu
-                src/stereo_censussgm.cu
+                src/algorithms/censussgm.cu
+                src/algorithms/excensussgm.cu
+                src/algorithms/stablesgm.cu
+                src/algorithms/tcensussgm.cu
+                src/algorithms/hcensussgm.cu
+                src/algorithms/hwcensussgm.cu
+                src/algorithms/brefcensussgm.cu
+                src/algorithms/meancensussgm.cu
+                src/algorithms/hstablesgm.cu
+                #src/stereo_hier_census.cu
+                src/stereo_wcensussgm.cu
                 src/stereo_census_adaptive.cu
+                src/stereo_cp_censussgm.cu
                 src/stereo_adcensussgm.cu
+                src/stereo_varcensussgm.cu
                 src/stereo_adsgm.cu
+                src/stereo_sad.cu
                 #src/stereo_sgmp.cpp
+                src/stereo_wadcensus.cu
                 src/costs/census.cu
+                src/costs/tcensus.cu
                 src/costs/gradient.cu
+                src/costs/sad.cu
+                src/costs/stable.cu
                 src/costs/mutual_information.cu
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
+                src/dsi_tools.cu
     )
 endif()
 
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index 16e0648e93711f6dc4dfdcece8b6d0109d13dd14..c596986b02bf3243f15a4f00db4b919a2fa062db 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -72,6 +72,365 @@ public:
 		unsigned short P2 = 25;
 		float uniqueness = std::numeric_limits<unsigned short>::max();
 		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
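+
+/* Illustrative usage sketch (not part of the API): the SGM variants declared
+ * here all follow the same pattern. Values shown are examples only; P1/P2 are
+ * taken from the Middlebury harness defaults for the census variant.
+ *
+ *   StereoCensusSgm stereo;
+ *   stereo.params.d_min = 0;
+ *   stereo.params.d_max = 256;
+ *   stereo.params.P1 = 30;
+ *   stereo.params.P2 = 132;
+ *   cv::Mat disparity;                       // filled with CV_32FC1 disparities
+ *   stereo.compute(left, right, disparity);
+ */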
+
+class StereoExCensusSgm {
+public:
+	StereoExCensusSgm();
+	~StereoExCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.2;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 5;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Mean reference pixel. Doesn't work, since detail is lost from the reference.
+ *
+ * @see "Froba, B., Ernst, A. "Face detection with the modified census transform," In: Sixth IEEE International Conference on Automatic Face and Gesture Recognition. IEEE Computer Society Press, Los Alamitos (2004)."
+ * @see "A robust local census-based stereo matching insensitive to illumination changes" (2012)
+ * @see "A stereo matching algorithm based on four-moded census and relative confidence plane fitting" (2015)
+ * @see "Stereo matching based on improved census transform" (2018)
+ */
+class StereoMeanCensusSgm {
+public:
+	StereoMeanCensusSgm();
+	~StereoMeanCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Bilateral filtering for the reference pixel. Doesn't make things better, but
+ * is still better than using the mean value as the reference.
+ */
+class StereoBRefCensusSgm {
+public:
+	StereoBRefCensusSgm();
+	~StereoBRefCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * STABLE Binary descriptor. This is a general implementation.
+ * 
+ * @see K. Valentín, R. Huber-Mörk, and S. Štolc, “Binary descriptor-based dense
+ *      line-scan stereo matching,” J. Electron. Imaging, vol. 26, no. 1, 2017.
+ */
+class StereoStableSgm {
+public:
+	StereoStableSgm();
+	~StereoStableSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		short wsize = 17;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Three resolutions (fine, medium and coarse) are combined. The reference
+ * pixel always comes from the fine resolution. Each resolution's cost is
+ * weighted by the local variance at that resolution, increasing or decreasing
+ * its influence so that textured high-resolution regions dominate when possible.
+ */
+class StereoHStableSgm {
+public:
+	StereoHStableSgm();
+	~StereoHStableSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.2;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 9;
+		bool debug = false;
+		short wsize = 17;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Three resolutions (fine, medium and coarse) are combined. The reference
+ * pixel always comes from the fine resolution. Each resolution's cost is
+ * weighted by the local variance at that resolution, increasing or decreasing
+ * its influence so that textured high-resolution regions dominate when possible.
+ */
+class StereoHierCensusSgm {
+public:
+	StereoHierCensusSgm();
+	~StereoHierCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.2;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 9;
+		bool debug = false;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Three resolutions (fine, medium and coarse) are combined. The reference
+ * pixel always comes from the fine resolution. Each resolution's cost is
+ * weighted by the local variance at that resolution, increasing or decreasing
+ * its influence so that textured high-resolution regions dominate when possible.
+ */
+class StereoHierWeightCensusSgm {
+public:
+	StereoHierWeightCensusSgm();
+	~StereoHierWeightCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.2;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 7;
+		bool debug = false;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoCPCensusSgm {
+public:
+	StereoCPCensusSgm();
+	~StereoCPCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+		bool filtercosts = true;
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.5;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 7;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoWCensusSgm {
+public:
+	StereoWCensusSgm();
+	~StereoWCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+/**
+ * Ternary (three-moded) census: a noise threshold allows pixels to be
+ * classified as "no luminance change", in addition to being above or below
+ * the reference.
+ * 
+ * @see "TEXTURE-AWARE DENSE IMAGE MATCHING USING TERNARY CENSUS TRANSFORM" (2016)
+ * @see "Local disparity estimation with three-moded cross census and advanced support weight" (2013)
+ */
+class StereoTCensusSgm {
+public:
+	StereoTCensusSgm();
+	~StereoTCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		uchar t = 8;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
 		int paths = AggregationDirections::HORIZONTAL |
 					AggregationDirections::VERTICAL |
 					AggregationDirections::DIAGONAL;
@@ -165,6 +524,76 @@ public:
 
 		float alpha = 0.2; /** alpha: minimum weight for census cost */
 		float beta = 0.7; /** 1-beta: minimum weight for MI cost */
+		int var_window = 9;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoWADCensus {
+public:
+	StereoWADCensus();
+	~StereoWADCensus();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		float P1 = 1;
+		float P2 = 16;
+		float l1 = 1.0;
+		float l2 = 1.0;
+		int wsize = 31;
+		float uniqueness = std::numeric_limits<short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+
+		float alpha = 0.2; /** alpha: minimum weight for census cost */
+		float beta = 0.7; /** 1-beta: minimum weight for MI cost */
+		int var_window = 9;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+class StereoVarCensus {
+public:
+	StereoVarCensus();
+	~StereoVarCensus();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		float P1 = 1;
+		float P2 = 16;
+		int wsize = 31;
+		float uniqueness = std::numeric_limits<short>::max();
+		int subpixel = 1; /** subpixel interpolation method */
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+
+		/** normalization of variance to range [alpha, beta] */
+		float alpha = 0.2;
+		float beta = 1.0;
+		/** variance window size */
+		int var_window = 9;
 		bool debug = false;
 	};
 	Parameters params;
@@ -174,6 +603,29 @@ private:
 	Impl *impl_;
 };
 
+class StereoSad {
+public:
+	StereoSad();
+	~StereoSad();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp);
+
+	struct Parameters {
+		int wsize = 63;
+		int d_min = 0;
+		int d_max = 0;
+		int subpixel = 1; // subpixel interpolation method
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+
 /**
  * SGM aggregation on ground truth cost
  */
@@ -204,7 +656,6 @@ private:
 	Impl *impl_;
 };
 
-
 /**
  * Census + SGM + prior
  *
diff --git a/lib/libstereo/include/stereo_types.hpp b/lib/libstereo/include/stereo_types.hpp
index fc57ddcf960f647c2bae5d356a29963d37fdde9d..6eb259b9d447432178e7fb731b127ef592053df3 100644
--- a/lib/libstereo/include/stereo_types.hpp
+++ b/lib/libstereo/include/stereo_types.hpp
@@ -18,4 +18,10 @@ enum AggregationDirections : int {
 	ALL = HORIZONTAL+VERTICAL+DIAGONAL,
 };
 
+enum class CensusPattern {
+	STANDARD,
+	GENERALISED,
+	STAR
+};
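+
+// The pattern is selected per algorithm, e.g. the generalised-census variant
+// in the Middlebury harness sets: params.pattern = CensusPattern::GENERALISED;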
+
 #endif
diff --git a/lib/libstereo/middlebury/CMakeLists.txt b/lib/libstereo/middlebury/CMakeLists.txt
index 3f5632c35ffb34bb037bde503328ca5cf7996402..a4e88a69fca1aa10b0fd2ac1a51d32033c01e6bf 100644
--- a/lib/libstereo/middlebury/CMakeLists.txt
+++ b/lib/libstereo/middlebury/CMakeLists.txt
@@ -5,4 +5,4 @@ add_executable (middlebury
 
 target_include_directories(middlebury PRIVATE ../include/)
 target_include_directories(middlebury PUBLIC ${OpenCV_INCLUDE_DIRS})
-target_link_libraries(middlebury libstereo pthread dl ${OpenCV_LIBS})
+target_link_libraries(middlebury libstereo pthread stdc++fs dl ${OpenCV_LIBS})
diff --git a/lib/libstereo/middlebury/algorithms.hpp b/lib/libstereo/middlebury/algorithms.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..604f0f57c73529ffa5ae6b88b9a42465f6e51a3f
--- /dev/null
+++ b/lib/libstereo/middlebury/algorithms.hpp
@@ -0,0 +1,358 @@
+#pragma once
+
+#include <map>
+
+#include "middlebury.hpp"
+#include "stereo.hpp"
+
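+/* Interface used by the evaluation harness. Each concrete wrapper below sets
+ * tuned P1/P2 defaults in its constructor and copies the shared parameters
+ * (P1, P2, subpixel and, where supported, lr_consistency) into the underlying
+ * stereo object's params in run(). */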
+struct Algorithm {
+	virtual void run(const MiddleburyData &data, cv::Mat &disparity)=0;
+	float P1 = 4.0f;
+	float P2 = 80.0f;
+	bool subpixel = true;
+	bool lr_consistency = true;
+};
+
+namespace Impl {
+
+	struct CensusSGM : public Algorithm {
+		CensusSGM() { P1 = 30.0f; P2 = 132.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct ECensusSGM : public Algorithm {
+		ECensusSGM() { P1 = 8.0f; P2 = 40.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoExCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.alpha = 0.5f;
+			stereo.params.beta = 1.0f;
+			stereo.params.var_window = 7;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct GCensusSGM : public Algorithm {
+		GCensusSGM() { P1 = 20.0f; P2 = 110.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.pattern = CensusPattern::GENERALISED;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct MeanCensusSGM : public Algorithm {
+		MeanCensusSGM() { P1 = 12.0f; P2 = 32.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoMeanCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct StableSGM : public Algorithm {
+		StableSGM() { P1 = 1.0f; P2 = 8.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoStableSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+			stereo.params.wsize = 9;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct HStableSGM : public Algorithm {
+		HStableSGM() { P1 = 3.0f; P2 = 24.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoHStableSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+			stereo.params.wsize = 7;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.params.alpha = 0.5f;
+			stereo.params.beta = 1.0f;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct BRefCensusSGM : public Algorithm {
+		BRefCensusSGM() { P1 = 12.0f; P2 = 32.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoBRefCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct HCensusSGM : public Algorithm {
+		HCensusSGM() { P1 = 6.0f; P2 = 26.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoHierCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+			stereo.params.var_window = 5;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.params.alpha = 0.5f;
+			stereo.params.beta = 1.0f;
+
+			stereo.compute(data.imL, data.imR, disparity);
+
+			/*cv::cuda::GpuMat gdisp;
+			cv::cuda::GpuMat gdisp2;
+			cv::cuda::GpuMat gl;
+			gl.upload(data.imL);
+			gdisp.create(gl.size(), CV_32F);
+			stereo.compute(data.imL, data.imR, gdisp);
+
+			gdisp.convertTo(gdisp2, CV_16S);
+			auto blf = cv::cuda::createDisparityBilateralFilter(stereo.params.d_max-stereo.params.d_min+1,3,10);
+			blf->apply(gdisp2, gl, gdisp2);
+			gdisp2.convertTo(gdisp, CV_32F);
+			gdisp.download(disparity);*/
+		}
+	};
+
+	struct HWCensusSGM : public Algorithm {
+		HWCensusSGM() { P1 = 8.0f; P2 = 38.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoHierWeightCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+			stereo.params.var_window = 5;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.params.alpha = 0.5f;
+			stereo.params.beta = 1.0f;
+
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct HGCensusSGM : public Algorithm {
+		HGCensusSGM() { P1 = 36.0f; P2 = 96.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoHierCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.params.alpha = 0.5f;
+			stereo.params.beta = 1.0f;
+			stereo.params.pattern = CensusPattern::GENERALISED;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct CPCensusSGM : public Algorithm {
+		CPCensusSGM() { P1 = 4.0f; P2 = 60.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoCPCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.params.uniqueness = 20;
+			stereo.params.filtercosts = false;
+			stereo.params.alpha = 0.2f;
+			stereo.params.beta = 1.0f;
+			stereo.params.var_window = 7;
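+			// Assumption: compute() retains state between calls (the previous
+			// result acting as the confidence prior, cf. ConfidencePriorSGM),
+			// so repeated passes refine the disparity. The final pass restores
+			// uniqueness to its default (max) before producing the output.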
+			for (int i=0; i<10; ++i) {
+				if (i == 10-1) {
+					stereo.params.uniqueness = std::numeric_limits<unsigned short>::max();
+					stereo.params.filtercosts = false;
+				}
+				stereo.compute(data.imL, data.imR, disparity);
+			}
+		}
+	};
+
+	struct TCensusSGM : public Algorithm {
+		TCensusSGM() { P1 = 24.0f; P2 = 64.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoTCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct WCensusSGM : public Algorithm {
+		WCensusSGM() { P1 = 12.0f; P2 = 32.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoWCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct MiSGM : public Algorithm {
+		MiSGM() { P1 = 4.0f; P2 = 18.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoMiSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			//stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+			stereo.setPrior(disparity);
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct WADCensusSGM : public Algorithm {
+		WADCensusSGM() { P1 = 0.2f; P2 = 0.5f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoWADCensus stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			//stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.l1 = 1.0;
+			stereo.params.l2 = 0.0;
+			stereo.params.alpha = 0.4;
+			stereo.params.beta = 1.0;
+			stereo.params.wsize = 9;
+			stereo.params.var_window = 15;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.params.debug = false;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+	struct VarCensusSGM : public Algorithm {
+		VarCensusSGM() { P1 = 12.0f; P2 = 52.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoVarCensus stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.alpha = 0.33;
+			stereo.params.beta = 1.0;
+			stereo.params.var_window = 27;
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
+}
+
+static const std::map<std::string, Algorithm*> algorithms = {
+	{ "censussgm", new Impl::CensusSGM() },
+	{ "mcensussgm", new Impl::MeanCensusSGM() },
+	{ "gcensussgm", new Impl::GCensusSGM() },
+	{ "ecensussgm", new Impl::ECensusSGM() },
+	{ "stablesgm", new Impl::StableSGM() },
+	{ "hstablesgm", new Impl::HStableSGM() },
+	{ "brefcensus", new Impl::BRefCensusSGM() },
+	{ "hgcensussgm", new Impl::HGCensusSGM() },
+	{ "hcensussgm", new Impl::HCensusSGM() },
+	{ "hwcensussgm", new Impl::HWCensusSGM() },
+	{ "cpcensussgm",  new Impl::CPCensusSGM() },
+	{ "tcensussgm",  new Impl::TCensusSGM() },
+	{ "wcensussgm",  new Impl::WCensusSGM() },
+	{ "misgm",  new Impl::MiSGM() },
+	{ "varcensus",  new Impl::VarCensusSGM() },
+};
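+
+/* Minimal usage sketch (illustrative only; the path and algorithm name are
+ * examples):
+ *
+ *   MiddleburyData data = load_input("path/to/dataset/pair/");
+ *   cv::Mat disparity;
+ *   algorithms.at("censussgm")->run(data, disparity);
+ */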
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
index d1e0529ccbcaa1825b994a7ab06af99795a3278c..75e68a99c0e5c851d5ce1f816c71f7b8994edaaa 100644
--- a/lib/libstereo/middlebury/main.cpp
+++ b/lib/libstereo/middlebury/main.cpp
@@ -1,132 +1,27 @@
+#include <unistd.h>
+
 #include <opencv2/opencv.hpp>
 
 #include <vector>
-#include <map>
+#include <utility>
 #include <chrono>
 
 #include "middlebury.hpp"
-#include "stereo.hpp"
-
-#include "../../components/common/cpp/include/ftl/config.h"
-
-/**
- * @param   disp    disparity map
- * @param   out     output parameter
- * @param   ndisp   number of disparities
- *
- * If ndisp is greater than zero, color mapping is scaled to range [0, ndisp],
- * otherwise scaling is automatically set to [0, max(disp)].
- */
-void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
-	double dmin, dmax;
-	cv::minMaxLoc(disp, &dmin, &dmax);
-	cv::Mat dispf, dispc;
-	disp.convertTo(dispf, CV_32FC1);
-	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
-
-	#if OPENCV_VERSION >= 40102
-	cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
-	#else
-	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
-	#endif
-}
+#include "algorithms.hpp"
 
-////////////////////////////////////////////////////////////////////////////////
+#include <cuda_runtime.h>
 
-struct MiddleburyData {
-	const std::string name;
-	const cv::Mat imL;
-	const cv::Mat imR;
-	const cv::Mat gtL;
-	const cv::Mat maskL;
-	const MiddEvalCalib calib;
-};
+#include "../../components/common/cpp/include/ftl/config.h"
 
 struct Result {
 	std::vector<MiddEvalResult> results;
 	cv::Mat disp;
 };
 
-static void run_censussgm(MiddleburyData &data, cv::Mat &disparity) {
-	auto stereo = StereoCensusSgm();
-	stereo.params.P1 = 4;
-	stereo.params.P2 = 18;
-
-	stereo.params.d_min = data.calib.vmin;
-	stereo.params.d_max = data.calib.vmax;
-	stereo.params.subpixel = 1;
-	stereo.params.debug = false;
-	stereo.compute(data.imL, data.imR, disparity);
-}
-
-static void run_misgm(MiddleburyData &data, cv::Mat &disparity) {
-	auto stereo = StereoMiSgm();
-	stereo.params.P1 = 4;
-	stereo.params.P2 = 18;
-
-	stereo.params.d_min = data.calib.vmin;
-	stereo.params.d_max = data.calib.vmax;
-	stereo.params.subpixel = 1;
-	stereo.params.debug = false;
-	stereo.compute(data.imL, data.imR, disparity);
-	stereo.setPrior(disparity);
-	stereo.compute(data.imL, data.imR, disparity);
-}
-
-
-static void run_gtsgm(MiddleburyData &data, cv::Mat &disparity) {
-	auto stereo = StereoGtSgm();
-	stereo.params.P1 = 0.1;
-	stereo.params.P2 = 1.0;
-
-	stereo.params.d_min = data.calib.vmin;
-	stereo.params.d_max = data.calib.vmax;
-	stereo.params.subpixel = 1;
-	stereo.params.debug = true;
-	stereo.setPrior(data.gtL);
-	stereo.compute(data.imL, data.imR, disparity);
-}
-
-static const std::map<std::string, std::function<void(MiddleburyData&, cv::Mat&)>> algorithms = {
-	{ "censussgm", run_censussgm },
-	{ "misgm", run_misgm },
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-MiddleburyData load_input(const std::string &path) {
-	cv::Mat imL;
-	cv::Mat imR;
-	cv::Mat gtL;
-	cv::Mat maskL;
-
-	imL = cv::imread(path + std::string("im0.png"));
-	imR = cv::imread(path + std::string("im1.png"));
-	gtL = read_pfm(path + std::string("disp0.pfm"));
-	if (gtL.empty()) {
-		gtL = read_pfm(path + std::string("disp0GT.pfm"));
-	}
-	//maskL = cv::imread(argv[1] + std::string("mask0nocc.png"), cv::IMREAD_GRAYSCALE);
-	auto calib = read_calibration(path + std::string("calib.txt"));
-
-	maskL.create(imL.size(), CV_8UC1);
-	maskL.setTo(cv::Scalar(255));
-
-	if (imL.empty() || imR.empty() || gtL.empty() || maskL.empty()) {
-		throw std::exception();
-	}
-
-	std::string name;
-	name = path.substr(0, path.size()-1);
-	name = name.substr(name.find_last_of("/") + 1);
-
-	return {name, imL, imR, gtL, maskL, calib};
-}
-
-Result run_one(MiddleburyData &input, std::function<void(MiddleburyData&, cv::Mat&)> f) {
+Result run_one(MiddleburyData &input, Algorithm *stereo) {
 	Result result;
-	f(input, result.disp);
-	for (const float t : {4.0f, 1.0f, 0.5f, 0.25f}) {
+	stereo->run(input, result.disp);
+	for (const float t : {4.0f, 2.0f, 1.0f, 0.5f}) {
 		result.results.push_back(evaluate(result.disp, input.gtL, input.maskL, t));
 	}
 	return result;
@@ -139,7 +34,7 @@ Result run_one(MiddleburyData &input, std::string name) {
 std::map<std::string, Result> run_all(MiddleburyData &input) {
 	std::map<std::string, Result> results;
 
-	for (auto const& [name, f] : algorithms) {
+	for (auto &[name, f] : algorithms) {
 		results[name] = run_one(input, f);
 	}
 
@@ -148,15 +43,38 @@ std::map<std::string, Result> run_all(MiddleburyData &input) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+/**
+ * @param   disp    disparity map
+ * @param   out     output parameter
+ * @param   ndisp   number of disparities
+ *
+ * If ndisp is greater than zero, color mapping is scaled to range [0, ndisp],
+ * otherwise scaling is automatically set to [0, max(disp)].
+ */
+void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
+	double dmin, dmax;
+	cv::minMaxLoc(disp, &dmin, &dmax);
+	cv::Mat dispf, dispc;
+	disp.convertTo(dispf, CV_32FC1);
+	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
+
+	#if OPENCV_VERSION >= 40102
+	cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
+	#else
+	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+	#endif
+}
+
 cv::Mat visualize_error(const cv::Mat &disp, const cv::Mat &gt, float t) {
 	cv::Mat err(gt.size(), CV_32FC1);
 	cv::Mat err_color(gt.size(), CV_8UC1);
+	float bad_t = 0.75;
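+	// Offset valid errors by bad_t so that 0 in the colour map stays reserved
+	// for holes (disp == 0).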
 
 	cv::absdiff(gt, disp, err);
 	err.setTo(t, err > t);
-	err += 1.0f;
+	err += bad_t;
 	err.setTo(0, disp == 0.0);
-	err.convertTo(err, CV_8UC1, 255.0f/(t+1.0f));
+	err.convertTo(err, CV_8UC1, 255.0f/(t+bad_t));
 	cv::applyColorMap(err, err_color, cv::COLORMAP_PLASMA);
 
 	return err_color;
@@ -167,8 +85,12 @@ void show_result(std::string name, const MiddleburyData &data, const Result &res
 	cv::Mat disp_color(data.gtL.size(), CV_8UC1);
 
 	colorize(result.disp, disp_color, data.calib.ndisp);
-	cv::imshow(data.name + " (disparity) - " + name, disp_color);
-	cv::imshow(data.name + " (error) - " + name, err_color);
+	cv::Mat tmp;
+	float scale = 1280.0f / float(disp_color.cols);
+	cv::resize(disp_color, tmp, cv::Size(disp_color.cols*scale, disp_color.rows*scale));
+	cv::imshow(data.name + " (disparity) - " + name, tmp);
+	cv::resize(err_color, tmp, cv::Size(err_color.cols*scale, err_color.rows*scale));
+	cv::imshow(data.name + " (error) - " + name, tmp);
 
 	printf("\n%s: %s\n", name.c_str(), data.name.c_str());
 	int nt = result.results.size();
@@ -193,20 +115,139 @@ void show_result(std::string name, const MiddleburyData &data, const Result &res
 	printf("\n");
 
 	printf("%-8s", "RMSE");
-	for (auto res : result.results) { printf("%8.2fpx ", res.rms_good); }
+	for (auto res : result.results) { printf("%8.2fpx ", sqrtf(res.mse_good)); }
 	printf("\n");
+
+	printf("%s: %2.2fpx, %s: %2.2fpx\n",	"avgerr", result.results[0].avgerr,
+											"RMSE (total)", sqrtf(result.results[0].mse_total));
 }
 
-void show_results( const MiddleburyData &data, const std::map<std::string, Result> &results) {
-	for (auto const& [name, result] : results) {
-		show_result(name, data, result);
-	}
+void save_result(std::string name, const MiddleburyData &data, const Result &result) {
+	cv::Mat err_color = visualize_error(result.disp, data.gtL, 4.0);
+	cv::Mat disp_color(data.gtL.size(), CV_8UC1);
+	colorize(result.disp, disp_color, data.calib.ndisp);
+	cv::imwrite(data.name + "_" + name + "_disp.png", disp_color);
+	cv::imwrite(data.name + "_" + name + "_err.png", err_color);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-using cv::Mat;
-using std::vector;
+/**
+ * Calculate disparities and errors for all Middlebury pairs in paths using
+ * Algorithms in algorithms (if not specified, uses all available algorithms).
+ * Results are visualized and error values printed to stdout. The
+ * visualizations can be saved to the working directory.
+ */
+void main_default(const std::vector<std::string> &paths,
+		const std::vector<std::string> &algorithms) {
+
+	std::vector<std::pair<MiddleburyData, std::map<std::string, Result>>> results;
+
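+	// Use the last available CUDA device.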
+	int devices=0;
+	cudaGetDeviceCount(&devices);
+	cudaSetDevice(devices-1);
+
+	for (const auto &dir : paths) {
+		auto input = load_input(dir);
+		std::map<std::string, Result> result;
+
+		if (algorithms.size() == 0) {
+			result = run_all(input);
+		}
+		else {
+			for (const auto &algorithm : algorithms) {
+				result[algorithm] = run_one(input, algorithm);
+			}
+		}
+
+		results.push_back(std::make_pair(input, result));
+	}
+
+	for (const auto &[input, data] : results) {
+		for (const auto &[name, result] : data) {
+			show_result(name, input, result);
+		}
+	}
+
+	printf("\nPress 's' to save (visualizations to working directory), ESC to exit\n");
+
+	while(uchar k = cv::waitKey()) {
+		if (k == 's') {
+			for (const auto &[input, data] : results) {
+				for (const auto &[name, result] : data) {
+					save_result(name, input, result);
+				}
+			}
+			std::cout << "Saved\n";
+		}
+		else if (k == 27) {
+			return;
+		}
+	}
+}
+
+/**
+ * Perform a grid search over P1 and P2 (with the constraint P2 >= P1),
+ * minimizing total (R)MSE over the whole dataset. Should be used with masked
+ * ground truth, since occluded areas are not useful.
+ */
+void main_gridsearch_p1p2(const std::vector<std::string> &paths,
+		const std::vector<std::string> &algorithms_str,
+		float P1_min=0, float P1_max=128, float P2_max=256, float step=1.0) {
+
+	// Expects path to dataset directory. TODO: If empty, try loading paths
+	// separately (assume each path points to directory containing one dataset
+	// image pair)
+	auto dataset = load_directory(paths.at(0));
+
+	for (auto s : algorithms_str) {
+		Algorithm* stereo = algorithms.at(s);
+		// density after the l/r check would affect MSE; the mask should be
+		// used to ignore occlusions
+		stereo->lr_consistency = false;
+		float err_min = INFINITY;
+		float best_P1 = 0.0f;
+		float best_P2 = 0.0f;
+
+		// first axis: P1, second axis P2
+		std::vector<std::vector<float>> search_results(P1_max/step);
+		for (auto &row : search_results) {
+			row = std::vector<float>(P2_max/step, 0.0f);
+		}
+
+		for (float P1 = P1_min; P1 < P1_max; P1 += step) {
+			for (float P2 = P1; P2 < P2_max; P2 += step) { // constraint: P2 >= P1
+
+				stereo->P1 = P1;
+				stereo->P2 = P2;
+				float mse = 0.0;
+				float n = 0.0;
+
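+				// Combine per-image results as a pixel-weighted average so the
+				// aggregate equals the MSE over all evaluated (valid) pixels.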
+				for (auto &input : dataset) {
+					auto res = run_one(input, stereo);
+					auto &res2 = res.results[2];
+
+					float n_update = res2.n - res2.invalid;
+					n += n_update;
+					mse += res2.mse_total*n_update;
+				}
+				mse /= n;
+				search_results.at((P1 - P1_min)/step)
+							  .at((P2 - P1_min)/step) = mse;
+
+				if (mse < err_min) {
+					//printf("RMSE: %.4f (P1: %.0f, P2: %.0f)\n", sqrtf(mse), P1, P2);
+					err_min = mse;
+					best_P1 = P1;
+					best_P2 = P2;
+				}
+			}
+		}
+
+		printf("Best parameters for %s: P1=%.0f, P2=%.0f\n", s.c_str(), best_P1, best_P2);
+		// TODO: save full results in csv (heatmap plot?)
+	}
+}
 
 int main(int argc, char* argv[]) {
 	#if USE_GPU
@@ -215,28 +256,55 @@ int main(int argc, char* argv[]) {
 	std::cout << "CPU VERSION\n";
 	#endif
 
-	if (argc < 2) {
-		std::cerr << "usage: path [algorithm]\n";
-		return 1;
+	std::vector<std::string> paths;
+	std::vector<std::string> algorithms;
+	char test_type = 'd';
+	int opt;
+
+	while ((opt = getopt(argc, argv, "d:t:a:")) != -1) {
+		switch (opt) {
+			case 'a':
+				algorithms.push_back(optarg);
+				break;
+
+			case 'd':
+				paths.push_back(optarg);
+				break;
+
+			case 't':
+				test_type = optarg[0];
+				break;
+
+			default: /* '?' */
+				printf("Usage: %s [-d directory] [-t test type] [-a algorithm]\n", argv[0]);
+				exit(EXIT_FAILURE);
+		}
 	}
 
-	if (argc == 2) {
-		auto input = load_input(argv[1]);
-		auto results = run_all(input);
-		show_results(input, results);
-		while(cv::waitKey() != 27);
-		return 0;
-	}
-	else if (argc == 3) {
-		auto input = load_input(argv[1]);
-		auto result = run_one(input, argv[2]);
-		show_result(argv[2], input, result);
-		while(cv::waitKey() != 27);
-		return 0;
+	if (paths.size() == 0) {
+		printf("At least one directory required.\n");
+		exit(EXIT_FAILURE);
 	}
 
-	// todo: save results
+	switch (test_type) {
+		case 'd':
+			/* Default: calculate disparities with given algorithms (or all if
+			 * not specified) and visualize results */
+			main_default(paths, algorithms);
+			exit(EXIT_SUCCESS);
+			break;
+
+		case 'g':
+			/* Grid search P1 and P2 for given algorithms
+			 * (parse additional options using getopt again?) */
+			main_gridsearch_p1p2(paths, algorithms);
+			exit(EXIT_SUCCESS);
+			break;
+
+		default:
+			printf("Unknown test type %c\n", test_type);
+			exit(EXIT_FAILURE);
+	}
 
-	std::cerr << "error";
-	return 1;
+	return 0;
 }
diff --git a/lib/libstereo/middlebury/middlebury.cpp b/lib/libstereo/middlebury/middlebury.cpp
index e9c607c57030d76cea584c3954935ff2d8431d51..34d2892409f984d78052de944d3aaf1cd0a61b16 100644
--- a/lib/libstereo/middlebury/middlebury.cpp
+++ b/lib/libstereo/middlebury/middlebury.cpp
@@ -8,6 +8,10 @@
 #include <vector>
 
 #include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
 
 cv::Mat read_pfm(const std::string &filename) {
 	cv::Mat im;
@@ -114,11 +118,65 @@ MiddEvalCalib read_calibration(const std::string &filename) {
 		}
 	}
 
+	if (calib.vmax == 0) {
+		calib.vmax = calib.ndisp;
+	}
+
 	return calib;
 }
 
+MiddleburyData load_input(const fs::path &path) {
+	cv::Mat imL;
+	cv::Mat imR;
+	cv::Mat gtL;
+	cv::Mat maskL;
+
+	imL = cv::imread(path/"im0.png");
+	imR = cv::imread(path/"im1.png");
+	gtL = read_pfm(path/"disp0.pfm");
+	if (gtL.empty()) {
+		gtL = read_pfm(path/std::string("disp0GT.pfm"));
+	}
+	if (gtL.empty()) {
+		gtL.create(imL.size(), CV_32FC1);
+		gtL.setTo(cv::Scalar(0.0f));
+	}
+
+	maskL = cv::imread(path/std::string("mask0nocc.png"), cv::IMREAD_GRAYSCALE);
+	if (maskL.empty()) {
+		maskL.create(imL.size(), CV_8UC1);
+		maskL.setTo(cv::Scalar(255));
+	}
+
+	auto calib = read_calibration(path/std::string("calib.txt"));
+	if (imL.empty() || imR.empty() || gtL.empty() || maskL.empty()) {
+		throw std::exception();
+	}
+
+	std::string name = path.filename() == "." ?
+		path.parent_path().filename().c_str() :
+		path.filename().c_str();
+
+	return {name, imL, imR, gtL, maskL, calib};
+}
+
+MiddleburyData load_input(const std::string &path) {
+	return load_input(fs::path(path));
+}
+
+std::vector<MiddleburyData> load_directory(const std::string &path) {
+	std::vector<MiddleburyData> res;
+	for(auto& p: fs::directory_iterator(path)) {
+		try {
+			res.push_back(load_input(p.path()));
+		}
+		catch (...) {}
+	}
+	return res;
+}
+
 MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gtdisp, const cv::Mat &mask, float threshold) {
-	MiddEvalResult result {0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+	MiddEvalResult result {0, 0, 0, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
 	if (gtdisp.type() != CV_32FC1) { throw std::exception(); }
 	if (disp.type() != CV_32FC1) { throw std::exception(); }
 	if (!mask.empty() && mask.type() != CV_8UC1) { throw std::exception(); }
@@ -134,9 +192,9 @@ MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gtdisp, const cv::Ma
 	int good = 0;
 	int &invalid = result.invalid;
 
-	float &rms_good = result.rms_good;
-	float &rms_bad = result.rms_bad;
-
+	float &mse_good = result.mse_good;
+	float &mse_bad = result.mse_bad;
+	float &mse_total = result.mse_total;
 	float serr = 0;
 
 	// evaldisp.cpp from middlebury SDK
@@ -161,12 +219,14 @@ MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gtdisp, const cv::Ma
 			n++;
 			if (valid) {
 				serr += err;
+				mse_total += err*err;
+
 				if (err > threshold) {
 					bad++;
-					rms_bad += err*err;
+					mse_bad += err*err;
 				}
 				else {
-					rms_good += err*err;
+					mse_good += err*err;
 					good++;
 				}
 			} else {// invalid (i.e. hole in sparse disp map)
@@ -176,12 +236,13 @@ MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gtdisp, const cv::Ma
 	}
 	}
 
-	rms_bad = sqrtf(rms_bad/float(bad));
-	rms_good = sqrtf(rms_good/float(good));
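+	// Store plain MSE (no square root) so callers can pixel-weight and average
+	// across images; take sqrtf() at display time for RMSE.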
+	mse_bad = mse_bad/float(bad);
+	mse_good = mse_good/float(good);
+	mse_total = mse_total/float(n - invalid);
+	result.avgerr = serr/float(n - invalid);
 	result.err_bad = float(bad)/float(n);
 	result.err_invalid = float(invalid)/float(n);
 	result.err_total = float(bad+invalid)/float(n);
-	float avgErr = serr / float(n - invalid);
 	return result;
 }
 
diff --git a/lib/libstereo/middlebury/middlebury.hpp b/lib/libstereo/middlebury/middlebury.hpp
index 3355860eec91bc4f6f25bf0ae9e6d46156a983da..0816fa0f5ba15fc97abc6a64077b873db6f3033e 100644
--- a/lib/libstereo/middlebury/middlebury.hpp
+++ b/lib/libstereo/middlebury/middlebury.hpp
@@ -9,8 +9,10 @@ struct MiddEvalResult {
 	float err_bad;
 	float err_invalid;
 	float err_total;
-	float rms_bad;
-	float rms_good;
+	float mse_bad;
+	float mse_good;
+	float mse_total;
+	float avgerr;
 	float threshold;
 };
 
@@ -33,6 +35,25 @@ MiddEvalResult evaluate(const cv::Mat &disp, const cv::Mat &gt, const cv::Mat &m
 
 cv::Mat read_pfm(const std::string &filename);
 
+struct MiddleburyData {
+	const std::string name;
+	const cv::Mat imL;
+	const cv::Mat imR;
+	const cv::Mat gtL;
+	const cv::Mat maskL;
+	const MiddEvalCalib calib;
+};
+
+/** Load one Middlebury dataset entry (image pair, ground truth, mask and calibration)
+ * @param path path to image directory
+ */
+MiddleburyData load_input(const std::string &path);
+
+/** Load all Middlebury dataset entries
+ * @param path path to Middlebury dataset directory
+ */
+std::vector<MiddleburyData> load_directory(const std::string &path);
+
 /**
  * Add gaussian noise to image.
  *
diff --git a/lib/libstereo/src/aggregations/adaptive_penalty.hpp b/lib/libstereo/src/aggregations/adaptive_penalty.hpp
index a978d8e36604cbb081a6997a824a257de9b4d2d0..2be27490c0818f3a0db477321a9849a0d7b0afcd 100644
--- a/lib/libstereo/src/aggregations/adaptive_penalty.hpp
+++ b/lib/libstereo/src/aggregations/adaptive_penalty.hpp
@@ -17,13 +17,13 @@ struct AdaptivePenaltySGM {
 	const DSIIN in;
 	typename Array2D<costtype_t>::Data min_cost_all;
 
-	const int P1;
+	const costtype_t P1;
 
 	// Provided internally
 	typename DisparitySpaceImage<costtype_t>::DataType out;
 	typename DisparitySpaceImage<costtype_t>::DataType previous;
 	typename DisparitySpaceImage<costtype_t>::DataType updates;
-	typename Array2D<uint8_t>::Data penalties;
+	typename Array2D<costtype_t>::Data penalties;
 
 	struct PathData : BasePathData<costtype_t> {
 		// Custom path data goes here...
@@ -35,7 +35,7 @@ struct AdaptivePenaltySGM {
 	struct DirectionData {
 		DisparitySpaceImage<costtype_t> previous;
 		DisparitySpaceImage<costtype_t> updates;
-		Array2D<uint8_t> penalties;
+		Array2D<costtype_t> penalties;
 	};
 
 	/* Initialise buffers for a new path direction. */
diff --git a/lib/libstereo/src/aggregations/cp_sgm.hpp b/lib/libstereo/src/aggregations/cp_sgm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..15efbad15bbaf9e41c5a435ea892a97828afa4da
--- /dev/null
+++ b/lib/libstereo/src/aggregations/cp_sgm.hpp
@@ -0,0 +1,128 @@
+#ifndef _FTL_LIBSTEREO_AGGREGATIONS_CONFPRIOR_HPP_
+#define _FTL_LIBSTEREO_AGGREGATIONS_CONFPRIOR_HPP_
+
+#include "../dsi.hpp"
+#include "../array2d.hpp"
+
+namespace ftl {
+namespace stereo {
+namespace aggregations {
+
+template <typename DSIIN>
+struct ConfidencePriorSGM {
+	typedef typename DSIIN::Type Type;
+	typedef typename DSIIN::Type costtype_t;
+
+	// Provided externally
+	const DSIIN in;
+	typename Array2D<costtype_t>::Data min_cost_all;
+	typename Array2D<uchar>::Data conf_prior;
+
+	const costtype_t P1;
+	const costtype_t P2;
+
+	// Provided internally
+	typename DisparitySpaceImage<costtype_t>::DataType out;
+	typename DisparitySpaceImage<costtype_t>::DataType previous;
+	typename DisparitySpaceImage<costtype_t>::DataType updates;
+
+	struct PathData : BasePathData<costtype_t> {
+		// Custom path data goes here...
+		costtype_t previous_cost_min;
+		costtype_t *previous;
+		costtype_t *current;
+	};
+
+	struct DirectionData {
+		DisparitySpaceImage<costtype_t> previous;
+		DisparitySpaceImage<costtype_t> updates;
+	};
+
+	/* Initialise buffers for a new path direction. */
+	void direction(DirectionData &data, DisparitySpaceImage<costtype_t> &buffer) {
+		out = buffer.data();
+		data.previous.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		data.updates.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		previous = data.previous.data();
+		updates = data.updates.data();
+	}
+
+	/* Reset buffers to start a new path */
+	__host__ __device__ inline void startPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		data.previous = &previous(0,data.pathix,previous.disp_min);
+		data.current = &updates(0,data.pathix,updates.disp_min);
+
+		for (int d=thread; d<=previous.disp_min; d+=stride) {
+			data.previous[d] = 0;
+		}
+
+		// To ensure all threads have finished clearing the buffer
+		#ifdef __CUDA_ARCH__
+		__syncwarp();
+		#endif
+	}
+
+	__host__ __device__ inline void endPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {}
+
+	/* Main SGM cost aggregation function */
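+	/* Path recurrence (standard SGM):
+	 *   L_r(p,d) = C(p,d) - min_k L_r(p-r,k)
+	 *            + min( L_r(p-r,d),
+	 *                   L_r(p-r,d-1) + P1, L_r(p-r,d+1) + P1,
+	 *                   min_k L_r(p-r,k) + P2 )
+	 * where `previous` holds L_r(p-r,.) and previous_cost_min = min_k L_r(p-r,k). */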
+	__host__ __device__ inline costtype_t calculateCost(ushort2 pixel, int d, costtype_t *previous, int size, costtype_t previous_cost_min) {
+		const costtype_t L_min =
+			min(previous[d],
+				min(costtype_t(previous_cost_min + P2),
+						min(costtype_t(previous[min(d+1,size)] + P1),
+									costtype_t(previous[max(d-1,0)] + P1))
+			)
+		);
+
+		// Note: This clamping to min 0 does make a tiny difference
+		auto cost = L_min + in(pixel.y,pixel.x,d+in.disp_min);
+		return (cost > previous_cost_min) ? cost - previous_cost_min : 0;
+	}
+
+	/* Stride over each disparity and calculate minimum cost */
+	__host__ __device__ inline void operator()(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		#ifndef __CUDA_ARCH__
+		using std::min;
+		using std::max;
+		#endif
+
+		const int d_stop = int(out.disp_max)-int(out.disp_min);
+
+		costtype_t min_cost = 255;
+
+		// For each disparity (striding the threads)
+		uchar p = conf_prior(pixel.y, pixel.x);
+		for (int d=thread; d<=d_stop; d+=stride) {
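+			// Confidence prior: when a prior disparity p exists (p != 0), the
+			// cost is forced to 0 at the prior disparity and to a flat P2
+			// penalty elsewhere; otherwise standard SGM aggregation is used.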
+			auto c = (p == 0) ? calculateCost(pixel, d, data.previous, d_stop, data.previous_cost_min) : (p == d+in.disp_min) ? 0 : P2;
+
+			out(pixel.y,pixel.x,d+in.disp_min) += c;
+			data.current[d] = c;
+
+			// Record the smallest disparity cost for this pixel
+			min_cost = (c < min_cost) ? c : min_cost;
+		}
+
+		// WARP Aggregate on GPU only (assumes stride = 1 on CPU)
+		// Min cost from each thread must be combined for overall minimum
+		// Each thread then obtains thread global minimum
+		#ifdef __CUDA_ARCH__
+		min_cost = warpMin(min_cost);
+		#else
+		// add assert
+		#endif
+
+		data.previous_cost_min = min_cost;
+		min_cost_all(pixel.y,pixel.x) += min_cost; // atomic?
+
+		// Swap current and previous cost buffers
+		costtype_t *tmp_ptr = const_cast<costtype_t *>(data.previous);
+		data.previous = data.current;
+		data.current = tmp_ptr;
+	}
+};
+
+}
+}
+}
+
+#endif
diff --git a/lib/libstereo/src/aggregations/standard_sgm.hpp b/lib/libstereo/src/aggregations/standard_sgm.hpp
index 2830351bcf327efda973a854029f63cec5c59bb4..32b41abcc2baab03a92905d3b87173e9fb2677cc 100644
--- a/lib/libstereo/src/aggregations/standard_sgm.hpp
+++ b/lib/libstereo/src/aggregations/standard_sgm.hpp
@@ -17,8 +17,8 @@ struct StandardSGM {
 	const DSIIN in;
 	typename Array2D<costtype_t>::Data min_cost_all;
 
-	const int P1;
-	const int P2;
+	const costtype_t P1;
+	const costtype_t P2;
 
 	// Provided internally
 	typename DisparitySpaceImage<costtype_t>::DataType out;
diff --git a/lib/libstereo/src/algorithms/brefcensussgm.cu b/lib/libstereo/src/algorithms/brefcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eea64c294e16957ccfa4b5e25c4256d89812fbf8
--- /dev/null
+++ b/lib/libstereo/src/algorithms/brefcensussgm.cu
@@ -0,0 +1,48 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+struct StereoBRefCensusSgm::Impl : public StereoSgm<CensusMatchingCost, StereoBRefCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<uchar> bl;
+	Array2D<uchar> br;
+
+	Impl(StereoBRefCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height),
+		bl(width, height), br(width, height) {}
+};
+
+StereoBRefCensusSgm::StereoBRefCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoBRefCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
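+	// Bilateral-filter the grayscale inputs; per the class description, the
+	// filtered images (bl, br) appear to supply the reference (centre) pixel
+	// values while the unfiltered images provide the comparison neighbourhood.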
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	cv::cuda::bilateralFilter(impl_->l.toGpuMat(), impl_->bl.toGpuMat(), 5, 50, 100);
+	cv::cuda::bilateralFilter(impl_->r.toGpuMat(), impl_->br.toGpuMat(), 5, 50, 100);
+
+	impl_->cost.set(impl_->bl, impl_->br, impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoBRefCensusSgm::~StereoBRefCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/censussgm.cu b/lib/libstereo/src/algorithms/censussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3c15e5275f2eae48a07f2f7758c77a7fdb333c97
--- /dev/null
+++ b/lib/libstereo/src/algorithms/censussgm.cu
@@ -0,0 +1,40 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+
+struct StereoCensusSgm::Impl : public StereoSgm<CensusMatchingCost, StereoCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.setPattern(params.pattern);
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoCensusSgm::~StereoCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/excensussgm.cu b/lib/libstereo/src/algorithms/excensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4ea34333e0358145fca859c06fd7e41048004174
--- /dev/null
+++ b/lib/libstereo/src/algorithms/excensussgm.cu
@@ -0,0 +1,101 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+#include "costs/scale.hpp"
+
+#include <opencv2/cudafilters.hpp>
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+
+typedef unsigned short CostType;
+typedef WeightedCost<ExpandingCensusMatchingCost, CostType> MatchingCost;
+
+
+struct StereoExCensusSgm::Impl : public StereoSgm<MatchingCost, StereoExCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<float> variance;
+	Array2D<float> variance_r;
+	ExpandingCensusMatchingCost excensus;
+
+	Impl(StereoExCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height),
+		variance(width,height), variance_r(width,height),
+		excensus(width, height, dmin, dmax) {
+			cost.setCost(excensus);
+			cost.setWeights(variance, variance_r);
+		}
+};
+
+StereoExCensusSgm::StereoExCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoExCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
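+	// The expanding census cost is weighted per pixel by a local-variance term
+	// normalised to [alpha, beta] (see variance_mask above and cost.setWeights in Impl).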
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	cv::cuda::GpuMat var_l = impl_->variance.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_l, params.var_window);
+	cv::cuda::GpuMat var_r = impl_->variance_r.toGpuMat();
+	variance_mask(impl_->r.toGpuMat(), var_r, params.var_window);
+
+	cv::cuda::normalize(var_l, var_l, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::normalize(var_r, var_r, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	impl_->excensus.setPattern(params.pattern);
+	impl_->excensus.set(impl_->l, impl_->r);
+	impl_->cost.set();
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+
+	Array2D<ExpandingCensusMatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(impl_->cost, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoExCensusSgm::~StereoExCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/hcensussgm.cu b/lib/libstereo/src/algorithms/hcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7cd4c00d799f339a1bdbcadf216661b95ff6b663
--- /dev/null
+++ b/lib/libstereo/src/algorithms/hcensussgm.cu
@@ -0,0 +1,131 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+#include "../costs/dual.hpp"
+#include <opencv2/cudawarping.hpp>
+#include <opencv2/cudafilters.hpp>
+#include <opencv2/highgui.hpp>
+
+typedef MultiCostsWeighted<MiniCensusMatchingCost,3> MatchingCost;
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+struct StereoHierCensusSgm::Impl : public StereoSgm<MatchingCost, StereoHierCensusSgm::Parameters> {
+	MiniCensusMatchingCost cost_fine;
+	MiniCensusMatchingCost cost_medium;
+	MiniCensusMatchingCost cost_coarse;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<float> var_fine;
+	Array2D<float> var_medium;
+	Array2D<float> var_coarse;
+
+	Impl(StereoHierCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax),
+		cost_fine(width, height, dmin, dmax),
+		cost_medium(width, height, dmin, dmax),
+		cost_coarse(width, height, dmin, dmax),
+		l(width, height), r(width, height),
+		var_fine(width, height),
+		var_medium(width, height),
+		var_coarse(width, height) {
+			cost.add(0, cost_fine, var_fine);
+			cost.add(1, cost_medium, var_medium);
+			cost.add(2, cost_coarse, var_coarse);
+		}
+};
+
+StereoHierCensusSgm::StereoHierCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoHierCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
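+	// Three census costs: one on the full-resolution pair and two whose
+	// neighbourhoods are sampled from 1/4 and 1/6 scale images; they are
+	// blended per pixel by variance-derived weights (MultiCostsWeighted).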
+	timer_set();
+
+	static constexpr int DOWNSCALE_MEDIUM = 4;
+	static constexpr int DOWNSCALE_COARSE = 6;
+
+	Array2D<uchar> medium_l(l.cols()/DOWNSCALE_MEDIUM, l.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> medium_r(r.cols()/DOWNSCALE_MEDIUM, r.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> coarse_l(l.cols()/DOWNSCALE_COARSE, l.rows()/DOWNSCALE_COARSE);
+	Array2D<uchar> coarse_r(r.cols()/DOWNSCALE_COARSE, r.rows()/DOWNSCALE_COARSE);
+	cv::cuda::resize(impl_->l.toGpuMat(), medium_l.toGpuMat(), cv::Size(medium_l.width, medium_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), medium_r.toGpuMat(), cv::Size(medium_r.width, medium_r.height));
+	cv::cuda::resize(impl_->l.toGpuMat(), coarse_l.toGpuMat(), cv::Size(coarse_l.width, coarse_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), coarse_r.toGpuMat(), cv::Size(coarse_r.width, coarse_r.height));
+
+	cv::cuda::GpuMat var_fine = impl_->var_fine.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_fine, params.var_window);
+	cv::cuda::normalize(var_fine, var_fine, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	cv::cuda::GpuMat var_medium; // = impl_->var_medium.toGpuMat();
+	variance_mask(medium_l.toGpuMat(), var_medium, params.var_window);
+	cv::cuda::normalize(var_medium, var_medium, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_medium, impl_->var_medium.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::cuda::GpuMat var_coarse; // = impl_->var_coarse.toGpuMat();
+	variance_mask(coarse_l.toGpuMat(), var_coarse, params.var_window);
+	cv::cuda::normalize(var_coarse, var_coarse, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_coarse, impl_->var_coarse.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::Mat tmp;
+	impl_->var_coarse.toGpuMat().download(tmp);
+	cv::imshow("Var", tmp);
+
+	// CT
+	impl_->cost_fine.set(impl_->l, impl_->r);
+	impl_->cost_medium.set(impl_->l, impl_->r, medium_l, medium_r);
+	impl_->cost_coarse.set(impl_->l, impl_->r, coarse_l, coarse_r);
+	impl_->cost.set();
+	impl_->compute(disparity);
+
+	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(impl_->cost, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoHierCensusSgm::~StereoHierCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/hstablesgm.cu b/lib/libstereo/src/algorithms/hstablesgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f7e9732ea8e331e558ad33333e799f4507ed3808
--- /dev/null
+++ b/lib/libstereo/src/algorithms/hstablesgm.cu
@@ -0,0 +1,129 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/stable.hpp"
+#include "../costs/dual.hpp"
+#include <opencv2/cudawarping.hpp>
+#include <opencv2/cudafilters.hpp>
+#include <opencv2/highgui.hpp>
+
+typedef MultiCostsWeighted<StableMatchingCost,3> MatchingCost;
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+struct StereoHStableSgm::Impl : public StereoSgm<MatchingCost, StereoHStableSgm::Parameters> {
+	StableMatchingCost cost_fine;
+	StableMatchingCost cost_medium;
+	StableMatchingCost cost_coarse;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<float> var_fine;
+	Array2D<float> var_medium;
+	Array2D<float> var_coarse;
+
+	Impl(StereoHStableSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax),
+		cost_fine(width, height, dmin, dmax),
+		cost_medium(width, height, dmin, dmax),
+		cost_coarse(width, height, dmin, dmax),
+		l(width, height), r(width, height),
+		var_fine(width, height),
+		var_medium(width, height),
+		var_coarse(width, height) {
+			cost.add(0, cost_fine, var_fine);
+			cost.add(1, cost_medium, var_medium);
+			cost.add(2, cost_coarse, var_coarse);
+		}
+};
+
+StereoHStableSgm::StereoHStableSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoHStableSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
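+	// STABLE costs at full, 1/4 and 1/6 resolution, blended per pixel by
+	// variance-derived weights; all three share the filter mask generated below.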
+	timer_set();
+
+	static constexpr int DOWNSCALE_MEDIUM = 4;
+	static constexpr int DOWNSCALE_COARSE = 6;
+
+	Array2D<uchar> medium_l(l.cols()/DOWNSCALE_MEDIUM, l.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> medium_r(r.cols()/DOWNSCALE_MEDIUM, r.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> coarse_l(l.cols()/DOWNSCALE_COARSE, l.rows()/DOWNSCALE_COARSE);
+	Array2D<uchar> coarse_r(r.cols()/DOWNSCALE_COARSE, r.rows()/DOWNSCALE_COARSE);
+	cv::cuda::resize(impl_->l.toGpuMat(), medium_l.toGpuMat(), cv::Size(medium_l.width, medium_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), medium_r.toGpuMat(), cv::Size(medium_r.width, medium_r.height));
+	cv::cuda::resize(impl_->l.toGpuMat(), coarse_l.toGpuMat(), cv::Size(coarse_l.width, coarse_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), coarse_r.toGpuMat(), cv::Size(coarse_r.width, coarse_r.height));
+
+	cv::cuda::GpuMat var_fine = impl_->var_fine.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_fine, params.var_window);
+	cv::cuda::normalize(var_fine, var_fine, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	cv::cuda::GpuMat var_medium; // = impl_->var_medium.toGpuMat();
+	variance_mask(medium_l.toGpuMat(), var_medium, params.var_window);
+	cv::cuda::normalize(var_medium, var_medium, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_medium, impl_->var_medium.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::cuda::GpuMat var_coarse; // = impl_->var_coarse.toGpuMat();
+	variance_mask(coarse_l.toGpuMat(), var_coarse, params.var_window);
+	cv::cuda::normalize(var_coarse, var_coarse, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_coarse, impl_->var_coarse.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::Mat tmp;
+	impl_->var_coarse.toGpuMat().download(tmp);
+	cv::imshow("Var", tmp);
+
+	impl_->cost_fine.generateFilterMask(params.wsize, 16);
+	impl_->cost_medium.setFilter(impl_->cost_fine.getFilter());
+	impl_->cost_coarse.setFilter(impl_->cost_fine.getFilter());
+	impl_->cost_fine.set(impl_->l, impl_->r);
+	impl_->cost_medium.set(medium_l, medium_r, l.cols(), l.rows());
+	impl_->cost_coarse.set(coarse_l, coarse_r, l.cols(), l.rows());
+	impl_->cost.set();
+	impl_->compute(disparity);
+}
+
+StereoHStableSgm::~StereoHStableSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/hwcensussgm.cu b/lib/libstereo/src/algorithms/hwcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1a42e86092e34301eadcc7637d7fd8e07f886a0
--- /dev/null
+++ b/lib/libstereo/src/algorithms/hwcensussgm.cu
@@ -0,0 +1,131 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+#include "../costs/dual.hpp"
+#include <opencv2/cudawarping.hpp>
+#include <opencv2/cudafilters.hpp>
+#include <opencv2/highgui.hpp>
+
+typedef MultiCostsWeighted<WeightedCensusMatchingCost,3> MatchingCost;
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+struct StereoHierWeightCensusSgm::Impl : public StereoSgm<MatchingCost, StereoHierWeightCensusSgm::Parameters> {
+	WeightedCensusMatchingCost cost_fine;
+	WeightedCensusMatchingCost cost_medium;
+	WeightedCensusMatchingCost cost_coarse;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<float> var_fine;
+	Array2D<float> var_medium;
+	Array2D<float> var_coarse;
+
+	Impl(StereoHierWeightCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax),
+		cost_fine(width, height, dmin, dmax),
+		cost_medium(width, height, dmin, dmax),
+		cost_coarse(width, height, dmin, dmax),
+		l(width, height), r(width, height),
+		var_fine(width, height),
+		var_medium(width, height),
+		var_coarse(width, height) {
+			cost.add(0, cost_fine, var_fine);
+			cost.add(1, cost_medium, var_medium);
+			cost.add(2, cost_coarse, var_coarse);
+		}
+};
+
+StereoHierWeightCensusSgm::StereoHierWeightCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoHierWeightCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
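+	// As in hcensussgm.cu, but using the ring-weighted census cost at each of
+	// the three scales.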
+	timer_set();
+
+	static constexpr int DOWNSCALE_MEDIUM = 4;
+	static constexpr int DOWNSCALE_COARSE = 6;
+
+	Array2D<uchar> medium_l(l.cols()/DOWNSCALE_MEDIUM, l.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> medium_r(r.cols()/DOWNSCALE_MEDIUM, r.rows()/DOWNSCALE_MEDIUM);
+	Array2D<uchar> coarse_l(l.cols()/DOWNSCALE_COARSE, l.rows()/DOWNSCALE_COARSE);
+	Array2D<uchar> coarse_r(r.cols()/DOWNSCALE_COARSE, r.rows()/DOWNSCALE_COARSE);
+	cv::cuda::resize(impl_->l.toGpuMat(), medium_l.toGpuMat(), cv::Size(medium_l.width, medium_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), medium_r.toGpuMat(), cv::Size(medium_r.width, medium_r.height));
+	cv::cuda::resize(impl_->l.toGpuMat(), coarse_l.toGpuMat(), cv::Size(coarse_l.width, coarse_l.height));
+	cv::cuda::resize(impl_->r.toGpuMat(), coarse_r.toGpuMat(), cv::Size(coarse_r.width, coarse_r.height));
+
+	cv::cuda::GpuMat var_fine = impl_->var_fine.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_fine, params.var_window);
+	cv::cuda::normalize(var_fine, var_fine, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	cv::cuda::GpuMat var_medium; // = impl_->var_medium.toGpuMat();
+	variance_mask(medium_l.toGpuMat(), var_medium, params.var_window);
+	cv::cuda::normalize(var_medium, var_medium, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_medium, impl_->var_medium.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::cuda::GpuMat var_coarse; // = impl_->var_coarse.toGpuMat();
+	variance_mask(coarse_l.toGpuMat(), var_coarse, params.var_window);
+	cv::cuda::normalize(var_coarse, var_coarse, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::resize(var_coarse, impl_->var_coarse.toGpuMat(), cv::Size(l.cols(), l.rows()));
+
+	cv::Mat tmp;
+	impl_->var_coarse.toGpuMat().download(tmp);
+	cv::imshow("Var", tmp);
+
+	// CT
+	impl_->cost_fine.set(impl_->l, impl_->r);
+	impl_->cost_medium.set(impl_->l, impl_->r, medium_l, medium_r);
+	impl_->cost_coarse.set(impl_->l, impl_->r, coarse_l, coarse_r);
+	impl_->cost.set();
+	impl_->compute(disparity);
+
+	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(impl_->cost, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoHierWeightCensusSgm::~StereoHierWeightCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/meancensussgm.cu b/lib/libstereo/src/algorithms/meancensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7058ba79f8f9432a61e8ac1e009cedd8d36ab52c
--- /dev/null
+++ b/lib/libstereo/src/algorithms/meancensussgm.cu
@@ -0,0 +1,48 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+#include <opencv2/cudafilters.hpp>
+
+struct StereoMeanCensusSgm::Impl : public StereoSgm<CensusMatchingCost, StereoMeanCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<uchar> ml;
+	Array2D<uchar> mr;
+
+	Impl(StereoMeanCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height),
+		ml(width, height), mr(width, height) {}
+};
+
+StereoMeanCensusSgm::StereoMeanCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoMeanCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
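+	// The census transform is applied to 5x5 box-filtered (mean) images rather
+	// than the raw intensities.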
+
+	auto filter = cv::cuda::createBoxFilter(CV_8UC1, CV_8UC1, cv::Size(5,5));
+	filter->apply(impl_->l.toGpuMat(), impl_->ml.toGpuMat());   // E[X]
+	filter->apply(impl_->r.toGpuMat(), impl_->mr.toGpuMat());   // E[X]
+
+	impl_->cost.set(impl_->ml, impl_->mr);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoMeanCensusSgm::~StereoMeanCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/stablesgm.cu b/lib/libstereo/src/algorithms/stablesgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..efaf20a289005f51c80711af4f6735efb2e8196c
--- /dev/null
+++ b/lib/libstereo/src/algorithms/stablesgm.cu
@@ -0,0 +1,40 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/stable.hpp"
+
+struct StereoStableSgm::Impl : public StereoSgm<StableMatchingCost, StereoStableSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoStableSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoStableSgm::StereoStableSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoStableSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.generateFilterMask(params.wsize, 16); // hardcoded in implementation
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoStableSgm::~StereoStableSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/stereosgm.hpp b/lib/libstereo/src/algorithms/stereosgm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a6369df049e51f1136320810cd62abe34c5e1d2
--- /dev/null
+++ b/lib/libstereo/src/algorithms/stereosgm.hpp
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/cudastereo.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#include "median_filter.hpp"
+#include "dsi_tools.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::high_resolution_clock> start;
+
+static void timer_set() {
+	start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+template<typename MatchingCostT, typename ParamsT>
+struct StereoSgm {
+	ParamsT &params;
+	MatchingCostT cost;
+	Array2D<typename MatchingCostT::Type> cost_min_paths;
+	Array2D<typename MatchingCostT::Type> uncertainty;
+
+	PathAggregator<ftl::stereo::aggregations::StandardSGM<typename MatchingCostT::DataType>> aggr;
+	WinnerTakesAll<DisparitySpaceImage<typename MatchingCostT::Type>, float> wta;
+
+	StereoSgm(ParamsT &params, int width, int height, int min_disp, int max_disp) :
+		params(params),
+		cost(width, height, min_disp, max_disp),
+		cost_min_paths(width, height),
+		uncertainty(width, height)
+		{}
+
+	void compute(cv::OutputArray disparity) {
+
+		// cost aggregation
+		ftl::stereo::aggregations::StandardSGM<typename MatchingCostT::DataType> func = {cost.data(), cost_min_paths.data(), params.P1, params.P2};
+		auto &out = aggr(func, params.paths);
+
+		cudaSafeCall(cudaDeviceSynchronize());
+		if (params.debug) { timer_print("Aggregation"); }
+
+		wta(out, params.subpixel, params.lr_consistency);
+		cudaSafeCall(cudaDeviceSynchronize());
+		if (params.debug) { timer_print("WTA"); }
+
+		// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+		// Semi-global matching: A principled derivation in terms of
+		// message passing. Lecture Notes in Computer Science (Including Subseries
+		// Lecture Notes in Artificial Intelligence and Lecture Notes in
+		// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+
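+		// Uncertainty check: a disparity is rejected (set to 0) when its winning
+		// cost exceeds the minimum aggregated path cost by more than params.uniqueness.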
+		#if USE_GPU
+		cv::cuda::GpuMat uncertainty_gpumat = uncertainty.toGpuMat();
+		cv::cuda::subtract(wta.min_cost.toGpuMat(), cost_min_paths.toGpuMat(), uncertainty_gpumat);
+		cv::cuda::compare(uncertainty_gpumat, params.uniqueness, uncertainty_gpumat, cv::CMP_GT);
+		wta.disparity.toGpuMat().setTo(0, uncertainty_gpumat);
+		#else
+		cv::Mat uncertainty_mat = uncertainty.toMat();
+		cv::subtract(wta.min_cost.toMat(), cost_min_paths.toMat(), uncertainty_mat);
+		cv::compare(uncertainty_mat, params.uniqueness, uncertainty_mat, cv::CMP_GT);
+		wta.disparity.toMat().setTo(0, uncertainty_mat);
+		#endif
+
+		median_filter(wta.disparity, disparity);
+		if (params.debug) { timer_print("median filter"); }
+	}
+};
diff --git a/lib/libstereo/src/algorithms/tcensussgm.cu b/lib/libstereo/src/algorithms/tcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a449937a13936b0a3d7f141cf36ac5ef44e4ad1
--- /dev/null
+++ b/lib/libstereo/src/algorithms/tcensussgm.cu
@@ -0,0 +1,40 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/tcensus.hpp"
+
+struct StereoTCensusSgm::Impl : public StereoSgm<TCensusMatchingCost, StereoTCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoTCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoTCensusSgm::StereoTCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoTCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	timer_set();
+
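+	// params.t presumably sets the comparison threshold of the ternary census
+	// (see costs/tcensus.hpp).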
+	impl_->cost.setT(params.t);
+	impl_->cost.set(impl_->l, impl_->r);
+	impl_->compute(disparity);
+}
+
+StereoTCensusSgm::~StereoTCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/costs/census.cu b/lib/libstereo/src/costs/census.cu
index 3930e7333e4376ec3f8cb724066ddc2e1aef6ea2..f56cedf608d3eb87ccbdf078f563dccd1f85d688 100644
--- a/lib/libstereo/src/costs/census.cu
+++ b/lib/libstereo/src/costs/census.cu
@@ -7,44 +7,260 @@
 
 #include <cuda_runtime.h>
 
+template <typename T>
+__host__ __device__ inline T square(T v) { return v*v; }
+
 namespace algorithms {
+	/** Census transform. Bits stored in row major order. */
+	template<int WINX, int WINY>
+	struct CensusTransformRowMajor {
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+			short center = im(y, x);
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int y_ = y + wy;
+					const int x_ = x + wx;
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (center < (im(y_,x_)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+	/** Census transform. Bits stored in row major order. This variant compares
+	 *  against nested sub-windows that grow with distance from the centre; from
+	 *  each sub-window, the value with the largest absolute difference to the
+	 *  centre is used. */
 	template<int WINX, int WINY>
-	struct CensusTransform {
-		static_assert(WINX*WINY <= 64, "Census window is too large");
+	struct ExCensusTransformRowMajor {
+		__host__ __device__ inline short subwindow(const int x, const int y, int wx, int wy, short center, ushort2 size) {
+			short value = center;
 
-		__host__ __device__ inline uint64_t ct_window(const int y, const int x) {
-			uint64_t ct = 0;
-			uchar center = im(y, x);
+			int nwx = 3*wx; //int(float(wx)*scale);
+			int nwy = 3*wy; //int(float(wy)*scale);
+			//int wsize = max(0,max(abs(wx),abs(wy))-2);
+			//if (x == 100 && y == 100) printf("Ex shape: %d, %d\n",nwx,nwy);
+			for (int dy=-max(0,abs(wy)-2); dy<=max(0,abs(wy)-2); ++dy) {
+			for (int dx=-max(0,abs(wx)-2); dx<=max(0,abs(wx)-2); ++dx) {
+				auto v = im(min(size.y-1, max(0,y+nwy+dy)), min(size.x-1, max(0,x+nwx+dx)));
+				if (abs(center-v) > abs(center-value)) value = v;
+			}
+			}
+
+			return value;
+		}
+
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out, ushort2 size) {
+			short center = im(y, x);
+			uint8_t i = 0; // bit counter for *out
 
 			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
 				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
 					const int y_ = y + wy;
 					const int x_ = x + wx;
-					ct = ct << 1;
-					ct |= (center < (im(y_,x_)) ? 1 : 0);
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (center < (subwindow(x,y,wx,wy,center,size)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)), size);
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+		float scale;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+	/* Generalised census transform: each bit compares a pair of pixels placed
+	   symmetrically about the window centre, rather than a pixel against the
+	   centre itself.
+
+	   W. S. Fife and J. K. Archibald,
+	   "Improved Census Transforms for Resource-Optimized Stereo Vision,"
+	   in IEEE Transactions on Circuits and Systems for Video Technology,
+	   vol. 23, no. 1, pp. 60-73, Jan. 2013. */
+	template<int WINX, int WINY>
+	struct GCensusTransformRowMajor {
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+			//short center = im(y, x);
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					//const int y_ = y + wy;
+					//const int x_ = x + wx;
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (im(y-wy,x-wx) < (im(y+wy,x+wx)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
 				}
 			}
-			return ct;
 		}
 
 		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
-			//for (int y = WINY/2+1; y < size.y - (WINY/2+1); y++) {
-			//	for (int x = WINX/2+1; x < size.x - (WINX/2+1); x++) {
 			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
 				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
-					out(y, x) = ct_window(y, x);
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+	/** Hierarchical census transform: the full-resolution centre pixel is
+	 *  compared against a window taken from a second (typically downsampled)
+	 *  image. Bits stored in row major order. */
+	template<int WINX, int WINY>
+	struct HCensusTransformRowMajor {
+		__host__ __device__ inline void window(short center, const int y, const int x, uint64_t* __restrict__ out) {
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const uint y_ = min(uint(y + wy), hsize.y-1);
+					const uint x_ = min(uint(x + wx), hsize.x-1);
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (center < (neigh(y_,x_)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y; y<size.y; y+=stride.y) {
+				for (int x = thread.x; x<size.x; x+=stride.x) {
+					short center = im(y, x);
+					float ny = float(y) / float(size.y) * float(hsize.y);
+					float nx = float(x) / float(size.x) * float(hsize.x);
+					window(center, ny, nx, &(out(y, x*WSTEP)));
 				}
 			}
 		}
 
 		Array2D<uchar>::Data im;
+		Array2D<uchar>::Data neigh;
 		Array2D<uint64_t>::Data out;
+		ushort2 hsize;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+	/** Hierarchical variant of the generalised census transform. Bits stored
+	    in row major order. @see GCensusTransformRowMajor above. */
+	template<int WINX, int WINY>
+	struct HGCensusTransformRowMajor {
+		__host__ __device__ inline void window(short2 center, ushort2 size, const int y, const int x, uint64_t* __restrict__ out) {
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const uint ny = min(uint(y + wy), hsize.y-1);
+					const uint nx = min(uint(x + wx), hsize.x-1);
+					const uint cy = min(uint(center.y - wy), size.y-1);
+					const uint cx = min(uint(center.x - wx), size.x-1);
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (im(cy,cx) < (neigh(ny,nx)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y; y<size.y; y+=stride.y) {
+				for (int x = thread.x; x<size.x; x+=stride.x) {
+					//short center = im(y, x);
+					float ny = float(y) / float(size.y) * float(hsize.y);
+					float nx = float(x) / float(size.x) * float(hsize.x);
+					window({short(x),short(y)}, size, ny, nx, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uchar>::Data neigh;
+		Array2D<uint64_t>::Data out;
+		ushort2 hsize;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
 	};
 }
 
 void CensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
-	parallel2D<algorithms::CensusTransform<9,7>>({l.data(), ct_l_.data()}, l.width, l.height);
-	parallel2D<algorithms::CensusTransform<9,7>>({r.data(), ct_r_.data()}, r.width, r.height);
+	if (pattern_ == CensusPattern::STANDARD) {
+		parallel2D<algorithms::CensusTransformRowMajor<9,7>>({l.data(), ct_l_.data()}, l.width, l.height);
+		parallel2D<algorithms::CensusTransformRowMajor<9,7>>({r.data(), ct_r_.data()}, r.width, r.height);
+	} else if (pattern_ == CensusPattern::GENERALISED) {
+		parallel2D<algorithms::GCensusTransformRowMajor<9,7>>({l.data(), ct_l_.data()}, l.width, l.height);
+		parallel2D<algorithms::GCensusTransformRowMajor<9,7>>({r.data(), ct_r_.data()}, r.width, r.height);
+	} else {
+		// TODO: 
+	}
+}
+
+void CensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr) {
+	if (pattern_ == CensusPattern::STANDARD) {
+		parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+		parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	} else if (pattern_ == CensusPattern::GENERALISED) {
+		parallel2D<algorithms::HGCensusTransformRowMajor<9,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+		parallel2D<algorithms::HGCensusTransformRowMajor<9,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	}
 }
 
 void CensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
@@ -67,3 +283,119 @@ void CensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
 		throw std::exception();
 	}
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ExpandingCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::ExCensusTransformRowMajor<9,9>>({l.data(), ct_l_.data(),1.5f}, l.width, l.height);
+	parallel2D<algorithms::ExCensusTransformRowMajor<9,9>>({r.data(), ct_r_.data(),1.5f}, r.width, r.height);
+}
+
+/*void ExpandingCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr) {
+	parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+	parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+}*/
+
+void ExpandingCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void MiniCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	if (pattern_ == CensusPattern::STANDARD) {
+		parallel2D<algorithms::CensusTransformRowMajor<5,3>>({l.data(), ct_l_.data()}, l.width, l.height);
+		parallel2D<algorithms::CensusTransformRowMajor<5,3>>({r.data(), ct_r_.data()}, r.width, r.height);
+	} else if (pattern_ == CensusPattern::GENERALISED) {
+		parallel2D<algorithms::GCensusTransformRowMajor<5,3>>({l.data(), ct_l_.data()}, l.width, l.height);
+		parallel2D<algorithms::GCensusTransformRowMajor<5,3>>({r.data(), ct_r_.data()}, r.width, r.height);
+	} else {
+		// TODO: 
+	}
+}
+
+void MiniCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr) {
+	if (pattern_ == CensusPattern::STANDARD) {
+		parallel2D<algorithms::HCensusTransformRowMajor<5,3>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+		parallel2D<algorithms::HCensusTransformRowMajor<5,3>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	} else if (pattern_ == CensusPattern::GENERALISED) {
+		parallel2D<algorithms::HGCensusTransformRowMajor<5,3>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+		parallel2D<algorithms::HGCensusTransformRowMajor<5,3>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	}
+}
+
+void MiniCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void WeightedCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::CensusTransformRowMajor<7,7>>({l.data(), ct_l_.data()}, l.width, l.height);
+	parallel2D<algorithms::CensusTransformRowMajor<7,7>>({r.data(), ct_r_.data()}, r.width, r.height);
+}
+
+void WeightedCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr) {
+	//if (pattern_ == CensusPattern::STANDARD) {
+		parallel2D<algorithms::HCensusTransformRowMajor<7,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+		parallel2D<algorithms::HCensusTransformRowMajor<7,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	//} else if (pattern_ == CensusPattern::GENERALISED) {
+	//	parallel2D<algorithms::HGCensusTransformRowMajor<7,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+	//	parallel2D<algorithms::HGCensusTransformRowMajor<7,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+	//}
+}
+
+void WeightedCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
index fc7be10d3bff600c8656419f933a9b1d63628382..4600b64a26674b5a23a9d77b65b210e618eabd63 100644
--- a/lib/libstereo/src/costs/census.hpp
+++ b/lib/libstereo/src/costs/census.hpp
@@ -4,70 +4,296 @@
 #include <opencv2/core/core.hpp>
 #include "array2d.hpp"
 #include "dsbase.hpp"
+#include <stereo_types.hpp>
 
 #include <cuda_runtime.h>
-
 namespace impl {
-	struct CensusMatchingCost : DSImplBase<unsigned short> {
+	__host__ __device__ static inline uint64_t popcount(const uint64_t bits) {
+		#if defined(__CUDA_ARCH__)
+			return __popcll(bits);
+		#elif defined(_MSC_VER)
+			return __popcnt64(bits);
+		#elif defined(__GNUC__)
+			return __builtin_popcountl(bits);
+		#else
+			static_assert(false, "unsupported compiler (popcount intrinsic)");
+		#endif
+	}
+
+	/**
+	 * Hamming cost, template parameter number of bits
+	 */
+	template<int SIZE>
+	struct HammingCost : DSImplBase<unsigned short> {
+		typedef unsigned short Type;
+
+		HammingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}) {}
+		HammingCost() : DSImplBase<Type>({0,0,0,0}) {}
+
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return COST_MAX; }
+			unsigned short c = 0;
+
+			#pragma unroll
+			for (int i = 0; i < WSTEP; i++) {
+				c += popcount(l(y, x*WSTEP+i) ^ r(y, (x-d)*WSTEP+i));
+			}
+			return c;
+		}
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (SIZE - 1)/(sizeof(uint64_t)*8) + 1;
+		static constexpr Type COST_MAX = SIZE;
+
+		Array2D<uint64_t>::Data l;
+		Array2D<uint64_t>::Data r;
+	};
+
+	template<uint8_t WW, uint8_t WH, int BPP=1>
+	using CensusMatchingCost = HammingCost<WW*WH*BPP>;
+
+	template<uint8_t R, uint8_t NBINS>
+	struct WeightedCensusMatchingCost : DSImplBase<unsigned short> {
+		static_assert(R % 2 == 1, "R must be odd");
+		typedef unsigned short Type;
+
+		WeightedCensusMatchingCost() : DSImplBase<Type>({0,0,0,0}) {}
+		WeightedCensusMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) :
+				DSImplBase<unsigned short>({w,h,dmin,dmax}) {
+
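+			// Group window offsets into NBINS bins by distance from the centre;
+			// each bin gets a bit mask and a geometric weight (1, W, W^2, ...),
+			// so pixels nearer the centre contribute more to the cost.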
+			const float wbin = float(R)/(2.0f*float(NBINS)); // bin width
+			int step = 0;
+			int i = 0;
+
+			for (int wy = -R/2; wy <= R/2; wy++) {
+			for (int wx = -R/2; wx <= R/2; wx++) {
+					int bin = int(floor(sqrtf(wx*wx + wy*wy)/wbin));
+					if (bin >= NBINS) bin = NBINS-1; // clamp: corner offsets fall past the last bin
+					masks[step][bin] |= uint64_t(1) << i;
+
+					i += 1;
+					if (i % 64 == 0) { step++; i = 0; }
+			}}
+
+			float weight = 1.0;
+			int bin = 0;
+			do {
+				weights[bin] = weight;
+				weight *= W;
+				bin++;
+			} while(bin != NBINS);
+		}
+
+		WeightedCensusMatchingCost<R,NBINS> &operator=(const WeightedCensusMatchingCost<R,NBINS> &c) {
+			ct_l = c.ct_l;
+			ct_r = c.ct_r;
+			W = c.W;
+			memcpy(masks, c.masks, sizeof(masks));
+			memcpy(weights, c.weights, sizeof(weights));
+			return *this;
+		}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			if ((x-d) < 0) { return COST_MAX; }
+			float c = 0;
+
+			#pragma unroll
+			for (int i = 0; i < WSTEP; i++) {
+				const uint64_t diff = ct_l(y, x*WSTEP+i) ^ ct_r(y, (x-d)*WSTEP+i);
+				#pragma unroll
+				for (int bin = 0; bin < NBINS; bin++) {
+					c += float(popcount(diff & masks[i][bin])) * weights[bin];
+				}
+			}
+
+			return round(c);
+		}
+
+		static constexpr int BPP = 1;
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (BPP*R*R - 1)/(sizeof(uint64_t)*8) + 1;
+		static constexpr Type COST_MAX = R*R*BPP;
+
+		uint64_t masks[WSTEP][NBINS] = {0};
+		float weights[NBINS] = {0.0f};
+		float W = 0.75f;
+
+		Array2D<uint64_t>::Data ct_l;
+		Array2D<uint64_t>::Data ct_r;
+	};
+
+	template<uint8_t WW, uint8_t WH, int BPP=1, bool RMASK=true>
+	struct CensusMaskMatchingCost : DSImplBase<unsigned short> {
 		typedef unsigned short Type;
 
-		CensusMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
+		CensusMaskMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
 
 		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
-			if ((x-d) < 0) { return 255; } // TODO: use window size
-			const uint64_t bits =
-				ct_l(y, x) ^ ct_r(y, x-d);
-			#if defined(__CUDA_ARCH__)
-				return __popcll(bits);
-			#elif defined(_MSC_VER)
-				return __popcnt64(bits);
-			#elif defined(__GNUC__)
-				return __builtin_popcountl(bits);
-			#else
-				static_assert(false, "unsupported compiler (popcount intrinsic)");
-			#endif
+			if ((x-d) < 0) { return COST_MAX; }
+			unsigned short c = 0;
+
+			#pragma unroll
+			for (int i = 0; i < WSTEP; i++) {
+				uint64_t mask = mask_l(y, x*WSTEP+i);
+				if (RMASK) {
+					/* intersection of l and r masks */
+					mask &= mask_r(y, x*WSTEP+i);
+				}
+
+				const uint64_t bits =
+					(ct_l(y, x*WSTEP+i) ^ ct_r(y, (x-d)*WSTEP+i)) & mask;
+
+				c += popcount(bits);
+			}
+			return c;
 		}
 
-		static constexpr unsigned short COST_MAX = 64;
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (BPP*WW*WH - 1)/(sizeof(uint64_t)*8) + 1;
+		static constexpr Type COST_MAX = WW*WH*BPP;
 
 		Array2D<uint64_t>::Data ct_l;
 		Array2D<uint64_t>::Data ct_r;
+		Array2D<uint64_t>::Data mask_l;
+		Array2D<uint64_t>::Data mask_r;
 	};
 }
 
-class CensusMatchingCost : public DSBase<impl::CensusMatchingCost> {
+class CensusMatchingCost : public DSBase<impl::CensusMatchingCost<9,7,1>> {
 public:
-	typedef impl::CensusMatchingCost DataType;
+	typedef impl::CensusMatchingCost<9,7,1> DataType;
 	typedef unsigned short Type;
 
 	CensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
 	CensusMatchingCost(int width, int height, int disp_min, int disp_max)
 		: DSBase<DataType>(width, height, disp_min, disp_max),
-			ct_l_(width, height), ct_r_(width,height),
-			//cost_max(wwidth * wheight),
-			ww_(9), wh_(7)
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
 		{
-			if (ww_ != 9 || wh_ != 7) {
-				// TODO: window size paramters (hardcoded) in matching_cost.cpp
-				throw std::exception();
-			}
-			if (ww_*wh_ > 64) {
-				throw std::exception(); // todo: dynamic size
-			}
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
+		}
+
+	inline void setPattern(CensusPattern p) { pattern_ = p; }
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	void set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	CensusPattern pattern_ = CensusPattern::STANDARD;
+};
+
+class MiniCensusMatchingCost : public DSBase<impl::CensusMatchingCost<5,3,1>> {
+public:
+	typedef impl::CensusMatchingCost<5,3,1> DataType;
+	typedef unsigned short Type;
 
+	MiniCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	MiniCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
+		{
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
+		}
+
+	inline void setPattern(CensusPattern p) { pattern_ = p; }
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	void set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	CensusPattern pattern_ = CensusPattern::STANDARD;
+};
+
+class ExpandingCensusMatchingCost : public DSBase<impl::WeightedCensusMatchingCost<9,3>> {
+public:
+	typedef impl::WeightedCensusMatchingCost<9,3> DataType;
+	typedef unsigned short Type;
+
+	ExpandingCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	ExpandingCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
+		{
+			data().ct_l = ct_l_.data();
+			data().ct_r = ct_r_.data();
+		}
+
+	inline void setPattern(CensusPattern p) { pattern_ = p; }
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	//void set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	CensusPattern pattern_ = CensusPattern::STANDARD;
+};
+
+class WeightedCensusMatchingCost : public DSBase<impl::WeightedCensusMatchingCost<7, 3>> {
+public:
+	typedef impl::WeightedCensusMatchingCost<7, 3> DataType;
+	typedef unsigned short Type;
+
+	WeightedCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	WeightedCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
+		{
 			data().ct_l = ct_l_.data();
 			data().ct_r = ct_r_.data();
 		}
 
 	void set(cv::InputArray l, cv::InputArray r);
 	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
-	static constexpr short COST_MAX = 9*7; // TODO: window size paramters (hardcoded) in matching_cost.cpp
+	void set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+};
+
+class CensusMaskMatchingCost : public DSBase<impl::CensusMaskMatchingCost<9,7,1>> {
+public:
+	typedef impl::CensusMaskMatchingCost<9,7,1> DataType;
+	typedef unsigned short Type;
+
+	CensusMaskMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	CensusMaskMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height),
+			mask_l_(width*data().WSTEP, height), mask_r_(width*data().WSTEP,height)
+		{
+			data().ct_l = ct_l_.data();
+			data().ct_r = ct_r_.data();
+			data().mask_l = mask_l_.data();
+			data().mask_r = mask_r_.data();
+		}
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uint64_t>& l, const Array2D<uint64_t>& r);
+
+	void setMask(cv::InputArray l, cv::InputArray r);
+	void setMask(const Array2D<uchar>& l, const Array2D<uchar>& r);
+
+	static constexpr Type COST_MAX = DataType::COST_MAX;
 
 protected:
 	Array2D<uint64_t> ct_l_;
 	Array2D<uint64_t> ct_r_;
-	int ww_;
-	int wh_;
+	Array2D<uint64_t> mask_l_;
+	Array2D<uint64_t> mask_r_;
 };
 
 #endif
diff --git a/lib/libstereo/src/costs/dual.hpp b/lib/libstereo/src/costs/dual.hpp
index 04d0831d2f8f40fcc6ddab2c8457154d6ae060c8..1e0a4774d5318a82742c5691d66aa10525025cae 100644
--- a/lib/libstereo/src/costs/dual.hpp
+++ b/lib/libstereo/src/costs/dual.hpp
@@ -11,29 +11,72 @@ namespace impl {
 	template <typename A, typename B>
 	struct DualCosts : DSImplBase<typename A::Type> {
 		static_assert(std::is_same<typename A::Type, typename B::Type>::value, "Cost types must be the same");
-		typedef unsigned short Type;
+		typedef typename A::Type Type;
 
-		DualCosts(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}),
+		DualCosts(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}),
 			cost_a(w,h,dmin,dmax), cost_b(w,h,dmin,dmax) {}
 
-		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
 			return cost_a(y,x,d) + cost_b(y,x,d);
 		}
 
 		A cost_a;
 		B cost_b;
+		static constexpr Type COST_MAX = A::COST_MAX + B::COST_MAX;
+	};
+
+	template <typename A, int N>
+	struct MultiCosts : DSImplBase<typename A::Type> {
+		typedef typename A::Type Type;
+
+		MultiCosts(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
+			Type cost = 0;
+			#pragma unroll
+			for (int n=0; n<N; ++n) {
+				cost += costs[n](y,x,d);
+			}
+			return cost;
+		}
+
+		A costs[N];
+		static constexpr Type COST_MAX = A::COST_MAX * N;
+	};
+
+	template <typename A, int N>
+	struct MultiCostsWeighted : DSImplBase<typename A::Type> {
+		typedef typename A::Type Type;
+
+		MultiCostsWeighted(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
+			float cost = 0;
+			float pw = 1.0f;
+			#pragma unroll
+			for (int n=0; n<N; ++n) {
+				float w = weights[n](y,x);
+				cost += pw*w*float(costs[n](y,x,d));
+				pw *= 1.0f-w;
+			}
+			return round(cost);
+		}
+
+		A costs[N];
+		Array2D<float>::Data weights[N];
+		static constexpr Type COST_MAX = A::COST_MAX;
 	};
 
 	template <typename A, typename B>
 	struct DualCostsWeighted : DSImplBase<typename A::Type> {
 		static_assert(std::is_same<typename A::Type, typename B::Type>::value, "Cost types must be the same");
-		typedef unsigned short Type;
+		typedef typename A::Type Type;
 
-		DualCostsWeighted(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}),
+		DualCostsWeighted(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}),
 			cost_a(w,h,dmin,dmax), cost_b(w,h,dmin,dmax) {}
 
-		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
-			const float w = weights_l(y,x);
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
+			const float w = max(weights_l(y,x), weights_r(y,x));
 			return cost_a(y,x,d)*w + cost_b(y,x,d)*(1.0f - w);
 		}
 
@@ -41,7 +84,9 @@ namespace impl {
 		B cost_b;
 		Array2D<float>::Data weights_l;
 		Array2D<float>::Data weights_r;
+		static constexpr Type COST_MAX = A::COST_MAX + B::COST_MAX;
 	};
+
 }
 
 
@@ -49,7 +94,7 @@ template <typename A, typename B>
 class DualCosts : public DSBase<impl::DualCosts<typename A::DataType,typename B::DataType>> {
 public:
 	typedef impl::DualCosts<typename A::DataType,typename B::DataType> DataType;
-	typedef unsigned short Type;
+	typedef typename A::DataType::Type Type;
 
 	DualCosts(int width, int height, int disp_min, int disp_max, A &a, B &b)
 		: DSBase<DataType>(width, height, disp_min, disp_max),
@@ -60,13 +105,68 @@ public:
 		this->data().cost_b = cost_b.data();
 	}
 
-	static constexpr short COST_MAX = A::COST_MAX + B::COST_MAX;
+	static constexpr Type COST_MAX = A::COST_MAX + B::COST_MAX;
 
 protected:
 	A &cost_a;
 	B &cost_b;
 };
 
+template <typename A, int N>
+class MultiCosts : public DSBase<impl::MultiCosts<typename A::DataType,N>> {
+public:
+	typedef impl::MultiCosts<typename A::DataType,N> DataType;
+	typedef typename A::DataType::Type Type;
+
+	MultiCosts(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {};
+
+	void add(int n, A &c) {
+		if (n >= N || n < 0) throw std::exception();
+		costs[n] = &c;
+	}
+
+	void set() {
+		for (int n=0; n<N; ++n) {
+			this->data().costs[n] = costs[n]->data();
+		}
+	}
+
+	static constexpr Type COST_MAX = A::COST_MAX * N;
+
+protected:
+	A *costs[N];
+};
+
+template <typename A, int N>
+class MultiCostsWeighted : public DSBase<impl::MultiCostsWeighted<typename A::DataType,N>> {
+public:
+	typedef impl::MultiCostsWeighted<typename A::DataType,N> DataType;
+	typedef typename A::DataType::Type Type;
+
+	MultiCostsWeighted(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {};
+
+	void add(int n, A &c, Array2D<float> &w) {
+		if (n >= N || n < 0) throw std::exception();
+		costs[n] = &c;
+		weights[n] = &w;
+	}
+
+	void set() {
+		for (int n=0; n<N; ++n) {
+			this->data().costs[n] = costs[n]->data();
+			this->data().weights[n] = weights[n]->data();
+		}
+	}
+
+	static constexpr Type COST_MAX = A::COST_MAX;
+
+protected:
+	A *costs[N];
+	Array2D<float> *weights[N];
+};
+
 /**
  * Combine two cost functions with weight. Weight in Array2D<float> same size
  * as input. Weight values assumed in range [0,1] and cost is calculated as
@@ -77,7 +177,7 @@ template <typename A, typename B>
 class DualCostsWeighted : public DSBase<impl::DualCostsWeighted<typename A::DataType,typename B::DataType>> {
 public:
 	typedef impl::DualCostsWeighted<typename A::DataType,typename B::DataType> DataType;
-	typedef unsigned short Type;
+	typedef typename A::DataType::Type Type;
 
 	DualCostsWeighted(int width, int height, int disp_min, int disp_max, A &a, B &b, Array2D<float> &weightsl, Array2D<float> &weightsr)
 		: DSBase<DataType>(width, height, disp_min, disp_max),
@@ -90,7 +190,7 @@ public:
 		this->data().weights_r = weights_r.data();
 	}
 
-	static constexpr short COST_MAX = A::DataType::COST_MAX + B::DataType::COST_MAX;
+	static constexpr Type COST_MAX = A::DataType::COST_MAX + B::DataType::COST_MAX;
 
 protected:
 	A &cost_a;
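
For reference, the weighted combinations defined in dual.hpp reduce to simple per-pixel blends. The sketch below is standalone host code with illustrative names (not part of the patch): DualCostsWeighted takes w = max(w_l, w_r) and returns w*a + (1-w)*b, while MultiCostsWeighted cascades the remaining weight mass onto each subsequent cost.

```cpp
#include <algorithm>
#include <cmath>

// Standalone illustration of the blends above; the scalar signatures are
// hypothetical, the real classes operate on whole cost volumes.
float blend_dual(float cost_a, float cost_b, float w_l, float w_r) {
	const float w = std::max(w_l, w_r);       // per-pixel weight in [0,1]
	return cost_a * w + cost_b * (1.0f - w);  // DualCostsWeighted::operator()
}

float blend_multi(const float *costs, const float *weights, int n) {
	float cost = 0.0f;
	float pw = 1.0f;                          // weight mass still unassigned
	for (int i = 0; i < n; ++i) {             // MultiCostsWeighted::operator()
		cost += pw * weights[i] * costs[i];
		pw *= 1.0f - weights[i];
	}
	return std::round(cost);
}
```
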
diff --git a/lib/libstereo/src/costs/sad.cu b/lib/libstereo/src/costs/sad.cu
new file mode 100644
index 0000000000000000000000000000000000000000..41f1e556423f9e9ea5dc32015d44b7b6be40e543
--- /dev/null
+++ b/lib/libstereo/src/costs/sad.cu
@@ -0,0 +1,58 @@
+#include "sad.hpp"
+
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+void calculate_ii(const cv::Mat &in, cv::Mat &out) {
+	out.create(in.size(), CV_32SC1);
+	if (in.type() != CV_8UC3) { throw std::exception(); }
+
+	for (int y = 0; y < in.rows; y++) {
+		int sum = 0;
+		for (int x = 0; x < in.cols; x++) {
+			auto rgb = in.at<cv::Vec3b>(y,x);
+			sum += (rgb[0] + rgb[1] + rgb[2]);
+			if (y == 0) {
+				out.at<int32_t>(y,x) = sum;
+			}
+			else {
+				out.at<int32_t>(y,x) = sum + out.at<int32_t>(y-1,x);
+			}
+		}
+	}
+}
+
+void SADMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	cv::Mat iil;
+	cv::Mat iir;
+	cv::Mat iml;
+	cv::Mat imr;
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		l.getGpuMat().download(iml);
+		r.getGpuMat().download(imr);
+	}
+	else if (l.isMat() && r.isMat()) {
+		iml = l.getMat();
+		imr = r.getMat();
+	}
+	calculate_ii(iml, iil);
+	calculate_ii(imr, iir);
+
+	l_.create(l.cols(), l.rows());
+	r_.create(r.cols(), r.rows());
+
+	#if USE_GPU
+	l_.toGpuMat().upload(iil);
+	r_.toGpuMat().upload(iir);
+	#else
+	iil.copyTo(l_.toMat());
+	iir.copyTo(r_.toMat());
+	#endif
+
+	data().l = l_.data();
+	data().r = r_.data();
+}
diff --git a/lib/libstereo/src/costs/sad.hpp b/lib/libstereo/src/costs/sad.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ace645a7ce3e456950885a1d808ac1882ca4a36d
--- /dev/null
+++ b/lib/libstereo/src/costs/sad.hpp
@@ -0,0 +1,69 @@
+/**
+ * Sum of absolute differences matching cost implemented using integral images.
+ */
+
+#ifndef _FTL_LIBSTEREO_COSTS_SAD_HPP_
+#define _FTL_LIBSTEREO_COSTS_SAD_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	struct SADMatchingCost : DSImplBase<float> {
+		typedef float Type;
+
+		SADMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<float>({w,h,dmin,dmax}), ww(0), wh(0) {}
+
+		__host__ __device__ inline float operator()(const int y, const int x, const int d) const {
+			if ((x-d-ww-1) < 0) { return COST_MAX; }
+			if ((x+ww) >= width) { return COST_MAX; }
+			if ((y-wh-1) < 0) { return COST_MAX; }
+			if ((y+wh) >= height) { return COST_MAX; }
+
+			int wl = l(y+wh,x+ww) + l(y-wh-1,x-ww-1)
+						- l(y+wh, x-ww-1) - l(y-wh-1, x+ww);
+
+			int wr = r(y+wh,x-d+ww) + r(y-wh-1,x-d-ww-1)
+						- r(y+wh, x-d-ww-1) - r(y-wh-1, x-d+ww);
+
+			return abs(wl - wr)/float((2*ww+1)*(2*wh+1));
+		}
+
+		static constexpr Type COST_MAX = 255*3;
+
+		Array2D<int32_t>::Data l;
+		Array2D<int32_t>::Data r;
+		uchar ww;
+		uchar wh;
+	};
+}
+
+class SADMatchingCost : public DSBase<impl::SADMatchingCost> {
+public:
+	typedef impl::SADMatchingCost DataType;
+	typedef float Type;
+
+	SADMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	SADMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {
+			setWindow(0, 0);
+	}
+
+	void set(cv::InputArray l, cv::InputArray r);
+
+	void setWindow(int w, int h) {
+		data().ww = w/2;
+		data().wh = h/2;
+	}
+
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<int32_t> l_;
+	Array2D<int32_t> r_;
+};
+
+#endif
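
The cost above is the absolute difference of two box sums (normalised by the window area), each fetched from a summed-area table with four lookups. A minimal host-side sketch of that arithmetic follows; buffers and names are illustrative only, and the caller must keep the window at least one pixel inside the image, as the bounds checks in operator() do.

```cpp
#include <cstdint>
#include <vector>

// ii(y,x) holds the sum of all pixels in the rectangle [0..y, 0..x],
// mirroring calculate_ii() in sad.cu (which sums the RGB channels first).
std::vector<int32_t> integral_image(const std::vector<uint8_t> &im, int w, int h) {
	std::vector<int32_t> ii(size_t(w) * h);
	for (int y = 0; y < h; ++y) {
		int32_t row = 0;                      // running sum along the row
		for (int x = 0; x < w; ++x) {
			row += im[y * w + x];
			ii[y * w + x] = row + (y > 0 ? ii[(y - 1) * w + x] : 0);
		}
	}
	return ii;
}

// Sum over the window [y-wh..y+wh, x-ww..x+ww] from four corner lookups,
// the same corner pattern used in impl::SADMatchingCost::operator().
int32_t window_sum(const std::vector<int32_t> &ii, int w,
                   int x, int y, int ww, int wh) {
	return ii[(y + wh) * w + (x + ww)] + ii[(y - wh - 1) * w + (x - ww - 1)]
	     - ii[(y + wh) * w + (x - ww - 1)] - ii[(y - wh - 1) * w + (x + ww)];
}
```
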
diff --git a/lib/libstereo/src/costs/scale.hpp b/lib/libstereo/src/costs/scale.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ae4c44940ccd042ec9e228ec297e881c93f12d9
--- /dev/null
+++ b/lib/libstereo/src/costs/scale.hpp
@@ -0,0 +1,157 @@
+#ifndef _FTL_LIBSTEREO_COSTS_SCALE_HPP_
+#define _FTL_LIBSTEREO_COSTS_SCALE_HPP_
+
+#include "array2d.hpp"
+#include "dsbase.hpp"
+
+#include <cuda_runtime.h>
+
+namespace impl {
+	template <typename A>
+	struct ExpCost : DSImplBase<float> {
+		typedef float Type;
+
+		ExpCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<float>({w,h,dmin,dmax}),
+			cost_a(w,h,dmin,dmax) {}
+
+		__host__ __device__ inline float operator()(const int y, const int x, const int d) const {
+			return 1.0f-expf(-cost_a(y,x,d)*l);
+		}
+
+		A cost_a;
+		float l=1.0;
+		static constexpr Type COST_MAX = A::COST_MAX;
+	};
+
+	template <typename A>
+	struct LinearCost : DSImplBase<float> {
+		typedef float Type;
+
+		LinearCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<float>({w,h,dmin,dmax}),
+			cost_a(w,h,dmin,dmax) {}
+
+		__host__ __device__ inline float operator()(const int y, const int x, const int d) const {
+			return cost_a(y,x,d)*s*a;
+		}
+
+		A cost_a;
+		float a=1.0;
+		float s=1.0;
+
+		static constexpr Type COST_MAX = A::COST_MAX;
+	};
+
+	template <typename A, typename T>
+	struct WeightedCost : DSImplBase<T> {
+		typedef T Type;
+
+		WeightedCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}),
+			cost(w,h,dmin,dmax) {}
+
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
+			const float w = max(weights_l(y,x), weights_r(y,x));
+			return round(cost(y,x,d)*w);
+		}
+
+		A cost;
+		Array2D<float>::Data weights_l;
+		Array2D<float>::Data weights_r;
+		static constexpr Type COST_MAX = A::COST_MAX;
+	};
+}
+
+template <typename A>
+class ExpCost : public DSBase<impl::ExpCost<typename A::DataType>> {
+public:
+	typedef impl::ExpCost<typename A::DataType> DataType;
+	typedef float Type;
+
+	ExpCost(int width, int height, int disp_min, int disp_max, A &a, float l)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost_a(a) { this->data().l = l; }
+
+	void set() {
+		this->data().cost_a = cost_a.data();
+	}
+
+	void set(float l) {
+		this->data().cost_a = cost_a.data();
+		this->data().l = l;
+	}
+
+	static constexpr float COST_MAX = A::COST_MAX;
+
+protected:
+	A &cost_a;
+};
+
+template <typename A>
+class LinearCost : public DSBase<impl::LinearCost<typename A::DataType>> {
+public:
+	typedef impl::LinearCost<typename A::DataType> DataType;
+	typedef float Type;
+
+	LinearCost(int width, int height, int disp_min, int disp_max, A &a, float l)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost_a(a) {
+
+		this->data().s = l;
+		this->data().a = 1.0f/COST_MAX;
+	}
+
+	void set() {
+		this->data().cost_a = cost_a.data();
+	}
+
+	void set(float s) {
+		this->data().cost_a = cost_a.data();
+		this->data().s = s;
+	}
+
+	static const typename A::Type COST_MAX = A::COST_MAX;
+
+protected:
+	A &cost_a;
+};
+
+
+template <typename A, typename T=unsigned short>
+class WeightedCost : public DSBase<impl::WeightedCost<typename A::DataType, T>> {
+public:
+	typedef impl::WeightedCost<typename A::DataType, T> DataType;
+	typedef T Type;
+
+	WeightedCost(int width, int height, int disp_min, int disp_max, A &a, Array2D<float> &wl, Array2D<float> &wr)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost(&a), weights_l(&wl), weights_r(&wr) {
+
+	}
+
+	WeightedCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost(nullptr), weights_l(nullptr), weights_r(nullptr) {
+
+	}
+
+	void setCost(A &c) { cost = &c; }
+
+	void setWeights(Array2D<float> &wl, Array2D<float> &wr) {
+		weights_l = &wl;
+		weights_r = &wr;
+	}
+
+	void set() {
+		this->data().cost = cost->data();
+		this->data().weights_l = weights_l->data();
+		this->data().weights_r = weights_r->data();
+	}
+
+	static const T COST_MAX = A::COST_MAX;
+
+protected:
+	Array2D<float> *weights_l;
+	Array2D<float> *weights_r;
+	A *cost;
+};
+
+#endif
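
Reduced to scalars, the three wrappers in scale.hpp apply the following transforms to a base cost value c. This is a host-only sketch with illustrative names; the real classes wrap whole disparity volumes and are configured through set().

```cpp
#include <algorithm>
#include <cmath>

// c -> 1 - exp(-c*lambda): ExpCost, compresses large costs towards 1.
float exp_scaled(float c, float lambda) { return 1.0f - std::exp(-c * lambda); }

// c -> c * s / cost_max: LinearCost, where a = 1/COST_MAX is set in the ctor.
float linear_scaled(float c, float s, float cost_max) { return c * s * (1.0f / cost_max); }

// c -> round(c * max(w_l, w_r)): WeightedCost, per-pixel confidence weighting.
float weight_scaled(float c, float w_l, float w_r) {
	return std::round(c * std::max(w_l, w_r));
}
```
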
diff --git a/lib/libstereo/src/costs/stable.cu b/lib/libstereo/src/costs/stable.cu
new file mode 100644
index 0000000000000000000000000000000000000000..25111afb3b2004b555925de6fecb0ecda0cf517c
--- /dev/null
+++ b/lib/libstereo/src/costs/stable.cu
@@ -0,0 +1,130 @@
+#include "stable.hpp"
+#include "census.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <random>
+#include <chrono>
+
+#include <cuda_runtime.h>
+
+namespace algorithms {
+	template<int NBITS>
+	struct Stable {
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+			int32_t accumulator[NBITS] = {0};
+			uint16_t i = 0;
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int16_t value = im(min(height-1,max(0,int(float(y)*scaleY) + wy)), min(width-1,max(0,int(float(x)*scaleX) + wx)));
+					const int16_t filter = filter_mask(0, i++);
+					if (filter == 0) { continue; } // zero entries in the mask are unused (see generateFilterMask)
+					const int16_t sign = filter > 0 ? 1 : -1;
+					// NOTE: indexing starts from 1
+					const int16_t index = int(abs(filter)) - 1;
+					accumulator[index] += sign*value;
+				}
+			}
+
+			for (i = 0; i < NBITS;) {
+				// zero if first value, otherwise shift to left
+				if (i % 64 == 0) { *out = 0; }
+				else             { *out = (*out << 1); }
+				*out |= ((accumulator[i] > 0) ? 1 : 0);
+
+				i += 1;
+				// if all bits set, continue to next element
+				if (i % 64 == 0) { out++; }
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		const Array2D<uchar>::Data im;
+		const Array2D<int16_t>::Data filter_mask;
+		Array2D<uint64_t>::Data out;
+
+		const int WINX;
+		const int WINY;
+		float scaleX;
+		float scaleY;
+		const int width;
+		const int height;
+
+		// number of uint64_t values for each window
+		const int WSTEP = (NBITS - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+}
+
+#include <iostream>
+void StableMatchingCost::generateFilterMask(const int wsize, const int bits) {
+	if (bits > 127) {
+		// TODO: hardcoded in HammingCost template parameters
+		throw std::exception();
+	}
+	cv::Mat mask(cv::Size(wsize*wsize, 1), CV_16SC1, cv::Scalar(0));
+	if (!mask.isContinuous()) { throw std::exception(); }
+
+	unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+	std::default_random_engine generator(seed);
+	std::uniform_int_distribution<int16_t> distribution(1, bits*2);
+
+	for (int i = 0; i < mask.total(); i++) {
+		// indices run from 1 to bits (instead of 0 to bits-1); the value is
+		// truncated if it would fall outside the window
+		int16_t val = distribution(generator);
+		if (val <= bits) {
+			mask.at<int16_t>(i) = ((i + val) > mask.total()) ? bits - 1 : val;
+		}
+		else {
+			val = -(val - bits);
+			mask.at<int16_t>(i) = ((i + val) < 0) ? 0 : val;
+		}
+	}
+
+	wsize_ = wsize;
+	filter_mask_.create(wsize*wsize, 1);
+	filter_mask_.toGpuMat().upload(mask);
+}
+
+void StableMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::Stable<16>>({l.data(), filter_mask_.data(), stable_l_.data(), wsize_, wsize_, 1.0f, 1.0f, l.width, l.height}, l.width, l.height);
+	parallel2D<algorithms::Stable<16>>({r.data(), filter_mask_.data(), stable_r_.data(), wsize_, wsize_, 1.0f, 1.0f, r.width, r.height}, r.width, r.height);
+}
+
+void StableMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, size_t w, size_t h) {
+	float scaleX = float(l.width) / float(w);
+	float scaleY = float(l.height) / float(h);
+	parallel2D<algorithms::Stable<16>>({l.data(), filter_mask_.data(), stable_l_.data(), wsize_, wsize_, scaleX, scaleY, l.width, l.height}, w, h);
+	parallel2D<algorithms::Stable<16>>({r.data(), filter_mask_.data(), stable_r_.data(), wsize_, wsize_, scaleX, scaleY, r.width, r.height}, w, h);
+}
+
+void StableMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
diff --git a/lib/libstereo/src/costs/stable.hpp b/lib/libstereo/src/costs/stable.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7da102753505e668ec2e6dccd08af15cf92d8c24
--- /dev/null
+++ b/lib/libstereo/src/costs/stable.hpp
@@ -0,0 +1,41 @@
+#ifndef _FTL_LIBSTEREO_COSTS_STABLE_HPP_
+#define _FTL_LIBSTEREO_COSTS_STABLE_HPP_
+
+#include "census.hpp"
+
+/**
+ * STABLE descriptor matching cost
+ */
+class StableMatchingCost : public DSBase<impl::HammingCost<16>> {
+public:
+	typedef impl::HammingCost<16> DataType;
+	typedef DataType::Type Type;
+
+	StableMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	StableMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			wsize_(0), filter_mask_(0, 0),
+			stable_l_(width*data().WSTEP, height),
+			stable_r_(width*data().WSTEP, height)
+		{
+			data().l = stable_l_.data();
+			data().r = stable_r_.data();
+		}
+
+	void generateFilterMask(const int wsize, const int bits);
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r, size_t w, size_t h);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+	Array2D<int16_t> &getFilter() { return filter_mask_; }
+	void setFilter(const Array2D<int16_t> &f) { filter_mask_ = f; }
+
+protected:
+	int wsize_;
+	Array2D<int16_t> filter_mask_;
+	Array2D<uint64_t> stable_l_;
+	Array2D<uint64_t> stable_r_;
+};
+
+#endif
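
As a rough host-side summary of the STABLE transform implemented in stable.cu (hypothetical helper, not the CUDA kernel): each pixel of the patch adds or subtracts its intensity to one of NBITS accumulators, selected by the random signed mask from generateFilterMask(), each descriptor bit is the sign of its accumulator, and descriptors are then matched with the Hamming cost.

```cpp
#include <cstdint>
#include <cstdlib>
#include <vector>

// Sketch only: assumes nbits <= 64 (the kernel packs larger descriptors into
// several uint64_t words) and a mask laid out like the flattened patch.
uint64_t stable_descriptor(const std::vector<uint8_t> &patch,
                           const std::vector<int16_t> &mask, int nbits) {
	std::vector<int32_t> acc(nbits, 0);
	for (size_t i = 0; i < patch.size(); ++i) {
		const int16_t f = mask[i];                    // signed, 1-based bit index
		if (f == 0) continue;                         // unused mask entry
		acc[std::abs(f) - 1] += (f > 0 ? 1 : -1) * patch[i];
	}
	uint64_t desc = 0;
	for (int b = 0; b < nbits; ++b) {
		desc = (desc << 1) | (acc[b] > 0 ? 1u : 0u);  // bit = sign of accumulator
	}
	return desc;
}
```
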
diff --git a/lib/libstereo/src/costs/tcensus.cu b/lib/libstereo/src/costs/tcensus.cu
new file mode 100644
index 0000000000000000000000000000000000000000..450ff8047e6f6e8e5b177249992c738d8f8fc25a
--- /dev/null
+++ b/lib/libstereo/src/costs/tcensus.cu
@@ -0,0 +1,84 @@
+#include "tcensus.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+namespace algorithms {
+	template<int WINX, int WINY>
+	struct TCensusTransform {
+
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+
+			static_assert(BPP == 2, "2 bits per pixel expected");
+			short center = im(y, x);
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int y_ = y + wy;
+					const int x_ = x + wx;
+
+					// If first value, set to zero. Otherwise shift left by BPP
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << BPP); }
+
+					if 		(center-t > im(y_,x_))	{ *out |= 1;	/* 01 */ }
+					else if (center+t < im(y_,x_))	{ *out |= 2;	/* 10 */ }
+					else 							{				/* 00 */ }
+
+					i += BPP;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+		short t; // intensity threshold
+
+		// Bits per pixel (for each census feature). Must be 1 or power of 2 for
+		// window() to be correct (for tri-census)!
+		static constexpr int BPP = 2;
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (BPP*WINX*WINY-1)/(sizeof(uint64_t)*8) + 1;
+	};
+}
+
+void TCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::TCensusTransform<9,7>>({l.data(), ct_l_.data(), t_}, l.width, l.height);
+	parallel2D<algorithms::TCensusTransform<9,7>>({r.data(), ct_r_.data(), t_}, r.width, r.height);
+}
+
+void TCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
diff --git a/lib/libstereo/src/costs/tcensus.hpp b/lib/libstereo/src/costs/tcensus.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..74000c06b86adcad616199c8f300efc5048a9b8b
--- /dev/null
+++ b/lib/libstereo/src/costs/tcensus.hpp
@@ -0,0 +1,37 @@
+#ifndef _FTL_LIBSTEREO_COSTS_TCENSUS_HPP_
+#define _FTL_LIBSTEREO_COSTS_TCENSUS_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+#include "census.hpp"
+
+#include <cuda_runtime.h>
+
+class TCensusMatchingCost : public DSBase<impl::CensusMatchingCost<9,7,2>> {
+public:
+	typedef impl::CensusMatchingCost<9,7,2> DataType;
+	typedef unsigned short Type;
+
+	TCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	TCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height), t_(0)
+		{
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
+		}
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	void setT(uchar t) { t_ = t; }
+
+	static constexpr Type  COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	uchar t_;
+};
+
+#endif
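
For a single neighbour, the ternary (2 bits per pixel) encoding used by TCensusTransform boils down to the comparison below (standalone sketch, illustrative names). window() packs these 2-bit codes by shifting left, so a 9x7 window needs two uint64_t words per pixel.

```cpp
#include <cstdint>

// 01 if the neighbour is clearly darker than the centre, 10 if clearly
// brighter, 00 if it lies within +/- t of the centre.
inline uint64_t tcensus_code(int16_t center, int16_t neighbour, int16_t t) {
	if (center - t > neighbour) return 1;  // 01
	if (center + t < neighbour) return 2;  // 10
	return 0;                              // 00
}
```
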
diff --git a/lib/libstereo/src/dsi_tools.cu b/lib/libstereo/src/dsi_tools.cu
new file mode 100644
index 0000000000000000000000000000000000000000..19c9388fb1425c606c1593a9efafee85ebd19e06
--- /dev/null
+++ b/lib/libstereo/src/dsi_tools.cu
@@ -0,0 +1,32 @@
+#include <opencv2/core/cuda/common.hpp>
+#include "dsi_tools.hpp"
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+
+static void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
+	double dmin, dmax;
+	cv::minMaxLoc(disp, &dmin, &dmax);
+	cv::Mat dispf, dispc;
+	disp.convertTo(dispf, CV_32FC1);
+	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
+
+	#if OPENCV_VERSION >= 40102
+	cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
+	#else
+	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+	#endif
+}
+
+void show_dsi_slice(cv::InputArray in) {
+	cv::Mat dsitmp;
+
+	if (in.isGpuMat()) {
+		in.getGpuMat().download(dsitmp);
+	} else {
+		dsitmp = in.getMat();
+	}
+	float scale = 1280.0f / float(dsitmp.cols);
+	cv::resize(dsitmp, dsitmp, cv::Size(dsitmp.cols*scale, dsitmp.rows*scale));
+	colorize(dsitmp,dsitmp);
+	cv::imshow("DSI Slice", dsitmp);
+}
diff --git a/lib/libstereo/src/dsi_tools.hpp b/lib/libstereo/src/dsi_tools.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be27e02c43d3aabcc0f3fbe5435ccf4f9eb91d74
--- /dev/null
+++ b/lib/libstereo/src/dsi_tools.hpp
@@ -0,0 +1,39 @@
+#ifndef _FTL_LIBSTEREO_DSI_TOOLS_HPP_
+#define _FTL_LIBSTEREO_DSI_TOOLS_HPP_
+
+#include "util.hpp"
+#include "array2d.hpp"
+
+namespace algorithms {
+    template <typename DSI>
+    struct SliceDisparity {
+        typename DSI::DataType in;
+        typename Array2D<float>::Data disparity;
+        typename Array2D<typename DSI::Type>::Data out;
+
+        __cuda__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+            for (int y=thread.y; y<size.y; y+=stride.y) {
+                for (int x=thread.x; x<size.x; x+=stride.x) {
+                    auto d = disparity(y,x);
+                    out(y,x) = (d >= in.disp_min && d <= in.disp_max) ? in(y,x,d) : 255;
+                }
+            }
+        }
+    };
+}
+
+template <typename DSI>
+void dsi_slice(const DSI &in, int d, Array2D<typename DSI::Type> &out) {
+
+}
+
+/* Extract cost image from DSI using the disparity map as index. */
+template <typename DSI>
+void dsi_slice(const DSI &in, Array2D<float> &disp, Array2D<typename DSI::Type> &out) {
+    algorithms::SliceDisparity<DSI> sd = {in.data(), disp.data(), out.data()};
+    parallel2D(sd, in.width(), in.height());
+}
+
+void show_dsi_slice(cv::InputArray in);
+
+#endif
\ No newline at end of file
diff --git a/lib/libstereo/src/stereo_census_adaptive.cu b/lib/libstereo/src/stereo_census_adaptive.cu
index d0c6f105c4578d17275a90503b47b2477063bbd1..79db2fe9299c2c67375c1557b74cf3fd03f63319 100644
--- a/lib/libstereo/src/stereo_census_adaptive.cu
+++ b/lib/libstereo/src/stereo_census_adaptive.cu
@@ -57,8 +57,8 @@ struct StereoCensusAdaptive::Impl {
 	Array2D<unsigned short> uncertainty;
 	Array2D<float> disparity_r;
 	Array2D<uchar> l;
-    Array2D<uchar> r;
-    Array2D<uchar> penalty;
+	Array2D<uchar> r;
+	Array2D<unsigned short> penalty;
 
 	PathAggregator<AdaptivePenaltySGM<CensusMatchingCost::DataType>> aggr;
 	WinnerTakesAll<DSImage16U,float> wta;
@@ -67,8 +67,8 @@ struct StereoCensusAdaptive::Impl {
 		cost(width, height, min_disp, max_disp),
 		cost_min_paths(width, height),
 		uncertainty(width, height),
-        disparity_r(width, height), l(width, height), r(width, height),
-        penalty(width, height) {}
+		disparity_r(width, height), l(width, height), r(width, height),
+		penalty(width, height) {}
 
 };
 
@@ -92,17 +92,17 @@ void StereoCensusAdaptive::compute(cv::InputArray l, cv::InputArray r, cv::Outpu
 	impl_->cost.set(impl_->l, impl_->r);
 
 	cudaSafeCall(cudaDeviceSynchronize());
-    if (params.debug) { timer_print("census transform"); }
-    
-    impl_->penalty.toGpuMat().setTo(params.P1*4);
-    auto canny = cv::cuda::createCannyEdgeDetector(30,120);
-    cv::cuda::GpuMat edges;
-    canny->detect(impl_->l.toGpuMat(), edges);
-    impl_->penalty.toGpuMat().setTo(int(float(params.P1)*1.5f), edges);
-
-    cv::Mat penalties;
-    impl_->penalty.toGpuMat().download(penalties);
-    cv::imshow("Penalties", penalties);
+	if (params.debug) { timer_print("census transform"); }
+
+	impl_->penalty.toGpuMat().setTo(params.P1*4);
+	auto canny = cv::cuda::createCannyEdgeDetector(30,120);
+	cv::cuda::GpuMat edges;
+	canny->detect(impl_->l.toGpuMat(), edges);
+	impl_->penalty.toGpuMat().setTo(int(float(params.P1)*1.5f), edges);
+
+	cv::Mat penalties;
+	impl_->penalty.toGpuMat().download(penalties);
+	cv::imshow("Penalties", penalties);
 
 	// cost aggregation
 	AdaptivePenaltySGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1};
diff --git a/lib/libstereo/src/stereo_cp_censussgm.cu b/lib/libstereo/src/stereo_cp_censussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bf63e6b3802c2d0997a642036eb6abc37ea68a8c
--- /dev/null
+++ b/lib/libstereo/src/stereo_cp_censussgm.cu
@@ -0,0 +1,149 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/highgui.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/census.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/cp_sgm.hpp"
+
+#include "median_filter.hpp"
+#include "dsi_tools.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::ConfidencePriorSGM;
+
+typedef CensusMatchingCost MatchingCost;
+
+struct StereoCPCensusSgm::Impl {
+	MatchingCost cost;
+	Array2D<MatchingCost::Type> cost_min_paths;
+	Array2D<MatchingCost::Type> uncertainty;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<uchar> conf_prior;
+	Array2D<float> disparity;
+
+	PathAggregator<ConfidencePriorSGM<MatchingCost::DataType>> aggr;
+	WinnerTakesAll<DisparitySpaceImage<MatchingCost::Type>,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		l(width, height), r(width, height),
+		conf_prior(width, height),
+		disparity(width, height)
+		{}
+
+};
+
+StereoCPCensusSgm::StereoCPCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoCPCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+		impl_->conf_prior.toGpuMat().setTo(0);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	timer_set();
+
+	// CT
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("census transform"); }
+
+	// cost aggregation
+	ConfidencePriorSGM<MatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), impl_->conf_prior.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, params.subpixel, params.lr_consistency);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+
+	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+
+	#if USE_GPU
+	auto uncertainty = impl_->uncertainty.toGpuMat();
+	cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+	cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	dsi_slice(out, impl_->wta.disparity, dsitmp_dev);
+	cv::cuda::compare(dsitmp_dev.toGpuMat(), float(params.P2)+2.0f*(float(MatchingCost::COST_MAX)*0.5f), uncertainty, cv::CMP_GT);
+	if (params.filtercosts) impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	#else
+	auto uncertainty = impl_->uncertainty.toMat();
+	cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
+	cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toMat().setTo(0, uncertainty);
+	#endif
+
+	median_filter(impl_->wta.disparity, impl_->disparity.toGpuMat());
+	impl_->wta.disparity.toGpuMat().convertTo(impl_->conf_prior.toGpuMat(), CV_8UC1);
+	impl_->disparity.toGpuMat().download(disparity);
+	if (params.debug) { timer_print("median filter"); }
+
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoCPCensusSgm::~StereoCPCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_sad.cu b/lib/libstereo/src/stereo_sad.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ef390db8f6048d9e0b1f07efdad250f2931091b5
--- /dev/null
+++ b/lib/libstereo/src/stereo_sad.cu
@@ -0,0 +1,93 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/sad.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+
+#include "median_filter.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+
+struct StereoSad::Impl {
+	SADMatchingCost cost;
+	Array2D<float> disparity_r;
+
+	WinnerTakesAll<SADMatchingCost, float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		disparity_r(width, height) {}
+
+};
+
+StereoSad::StereoSad() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoSad::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	timer_set();
+
+	impl_->cost.setWindow(params.wsize, params.wsize);
+	impl_->cost.set(l, r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+
+	impl_->wta(impl_->cost, params.subpixel);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	median_filter(impl_->wta.disparity, disparity);
+	if (params.debug) { timer_print("median filter"); }
+}
+
+StereoSad::~StereoSad() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_varcensussgm.cu b/lib/libstereo/src/stereo_varcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0bea1b7a0c4dc7f08f81c0927d71f4c6c20d6f7f
--- /dev/null
+++ b/lib/libstereo/src/stereo_varcensussgm.cu
@@ -0,0 +1,179 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/cudafilters.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+#include "aggregations/adaptive_penalty.hpp"
+#include "median_filter.hpp"
+#include "dsi_tools.hpp"
+
+#include "costs/census.hpp"
+#include "costs/scale.hpp"
+#include <chrono>
+#include <iostream>
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::AdaptivePenaltySGM;
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+typedef unsigned short CostType;
+typedef WeightedCost<CensusMatchingCost, CostType> MatchingCost;
+
+struct StereoVarCensus::Impl {
+	CensusMatchingCost census;
+	Array2D<float> variance;
+	Array2D<float> variance_r;
+	MatchingCost cost;
+
+	Array2D<CostType> penalty;
+	Array2D<CostType> cost_min;
+	Array2D<CostType> cost_min_paths;
+	Array2D<CostType> uncertainty;
+	Array2D<float> confidence;
+
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Mat prior; // used only to calculate MI
+
+	PathAggregator<AdaptivePenaltySGM<MatchingCost::DataType>> aggr;
+
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		census(width, height, min_disp, max_disp),
+		variance(width, height),
+		variance_r(width, height),
+		cost(width, height, min_disp, max_disp, census, variance, variance_r),
+
+		penalty(width, height),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		l(width, height), r(width, height) {}
+};
+
+StereoVarCensus::StereoVarCensus() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoVarCensus::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	timer_set();
+
+	cv::cuda::GpuMat var_l = impl_->variance.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_l, params.var_window);
+	cv::cuda::GpuMat var_r = impl_->variance_r.toGpuMat();
+	variance_mask(impl_->r.toGpuMat(), var_r, params.var_window);
+
+	cv::cuda::normalize(var_l, var_l, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::normalize(var_r, var_r, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	impl_->census.set(impl_->l, impl_->r);
+	impl_->cost.set();
+
+	if (params.debug) { timer_print("Matching cost"); }
+
+	cudaSafeCall(cudaDeviceSynchronize());
+
+	auto penalty = impl_->penalty.toGpuMat();
+	penalty.setTo(params.P2);
+
+	AdaptivePenaltySGM<MatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), short(params.P1)};
+
+	impl_->aggr.getDirectionData(AggregationDirections::LEFTRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::RIGHTLEFT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::UPDOWN).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::DOWNUP).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::TOPLEFTBOTTOMRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::BOTTOMRIGHTTOPLEFT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::BOTTOMLEFTTOPRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::TOPRIGHTBOTTOMLEFT).penalties = impl_->penalty;
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, params.subpixel, params.lr_consistency);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	#if USE_GPU
+	auto uncertainty = impl_->uncertainty.toGpuMat();
+	cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+	cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	#else
+	auto uncertainty = impl_->uncertainty.toMat();
+	cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
+	cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toMat().setTo(0, uncertainty);
+	#endif
+
+	median_filter(impl_->wta.disparity, disparity);
+	if (params.debug) { timer_print("median filter"); }
+
+	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(out, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoVarCensus::~StereoVarCensus() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
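
variance_mask above only implements the GPU path. A hedged sketch of what the CPU fallback might look like, using the same E[X^2] - (E[X])^2 identity with ordinary box filters (helper name is hypothetical):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

static void variance_mask_cpu(const cv::Mat &in, cv::Mat &out, int wsize = 3) {
	cv::Mat im, im2, mean, mean2;
	in.convertTo(im, CV_32FC1);
	cv::multiply(im, im, im2);
	cv::boxFilter(im,  mean,  CV_32FC1, cv::Size(wsize, wsize));  // E[X]
	cv::boxFilter(im2, mean2, CV_32FC1, cv::Size(wsize, wsize));  // E[X^2]
	cv::multiply(mean, mean, mean);                               // (E[X])^2
	cv::subtract(mean2, mean, out);                               // Var[X]
}
```
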
diff --git a/lib/libstereo/src/stereo_misgm2.cu b/lib/libstereo/src/stereo_wadcensus.cu
similarity index 61%
rename from lib/libstereo/src/stereo_misgm2.cu
rename to lib/libstereo/src/stereo_wadcensus.cu
index f2fba6f81107bc1359af30a22ab0271954b83cee..4787894f9d960a8dbdebfa81af64736832736885 100644
--- a/lib/libstereo/src/stereo_misgm2.cu
+++ b/lib/libstereo/src/stereo_wadcensus.cu
@@ -7,7 +7,6 @@
 #include "stereo.hpp"
 
 #include "util_opencv.hpp"
-#include "costs/mutual_information.hpp"
 #include "dsi.hpp"
 
 #include "wta.hpp"
@@ -16,38 +15,17 @@
 #include "aggregations/adaptive_penalty.hpp"
 #include "median_filter.hpp"
 
-#include "costs/dual.hpp"
 #include "costs/census.hpp"
-
-#ifdef __GNUG__
+#include "costs/sad.hpp"
+#include "costs/dual.hpp"
+#include "costs/scale.hpp"
 
 #include <chrono>
 #include <iostream>
 
-static std::chrono::time_point<std::chrono::system_clock> start;
-
-static void timer_set() {
-		start = std::chrono::high_resolution_clock::now();
-}
-
-static void timer_print(const std::string &msg, const bool reset=true) {
-	auto stop = std::chrono::high_resolution_clock::now();
-
-	char buf[24];
-	snprintf(buf, sizeof(buf), "%5i ms  ",
-				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
-
-	std::cout << buf <<  msg << "\n" << std::flush;
-	if (reset) { timer_set(); }
-}
-
-#else
-
 static void timer_set() {}
 static void timer_print(const std::string &msg, const bool reset=true) {}
 
-#endif
-
 using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::AdaptivePenaltySGM;
@@ -83,17 +61,21 @@ static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
 	else { throw std::exception(); /* todo CPU version */ }
 }
 
-struct StereoMiSgm2::Impl {
+typedef DualCostsWeighted<LinearCost<CensusMatchingCost>, ExpCost<SADMatchingCost>> MatchingCost;
+
+struct StereoWADCensus::Impl {
 	CensusMatchingCost census;
-	MutualInformationMatchingCost mi;
+	LinearCost<CensusMatchingCost> exp_census;
+	SADMatchingCost sad;
+	ExpCost<SADMatchingCost> exp_sad;
 	Array2D<float> variance;
 	Array2D<float> variance_r;
-	DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost> cost;
+	MatchingCost cost;
 
-	Array2D<unsigned char> penalty;
-	Array2D<unsigned short> cost_min;
-	Array2D<unsigned short> cost_min_paths;
-	Array2D<unsigned short> uncertainty;
+	Array2D<float> penalty;
+	Array2D<float> cost_min;
+	Array2D<float> cost_min_paths;
+	Array2D<float> uncertainty;
 	Array2D<float> confidence;
 	Array2D<float> disparity_r;
 	Array2D<uchar> l;
@@ -101,15 +83,19 @@ struct StereoMiSgm2::Impl {
 
 	Mat prior; // used only to calculate MI
 
-	PathAggregator<AdaptivePenaltySGM<DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost>::DataType>> aggr;
-	WinnerTakesAll<DSImage16U,float> wta;
+	PathAggregator<AdaptivePenaltySGM<MatchingCost::DataType>> aggr;
+
+	WinnerTakesAll<DSImageFloat,float> wta;
 
 	Impl(int width, int height, int min_disp, int max_disp) :
 		census(width, height, min_disp, max_disp),
-		mi(width, height, min_disp, max_disp),
+		exp_census(width, height, min_disp, max_disp, census, 1.0),
+		sad(width, height, min_disp, max_disp),
+		exp_sad(width, height, min_disp, max_disp, sad, 1.0),
+
 		variance(width, height),
 		variance_r(width, height),
-		cost(width, height, min_disp, max_disp, census, mi, variance, variance_r),
+		cost(width, height, min_disp, max_disp, exp_census, exp_sad, variance, variance_r),
 		penalty(width, height),
 		cost_min(width, height),
 		cost_min_paths(width, height),
@@ -119,25 +105,11 @@ struct StereoMiSgm2::Impl {
 		l(width, height), r(width, height) {}
 };
 
-StereoMiSgm2::StereoMiSgm2() : impl_(nullptr) {
+StereoWADCensus::StereoWADCensus() : impl_(nullptr) {
 	impl_ = new Impl(0, 0, 0, 0);
 }
 
-void StereoMiSgm2::setPrior(cv::InputArray prior) {
-	if (prior.rows() != impl_->cost.height() || prior.cols() != impl_->cost.width()) {
-		return;
-	}
-	if (prior.isGpuMat()) {
-		prior.getGpuMat().download(impl_->prior);
-	}
-	else {
-		prior.getMat().copyTo(impl_->prior);
-	}
-}
-
-#include <opencv2/highgui.hpp>
-
-void StereoMiSgm2::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+void StereoWADCensus::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
 	cudaSetDevice(0);
 
 	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
@@ -145,37 +117,25 @@ void StereoMiSgm2::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray d
 		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
 	}
 
-	if (impl_->prior.empty() || impl_->prior.size() != l.size()) {
-		// if prior disparity is missing, use random values from uniform
-		// distribution
-		impl_->prior.create(l.rows(), l.cols(), CV_32FC1);
-		cv::randu(impl_->prior, params.d_min, params.d_max);
-	}
-
 	mat2gray(l, impl_->l);
 	mat2gray(r, impl_->r);
 
 	timer_set();
 
 	cv::cuda::GpuMat var_l = impl_->variance.toGpuMat();
-	variance_mask(impl_->l.toGpuMat(), var_l, 17);
+	variance_mask(impl_->l.toGpuMat(), var_l, params.var_window);
 	cv::cuda::GpuMat var_r = impl_->variance_r.toGpuMat();
-	variance_mask(impl_->r.toGpuMat(), var_r, 17);
+	variance_mask(impl_->r.toGpuMat(), var_r, params.var_window);
 
 	cv::cuda::normalize(var_l, var_l, params.alpha, params.beta, cv::NORM_MINMAX, -1);
 	cv::cuda::normalize(var_r, var_r, params.alpha, params.beta, cv::NORM_MINMAX, -1);
 
-	if (l.isMat()) {
-		impl_->mi.set(l.getMat(), r.getMat(), impl_->prior);
-	}
-	else if (l.isGpuMat()) {
-		Mat l_;
-		Mat r_;
-		l.getGpuMat().download(l_);
-		r.getGpuMat().download(r_);
-		impl_->mi.set(l_, r_, impl_->prior);
-	}
-	impl_->census.set( impl_->l, impl_->r);
+	impl_->census.set(impl_->l, impl_->r);
+	impl_->exp_census.set(params.l1);
+
+	impl_->sad.setWindow(params.wsize, params.wsize);
+	impl_->sad.set(l, r);
+	impl_->exp_sad.set(params.l2);
 	impl_->cost.set();
 	if (params.debug) { timer_print("Matching cost"); }
 
@@ -184,13 +144,8 @@ void StereoMiSgm2::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray d
 	auto penalty = impl_->penalty.toGpuMat();
 	penalty.setTo(params.P2);
 
-	/*cv::cuda::GpuMat mask(penalty.size(), CV_8UC1);
-	cv::cuda::compare(var_l, 0.25, mask, cv::CMP_LT);
-	penalty.setTo(params.P2*2, mask);
-	cv::cuda::compare(var_l, 0.75, mask, cv::CMP_GT);
-	penalty.setTo(params.P1, mask);*/
+	AdaptivePenaltySGM<MatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1};
 
-	AdaptivePenaltySGM<DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost>::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1};
 	impl_->aggr.getDirectionData(AggregationDirections::LEFTRIGHT).penalties = impl_->penalty;
 	impl_->aggr.getDirectionData(AggregationDirections::RIGHTLEFT).penalties = impl_->penalty;
 	impl_->aggr.getDirectionData(AggregationDirections::UPDOWN).penalties = impl_->penalty;
@@ -212,7 +167,7 @@ void StereoMiSgm2::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray d
 	if (params.debug) { timer_print("median filter"); }
 }
 
-StereoMiSgm2::~StereoMiSgm2() {
+StereoWADCensus::~StereoWADCensus() {
 	if (impl_) {
 		delete impl_;
 		impl_ = nullptr;
diff --git a/lib/libstereo/src/stereo_censussgm.cu b/lib/libstereo/src/stereo_wcensussgm.cu
similarity index 62%
rename from lib/libstereo/src/stereo_censussgm.cu
rename to lib/libstereo/src/stereo_wcensussgm.cu
index c74782d3c511dfd91f7fdfe4bcd29083befa0367..3ea46621f5c7029614d7a7e47f3a2f5c4a4c1ae7 100644
--- a/lib/libstereo/src/stereo_censussgm.cu
+++ b/lib/libstereo/src/stereo_wcensussgm.cu
@@ -15,6 +15,7 @@
 #include "aggregations/standard_sgm.hpp"
 
 #include "median_filter.hpp"
+#include "dsi_tools.hpp"
 
 #ifdef __GNUG__
 
@@ -49,31 +50,32 @@ using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::StandardSGM;
 
-struct StereoCensusSgm::Impl {
-	//DisparitySpaceImage<unsigned short> dsi;
-	CensusMatchingCost cost;
-	Array2D<unsigned short> cost_min_paths;
-	Array2D<unsigned short> uncertainty;
-	Array2D<float> disparity_r;
+typedef WeightedCensusMatchingCost MatchingCost;
+
+struct StereoWCensusSgm::Impl {
+	MatchingCost cost;
+	Array2D<MatchingCost::Type> cost_min_paths;
+	Array2D<MatchingCost::Type> uncertainty;
 	Array2D<uchar> l;
 	Array2D<uchar> r;
 
-	PathAggregator<StandardSGM<CensusMatchingCost::DataType>> aggr;
-	WinnerTakesAll<DSImage16U,float> wta;
+	PathAggregator<StandardSGM<MatchingCost::DataType>> aggr;
+	WinnerTakesAll<DisparitySpaceImage<MatchingCost::Type>,float> wta;
 
 	Impl(int width, int height, int min_disp, int max_disp) :
 		cost(width, height, min_disp, max_disp),
 		cost_min_paths(width, height),
 		uncertainty(width, height),
-		disparity_r(width, height), l(width, height), r(width, height) {}
+		l(width, height), r(width, height)
+		{}
 
 };
 
-StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
+StereoWCensusSgm::StereoWCensusSgm() : impl_(nullptr) {
 	impl_ = new Impl(0, 0, 0, 0);
 }
 
-void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+void StereoWCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
 	cudaSetDevice(0);
 
 	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
@@ -92,13 +94,13 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
 	if (params.debug) { timer_print("census transform"); }
 
 	// cost aggregation
-	StandardSGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
+	StandardSGM<MatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out = impl_->aggr(func, params.paths);
 
 	cudaSafeCall(cudaDeviceSynchronize());
 	if (params.debug) { timer_print("Aggregation"); }
 
-	impl_->wta(out, 0);
+	impl_->wta(out, params.subpixel, params.lr_consistency);
 	cudaSafeCall(cudaDeviceSynchronize());
 	if (params.debug) { timer_print("WTA"); }
 
@@ -108,24 +110,27 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
 	// Lecture Notes in Artificial Intelligence and Lecture Notes in
 	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
 
-	if (disparity.isGpuMat()) {
-		auto uncertainty = impl_->uncertainty.toGpuMat();
-		cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
-		cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-		impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
-	}
-	else {
-		auto uncertainty = impl_->uncertainty.toMat();
-		cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
-		cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-		impl_->wta.disparity.toMat().setTo(0, uncertainty);
-	}
+	#if USE_GPU
+	auto uncertainty = impl_->uncertainty.toGpuMat();
+	cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+	cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	#else
+	auto uncertainty = impl_->uncertainty.toMat();
+	cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
+	cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+	impl_->wta.disparity.toMat().setTo(0, uncertainty);
+	#endif
 
 	median_filter(impl_->wta.disparity, disparity);
 	if (params.debug) { timer_print("median filter"); }
+
+	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(out, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
 }
 
-StereoCensusSgm::~StereoCensusSgm() {
+StereoWCensusSgm::~StereoWCensusSgm() {
 	if (impl_) {
 		delete impl_;
 		impl_ = nullptr;
diff --git a/lib/libstereo/src/wta.hpp b/lib/libstereo/src/wta.hpp
index b23325ebdba34c06aa7e12e12625cf1d6835158b..bfb51edcf21195704af6c587be811ae791f68014 100644
--- a/lib/libstereo/src/wta.hpp
+++ b/lib/libstereo/src/wta.hpp
@@ -171,7 +171,7 @@ struct WinnerTakesAll {
 	Array2D<TDisp> disparity_right;
 	Array2D<typename DSI::Type> min_cost;
 
-	Array2D<TDisp> &operator()(const DSI& dsi, const int subpixel=0) {
+	Array2D<TDisp> &operator()(const DSI& dsi, const int subpixel=0, const bool lr_consistency=true) {
 		disparity.create(dsi.width(), dsi.height());
 		disparity_right.create(dsi.width(), dsi.height());
 		min_cost.create(dsi.width(), dsi.height());
@@ -193,7 +193,9 @@ struct WinnerTakesAll {
 		algorithms::WTADiagonal<DSI,float> wtadiag = {dsi.data(), disparity_right.data()};
 		parallel2D(wtadiag, dsi.width(), dsi.height());
 
-		parallel2D<algorithms::ConsistencyCheck<float>>({disparity.data(), disparity_right.data()}, dsi.width(), dsi.height());
+		if (lr_consistency) {
+			parallel2D<algorithms::ConsistencyCheck<float>>({disparity.data(), disparity_right.data()}, dsi.width(), dsi.height());
+		}
 		return disparity;
 	}
 };