diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index 28dd9450169064f770542b73577fca43be8c6423..d3922b3c96793ed3816e3304ceed5400d20a0359 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -38,9 +38,10 @@ if (LIBSTEREO_SHARED)
                 src/stereo_gradientstree.cu
                 src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
-                src/stereo_censussgm.cu
+                src/algorithms/censussgm.cu
+                src/algorithms/stablesgm.cu
+                src/algorithms/tcensussgm.cu
                 src/stereo_hier_census.cu
-                src/stereo_tcensussgm.cu
                 src/stereo_wcensussgm.cu
                 src/stereo_census_adaptive.cu
                 src/stereo_cp_censussgm.cu
@@ -54,6 +55,7 @@ if (LIBSTEREO_SHARED)
                 src/costs/tcensus.cu
                 src/costs/gradient.cu
                 src/costs/sad.cu
+                src/costs/stable.cu
                 src/costs/mutual_information.cu
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
@@ -66,9 +68,10 @@ else()
                 src/stereo_gradientstree.cu
                 src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
-                src/stereo_censussgm.cu
+                src/algorithms/censussgm.cu
+                src/algorithms/stablesgm.cu
+                src/algorithms/tcensussgm.cu
                 src/stereo_hier_census.cu
-                src/stereo_tcensussgm.cu
                 src/stereo_wcensussgm.cu
                 src/stereo_census_adaptive.cu
                 src/stereo_cp_censussgm.cu
@@ -82,6 +85,7 @@ else()
                 src/costs/tcensus.cu
                 src/costs/gradient.cu
                 src/costs/sad.cu
+                src/costs/stable.cu
                 src/costs/mutual_information.cu
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index abc4b49d05eea4e76dc83a5974644b26e4321f06..4b4091ffc73d94a9a86bb1f7e7df51ab6c69c5f6 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -85,6 +85,37 @@ private:
 	Impl *impl_;
 };
 
+
+class StereoStableSgm {
+public:
+	StereoStableSgm();
+	~StereoStableSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		short wsize = 17;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+
 class StereoHierCensusSgm {
 public:
 	StereoHierCensusSgm();
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
index 038ab158d831056e0afcdc324e5968a5799b9eea..5b30d3686a82bde965b61e43cb12f7250be52cd1 100644
--- a/lib/libstereo/middlebury/main.cpp
+++ b/lib/libstereo/middlebury/main.cpp
@@ -27,11 +27,23 @@ static void run_censussgm(MiddleburyData &data, cv::Mat &disparity) {
 	stereo.compute(data.imL, data.imR, disparity);
 }
 
+static void run_stablesgm(MiddleburyData &data, cv::Mat &disparity) {
+	auto stereo = StereoStableSgm();
+	stereo.params.P1 = 8;
+	stereo.params.P2 = 24;
+	stereo.params.wsize = 13;
+	stereo.params.d_min = data.calib.vmin;
+	stereo.params.d_max = data.calib.vmax;
+	stereo.params.subpixel = 1;
+	stereo.params.lr_consistency = true;
+	stereo.params.debug = false;
+	stereo.compute(data.imL, data.imR, disparity);
+}
+
 static void run_hcensussgm(MiddleburyData &data, cv::Mat &disparity) {
 	auto stereo = StereoHierCensusSgm();
 	stereo.params.P1 = 24;
 	stereo.params.P2 = 64;
-
 	stereo.params.d_min = data.calib.vmin;
 	stereo.params.d_max = data.calib.vmax;
 	stereo.params.subpixel = 1;
@@ -168,6 +180,7 @@ static const std::map<std::string, std::function<void(MiddleburyData&, cv::Mat&)
 	{ "wcensussgm", run_wcensussgm },
 	//{ "misgm", run_misgm },
 	{ "varcensus", run_varcensus },
+	{ "stablesgm", run_stablesgm },
 	//{ "sad", run_sad },
 };
 
diff --git a/lib/libstereo/src/algorithms/censussgm.cu b/lib/libstereo/src/algorithms/censussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8661ec3a994809e31081cb1e176280aa9cedb9f2
--- /dev/null
+++ b/lib/libstereo/src/algorithms/censussgm.cu
@@ -0,0 +1,39 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+
+struct StereoCensusSgm::Impl : public StereoSgm<CensusMatchingCost, StereoCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoCensusSgm::~StereoCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/stablesgm.cu b/lib/libstereo/src/algorithms/stablesgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..335ee20d8495eb15e87ff2bc68cfb462ae027b93
--- /dev/null
+++ b/lib/libstereo/src/algorithms/stablesgm.cu
@@ -0,0 +1,40 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/stable.hpp"
+
+struct StereoStableSgm::Impl : public StereoSgm<StableMatchingCost, StereoStableSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoStableSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoStableSgm::StereoStableSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoStableSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.generateFilterMask(params.wsize, 127); // hardcoded in implementation
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoStableSgm::~StereoStableSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/stereosgm.hpp b/lib/libstereo/src/algorithms/stereosgm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..114bc501de437a4007165ba7e4a694d52e09a94e
--- /dev/null
+++ b/lib/libstereo/src/algorithms/stereosgm.hpp
@@ -0,0 +1,101 @@
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#include "median_filter.hpp"
+#include "dsi_tools.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+template<typename MatchingCostT, typename ParamsT>
+struct StereoSgm {
+	ParamsT &params;
+	MatchingCostT cost;
+	Array2D<typename MatchingCostT::Type> cost_min_paths;
+	Array2D<typename MatchingCostT::Type> uncertainty;
+
+	PathAggregator<ftl::stereo::aggregations::StandardSGM<typename MatchingCostT::DataType>> aggr;
+	WinnerTakesAll<DisparitySpaceImage<typename MatchingCostT::Type>, float> wta;
+
+	StereoSgm(ParamsT &params, int width, int height, int min_disp, int max_disp) :
+		params(params),
+		cost(width, height, min_disp, max_disp),
+		cost_min_paths(width, height),
+		uncertainty(width, height)
+		{}
+
+	void compute(cv::OutputArray disparity) {
+
+		// cost aggregation
+		ftl::stereo::aggregations::StandardSGM<typename MatchingCostT::DataType> func = {cost.data(), cost_min_paths.data(), params.P1, params.P2};
+		auto &out = aggr(func, params.paths);
+
+		cudaSafeCall(cudaDeviceSynchronize());
+		if (params.debug) { timer_print("Aggregation"); }
+
+		wta(out, params.subpixel, params.lr_consistency);
+		cudaSafeCall(cudaDeviceSynchronize());
+		if (params.debug) { timer_print("WTA"); }
+
+		// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+		// Semi-global matching: A principled derivation in terms of
+		// message passing. Lecture Notes in Computer Science (Including Subseries
+		// Lecture Notes in Artificial Intelligence and Lecture Notes in
+		// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+
+		#if USE_GPU
+		cv::cuda::GpuMat uncertainty_gpumat = uncertainty.toGpuMat();
+		cv::cuda::subtract(wta.min_cost.toGpuMat(), cost_min_paths.toGpuMat(), uncertainty_gpumat);
+		cv::cuda::compare(uncertainty_gpumat, params.uniqueness, uncertainty_gpumat, cv::CMP_GT);
+		wta.disparity.toGpuMat().setTo(0, uncertainty_gpumat);
+		#else
+		cv::Mat uncertainty_mat = uncertainty.toMat();
+		cv::subtract(wta.min_cost.toMat(), cost_min_paths.toMat(), uncertainty_mat);
+		cv::compare(uncertainty_mat, params.uniqueness, uncertainty, cv::CMP_GT);
+		wta.disparity.toMat().setTo(0, uncertainty_mat);
+		#endif
+
+		median_filter(wta.disparity, disparity);
+		if (params.debug) { timer_print("median filter"); }
+	}
+};
diff --git a/lib/libstereo/src/algorithms/tcensussgm.cu b/lib/libstereo/src/algorithms/tcensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..131405107a6fe2a92b69cb1c57dc99ff329ae101
--- /dev/null
+++ b/lib/libstereo/src/algorithms/tcensussgm.cu
@@ -0,0 +1,40 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/tcensus.hpp"
+
+struct StereoTCensusSgm::Impl : public StereoSgm<TCensusMatchingCost, StereoTCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoTCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoTCensusSgm::StereoTCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoTCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	timer_set();
+
+	impl_->cost.setT(params.t);
+	impl_->cost.set(impl_->l, impl_->r);
+	impl_->compute(disparity);
+}
+
+StereoTCensusSgm::~StereoTCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
index 6dfb59b15e31d192333383852dd499d06a34d142..24f6364ec21630f9653dc4cbbaceeba2bf3e972d 100644
--- a/lib/libstereo/src/costs/census.hpp
+++ b/lib/libstereo/src/costs/census.hpp
@@ -19,32 +19,38 @@ namespace impl {
 		#endif
 	}
 
-	template<uint8_t WW, uint8_t WH, int BPP=1>
-	struct CensusMatchingCost : DSImplBase<unsigned short> {
+	/**
+	 * Hamming cost, template parameter number of bits
+	 */
+	template<int SIZE>
+	struct HammingCost : DSImplBase<unsigned short> {
 		typedef unsigned short Type;
 
-		CensusMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}) {}
-		CensusMatchingCost() : DSImplBase<unsigned short>({0,0,0,0}) {}
+		HammingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<Type>({w,h,dmin,dmax}) {}
+		HammingCost() : DSImplBase<Type>({0,0,0,0}) {}
 
-		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+		__host__ __device__ inline Type operator()(const int y, const int x, const int d) const {
 			if ((x-d) < 0) { return COST_MAX; }
 			unsigned short c = 0;
 
 			#pragma unroll
 			for (int i = 0; i < WSTEP; i++) {
-				c+= popcount(ct_l(y, x*WSTEP+i) ^ ct_r(y, (x-d)*WSTEP+i));
+				c+= popcount(l(y, x*WSTEP+i) ^ r(y, (x-d)*WSTEP+i));
 			}
 			return c;
 		}
 
 		// number of uint64_t values for each window
-		static constexpr int WSTEP = (BPP*WW*WH - 1)/(sizeof(uint64_t)*8) + 1;
-		static constexpr Type COST_MAX = WW*WH*BPP;
+		static constexpr int WSTEP = (SIZE - 1)/(sizeof(uint64_t)*8) + 1;
+		static constexpr Type COST_MAX = SIZE;
 
-		Array2D<uint64_t>::Data ct_l;
-		Array2D<uint64_t>::Data ct_r;
+		Array2D<uint64_t>::Data l;
+		Array2D<uint64_t>::Data r;
 	};
 
+	template<uint8_t WW, uint8_t WH, int BPP=1>
+	using CensusMatchingCost = HammingCost<WW*WH*BPP>;
+
 	template<uint8_t R, uint8_t NBINS>
 	struct WeightedCensusMatchingCost : DSImplBase<unsigned short> {
 		static_assert(R % 2 == 1, "R must be odd");
@@ -152,8 +158,8 @@ public:
 		: DSBase<DataType>(width, height, disp_min, disp_max),
 			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
 		{
-			data().ct_l = ct_l_.data();
-			data().ct_r = ct_r_.data();
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
 		}
 
 	void set(cv::InputArray l, cv::InputArray r);
diff --git a/lib/libstereo/src/costs/stable.cu b/lib/libstereo/src/costs/stable.cu
new file mode 100644
index 0000000000000000000000000000000000000000..21db97678804887c4d8f3de469045c9eaab3603f
--- /dev/null
+++ b/lib/libstereo/src/costs/stable.cu
@@ -0,0 +1,119 @@
+#include "stable.hpp"
+#include "census.hpp"
+#include "../util.hpp"
+#include "../util_opencv.hpp"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core.hpp>
+
+#include <random>
+#include <chrono>
+
+#include <cuda_runtime.h>
+
+namespace algorithms {
+	template<int NBITS>
+	struct Stable {
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+			int32_t accumulator[NBITS] = {0};
+			uint16_t i = 0;
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int16_t value = im(y + wy, x + wx);
+					const int16_t filter = filter_mask(0, i++);
+					const int16_t sign = filter > 0 ? 1 : -1;
+					// NOTE: indexing starts from 1
+					const int16_t index = int(abs(filter)) - 1;
+					accumulator[index] += sign*value;
+				}
+			}
+
+			for (i = 0; i < NBITS;) {
+				// zero if first value, otherwise shift to left
+				if (i % 64 == 0) { *out = 0; }
+				else             { *out = (*out << 1); }
+				*out |= ((accumulator[i] > 0) ? 1 : 0);
+
+				i += 1;
+				// if all bits set, continue to next element
+				if (i % 64 == 0) { out++; }
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		const Array2D<uchar>::Data im;
+		const Array2D<int16_t>::Data filter_mask;
+		Array2D<uint64_t>::Data out;
+
+		const int WINX;
+		const int WINY;
+
+		// number of uint64_t values for each window
+		const int WSTEP = (NBITS - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
+}
+
+#include <iostream>
+void StableMatchingCost::generateFilterMask(const int wsize, const int bits) {
+	if (bits > 127) {
+		// TODO: hardcoded in HammingCost template parameters
+		throw std::exception();
+	}
+	cv::Mat mask(cv::Size(wsize*wsize, 1), CV_16SC1, cv::Scalar(0));
+	if (!mask.isContinuous()) { throw std::exception(); }
+
+	unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+	std::default_random_engine generator(seed);
+	std::uniform_int_distribution<int16_t> distribution(1, bits*2);
+
+	for (int i = 0; i < mask.total(); i++) {
+		// index from 1 to bits (instead of 0 to bits-1) value truncated if
+		// outside window
+		int16_t val = distribution(generator);
+		if (val <= bits) {
+			mask.at<int16_t>(i) = ((i + val) > mask.total()) ? bits - 1 : val;
+		}
+		else {
+			val = -(val - bits);
+			mask.at<int16_t>(i) = ((i + val) < 0) ? 0 : val;
+		}
+	}
+
+	wsize_ = wsize;
+	filter_mask_.create(wsize*wsize, 1);
+	filter_mask_.toGpuMat().upload(mask);
+}
+
+void StableMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::Stable<127>>({l.data(), filter_mask_.data(), stable_l_.data(), wsize_, wsize_}, l.width, l.height);
+	parallel2D<algorithms::Stable<127>>({r.data(), filter_mask_.data(), stable_r_.data(), wsize_, wsize_}, r.width, r.height);
+}
+
+void StableMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
diff --git a/lib/libstereo/src/costs/stable.hpp b/lib/libstereo/src/costs/stable.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf5ac2b147760c3f93332b80f632a4eea0e7a506
--- /dev/null
+++ b/lib/libstereo/src/costs/stable.hpp
@@ -0,0 +1,37 @@
+#ifndef _FTL_LIBSTEREO_COSTS_STABLE_HPP_
+#define _FTL_LIBSTEREO_COSTS_STABLE_HPP_
+
+#include "census.hpp"
+
+/**
+ * STABLE descriptor matching cost
+ */
+class StableMatchingCost : public DSBase<impl::HammingCost<127>> {
+public:
+	typedef impl::HammingCost<127> DataType;
+	typedef DataType::Type Type;
+
+	StableMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	StableMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			wsize_(0), filter_mask_(0, 0),
+			stable_l_(width*data().WSTEP, height),
+			stable_r_(width*data().WSTEP, height)
+		{
+			data().l = stable_l_.data();
+			data().r = stable_r_.data();
+		}
+
+	void generateFilterMask(const int wsize, const int bits);
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	int wsize_;
+	Array2D<int16_t> filter_mask_;
+	Array2D<uint64_t> stable_l_;
+	Array2D<uint64_t> stable_r_;
+};
+
+#endif
diff --git a/lib/libstereo/src/costs/tcensus.hpp b/lib/libstereo/src/costs/tcensus.hpp
index 48f67397c3d1dad2d8aa05b08b2586dcdd0ae9b2..74000c06b86adcad616199c8f300efc5048a9b8b 100644
--- a/lib/libstereo/src/costs/tcensus.hpp
+++ b/lib/libstereo/src/costs/tcensus.hpp
@@ -18,8 +18,8 @@ public:
 		: DSBase<DataType>(width, height, disp_min, disp_max),
 			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height), t_(0)
 		{
-			data().ct_l = ct_l_.data();
-			data().ct_r = ct_r_.data();
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
 		}
 
 	void set(cv::InputArray l, cv::InputArray r);
diff --git a/lib/libstereo/src/stereo_censussgm.cu b/lib/libstereo/src/stereo_censussgm.cu
deleted file mode 100644
index 40616800e401f1ce7c83dac3befae940e82181a4..0000000000000000000000000000000000000000
--- a/lib/libstereo/src/stereo_censussgm.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <opencv2/core.hpp>
-#include <opencv2/imgproc.hpp>
-
-#include <opencv2/core/cuda/common.hpp>
-#include <opencv2/cudaarithm.hpp>
-
-#include "stereo.hpp"
-
-#include "util_opencv.hpp"
-#include "costs/census.hpp"
-#include "dsi.hpp"
-
-#include "wta.hpp"
-#include "cost_aggregation.hpp"
-#include "aggregations/standard_sgm.hpp"
-
-#include "median_filter.hpp"
-#include "dsi_tools.hpp"
-
-#ifdef __GNUG__
-
-#include <chrono>
-#include <iostream>
-
-static std::chrono::time_point<std::chrono::system_clock> start;
-
-static void timer_set() {
-		start = std::chrono::high_resolution_clock::now();
-}
-
-static void timer_print(const std::string &msg, const bool reset=true) {
-	auto stop = std::chrono::high_resolution_clock::now();
-
-	char buf[24];
-	snprintf(buf, sizeof(buf), "%5i ms  ",
-				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
-
-	std::cout << buf <<  msg << "\n" << std::flush;
-	if (reset) { timer_set(); }
-}
-
-#else
-
-static void timer_set() {}
-static void timer_print(const std::string &msg, const bool reset=true) {}
-
-#endif
-
-using cv::Mat;
-using cv::Size;
-using ftl::stereo::aggregations::StandardSGM;
-
-typedef CensusMatchingCost MatchingCost;
-
-struct StereoCensusSgm::Impl {
-	MatchingCost cost;
-	Array2D<MatchingCost::Type> cost_min_paths;
-	Array2D<MatchingCost::Type> uncertainty;
-	Array2D<uchar> l;
-	Array2D<uchar> r;
-
-	PathAggregator<StandardSGM<MatchingCost::DataType>> aggr;
-	WinnerTakesAll<DisparitySpaceImage<MatchingCost::Type>,float> wta;
-
-	Impl(int width, int height, int min_disp, int max_disp) :
-		cost(width, height, min_disp, max_disp),
-		cost_min_paths(width, height),
-		uncertainty(width, height),
-		l(width, height), r(width, height)
-		{}
-
-};
-
-StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
-	impl_ = new Impl(0, 0, 0, 0);
-}
-
-void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
-	cudaSetDevice(0);
-
-	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
-		delete impl_; impl_ = nullptr;
-		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
-	}
-
-	mat2gray(l, impl_->l);
-	mat2gray(r, impl_->r);
-	timer_set();
-
-	// CT
-	impl_->cost.set(impl_->l, impl_->r);
-
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("census transform"); }
-
-	// cost aggregation
-	StandardSGM<MatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
-	auto &out = impl_->aggr(func, params.paths);
-
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("Aggregation"); }
-
-	impl_->wta(out, params.subpixel, params.lr_consistency);
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("WTA"); }
-
-	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
-	// Semi-global matching: A principled derivation in terms of
-	// message passing. Lecture Notes in Computer Science (Including Subseries
-	// Lecture Notes in Artificial Intelligence and Lecture Notes in
-	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
-
-	#if USE_GPU
-	auto uncertainty = impl_->uncertainty.toGpuMat();
-	cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
-	cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-	impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
-	#else
-	auto uncertainty = impl_->uncertainty.toMat();
-	cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
-	cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-	impl_->wta.disparity.toMat().setTo(0, uncertainty);
-	#endif
-
-	median_filter(impl_->wta.disparity, disparity);
-	if (params.debug) { timer_print("median filter"); }
-
-	Array2D<MatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
-	dsi_slice(out, impl_->wta.disparity, dsitmp_dev);
-	show_dsi_slice(dsitmp_dev.toGpuMat());
-}
-
-StereoCensusSgm::~StereoCensusSgm() {
-	if (impl_) {
-		delete impl_;
-		impl_ = nullptr;
-	}
-}
diff --git a/lib/libstereo/src/stereo_tcensussgm.cu b/lib/libstereo/src/stereo_tcensussgm.cu
deleted file mode 100644
index 9d95d4ec6ffc0438d3c06e1f599dface3172e8f7..0000000000000000000000000000000000000000
--- a/lib/libstereo/src/stereo_tcensussgm.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-#include <opencv2/core.hpp>
-#include <opencv2/imgproc.hpp>
-
-#include <opencv2/core/cuda/common.hpp>
-#include <opencv2/cudaarithm.hpp>
-
-#include "stereo.hpp"
-
-#include "util_opencv.hpp"
-#include "costs/tcensus.hpp"
-#include "dsi.hpp"
-
-#include "wta.hpp"
-#include "cost_aggregation.hpp"
-#include "aggregations/standard_sgm.hpp"
-
-#include "median_filter.hpp"
-
-#ifdef __GNUG__
-
-#include <chrono>
-#include <iostream>
-
-static std::chrono::time_point<std::chrono::system_clock> start;
-
-static void timer_set() {
-		start = std::chrono::high_resolution_clock::now();
-}
-
-static void timer_print(const std::string &msg, const bool reset=true) {
-	auto stop = std::chrono::high_resolution_clock::now();
-
-	char buf[24];
-	snprintf(buf, sizeof(buf), "%5i ms  ",
-				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
-
-	std::cout << buf <<  msg << "\n" << std::flush;
-	if (reset) { timer_set(); }
-}
-
-#else
-
-static void timer_set() {}
-static void timer_print(const std::string &msg, const bool reset=true) {}
-
-#endif
-
-using cv::Mat;
-using cv::Size;
-using ftl::stereo::aggregations::StandardSGM;
-
-struct StereoTCensusSgm::Impl {
-	//DisparitySpaceImage<unsigned short> dsi;
-	TCensusMatchingCost cost;
-	Array2D<unsigned short> cost_min_paths;
-	Array2D<unsigned short> uncertainty;
-	Array2D<float> disparity_r;
-	Array2D<uchar> l;
-	Array2D<uchar> r;
-
-	PathAggregator<StandardSGM<TCensusMatchingCost::DataType>> aggr;
-	WinnerTakesAll<DSImage16U,float> wta;
-
-	Impl(int width, int height, int min_disp, int max_disp) :
-		cost(width, height, min_disp, max_disp),
-		cost_min_paths(width, height),
-		uncertainty(width, height),
-		disparity_r(width, height), l(width, height), r(width, height) {}
-
-};
-
-StereoTCensusSgm::StereoTCensusSgm() : impl_(nullptr) {
-	impl_ = new Impl(0, 0, 0, 0);
-}
-
-void StereoTCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
-	cudaSetDevice(0);
-
-	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
-		delete impl_; impl_ = nullptr;
-		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
-	}
-
-	mat2gray(l, impl_->l);
-	mat2gray(r, impl_->r);
-	timer_set();
-
-	// CT
-	impl_->cost.setT(params.t);
-	impl_->cost.set(impl_->l, impl_->r);
-
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("census transform"); }
-
-	// cost aggregation
-	StandardSGM<TCensusMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
-	auto &out = impl_->aggr(func, params.paths);
-
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("Aggregation"); }
-
-	impl_->wta(out, params.subpixel, params.lr_consistency);
-	cudaSafeCall(cudaDeviceSynchronize());
-	if (params.debug) { timer_print("WTA"); }
-
-	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
-	// Semi-global matching: A principled derivation in terms of
-	// message passing. Lecture Notes in Computer Science (Including Subseries
-	// Lecture Notes in Artificial Intelligence and Lecture Notes in
-	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
-
-	if (disparity.isGpuMat()) {
-		auto uncertainty = impl_->uncertainty.toGpuMat();
-		cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
-		cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-		impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
-	}
-	else {
-		auto uncertainty = impl_->uncertainty.toMat();
-		cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
-		cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
-		impl_->wta.disparity.toMat().setTo(0, uncertainty);
-	}
-
-	median_filter(impl_->wta.disparity, disparity);
-	if (params.debug) { timer_print("median filter"); }
-}
-
-StereoTCensusSgm::~StereoTCensusSgm() {
-	if (impl_) {
-		delete impl_;
-		impl_ = nullptr;
-	}
-}