diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index 05ce7cc819a385a54732c3020125d40d5988f95a..f1708e4f176c436e10d86c68c23442610df2f026 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -39,6 +39,7 @@ if (LIBSTEREO_SHARED)
                 src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
                 src/algorithms/censussgm.cu
+                src/algorithms/excensussgm.cu
                 src/algorithms/stablesgm.cu
                 src/algorithms/tcensussgm.cu
                 src/algorithms/hcensussgm.cu
@@ -73,6 +74,7 @@ else()
                 src/stereo_gtsgm.cu
                 src/stereo_misgm.cu
                 src/algorithms/censussgm.cu
+                src/algorithms/excensussgm.cu
                 src/algorithms/stablesgm.cu
                 src/algorithms/tcensussgm.cu
                 src/algorithms/hcensussgm.cu
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index 4efde8d5c056709f00ca7a46b823308efe8f08fb..0159991a60f1fc1b6869ba739c75bf35cb211d8b 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -86,6 +86,35 @@ private:
 	Impl *impl_;
 };
 
+class StereoExCensusSgm {
+public:
+	StereoExCensusSgm();
+	~StereoExCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+		CensusPattern pattern = CensusPattern::STANDARD;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
 /**
  * Mean reference pixel. Doesn't work since detail is lost form reference
  *
diff --git a/lib/libstereo/middlebury/algorithms.hpp b/lib/libstereo/middlebury/algorithms.hpp
index b89f798a9358e88b409a747553d833b302faf488..2b73615507ed92a866d7d4b6d8839934657e89ae 100644
--- a/lib/libstereo/middlebury/algorithms.hpp
+++ b/lib/libstereo/middlebury/algorithms.hpp
@@ -32,6 +32,23 @@ namespace Impl {
 		}
 	};
 
+	struct ECensusSGM : public Algorithm {
+		ECensusSGM() { P1 = 4.0f; P2 = 80.0f; }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoExCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
 	struct GCensusSGM : public Algorithm {
 		GCensusSGM() { P1 = 12.0f; P2 = 32.0f; }
 
@@ -286,6 +303,7 @@ static const std::map<std::string, Algorithm*> algorithms = {
 	{ "censussgm", new Impl::CensusSGM() },
 	{ "mcensussgm", new Impl::MeanCensusSGM() },
 	{ "gcensussgm", new Impl::GCensusSGM() },
+	{ "ecensussgm", new Impl::ECensusSGM() },
 	{ "stablesgm", new Impl::StableSGM() },
 	{ "hstablesgm", new Impl::HStableSGM() },
 	{ "brefcensus", new Impl::BRefCensusSGM() },
diff --git a/lib/libstereo/src/algorithms/excensussgm.cu b/lib/libstereo/src/algorithms/excensussgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..78e03617be1f21b884cd3a6906d34899dcdd15d1
--- /dev/null
+++ b/lib/libstereo/src/algorithms/excensussgm.cu
@@ -0,0 +1,44 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/census.hpp"
+
+struct StereoExCensusSgm::Impl : public StereoSgm<ExpandingCensusMatchingCost, StereoExCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoExCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoExCensusSgm::StereoExCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoExCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.setPattern(params.pattern);
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+    impl_->compute(disparity);
+    
+    Array2D<ExpandingCensusMatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(impl_->cost, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
+}
+
+StereoExCensusSgm::~StereoExCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/algorithms/hcensussgm.cu b/lib/libstereo/src/algorithms/hcensussgm.cu
index 7511476781ee74ee5cddba6c03cb5e0b35fab2ef..7cd4c00d799f339a1bdbcadf216661b95ff6b663 100644
--- a/lib/libstereo/src/algorithms/hcensussgm.cu
+++ b/lib/libstereo/src/algorithms/hcensussgm.cu
@@ -116,7 +116,11 @@ void StereoHierCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::Output
     impl_->cost_medium.set(impl_->l, impl_->r, medium_l, medium_r);
     impl_->cost_coarse.set(impl_->l, impl_->r, coarse_l, coarse_r);
     impl_->cost.set();
-	impl_->compute(disparity);
+    impl_->compute(disparity);
+    
+    Array2D<ExpandingCensusMatchingCost::Type> dsitmp_dev(l.cols(), l.rows());
+	dsi_slice(impl_->cost, impl_->wta.disparity, dsitmp_dev);
+	show_dsi_slice(dsitmp_dev.toGpuMat());
 }
 
 StereoHierCensusSgm::~StereoHierCensusSgm() {
diff --git a/lib/libstereo/src/costs/census.cu b/lib/libstereo/src/costs/census.cu
index 6c499c618475d138445dee8c4359324b9e746f8f..4cb39194208296de05ef6b0a6f1cd86c0f1bbbd1 100644
--- a/lib/libstereo/src/costs/census.cu
+++ b/lib/libstereo/src/costs/census.cu
@@ -7,6 +7,9 @@
 
 #include <cuda_runtime.h>
 
+template <typename T>
+__host__ __device__ inline T square(T v) { return v*v; }
+
 namespace algorithms {
 	/** Census transform. Bits stored in row major order. */
 	template<int WINX, int WINY>
@@ -47,6 +50,65 @@ namespace algorithms {
 		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
 	};
 
+	/** Census transform. Bits stored in row major order. This variant uses a
+	 *  nested window that expands with distance from center, where the
+	 *  absolute max difference to center is used from that window. */
+	template<int WINX, int WINY>
+	struct ExCensusTransformRowMajor {
+		__host__ __device__ inline short subwindow(const int x, const int y, int wx, int wy, short center, ushort2 size) {
+			short value = center;
+
+			int nwx = 3*wx; //int(float(wx)*scale);
+			int nwy = 3*wy; //int(float(wy)*scale);
+			int wsize = max(abs(wx),abs(wy));
+			if (x == 100 && y == 100) printf("Ex shape: %d, %d\n",nwx,nwy);
+			for (int dy=-abs(wy); dy<=abs(wy); ++dy) {
+			for (int dx=-abs(wx); dx<=abs(wx); ++dx) {
+				auto v = im(min(size.y, max(0,y+nwy+dy)), min(size.x, max(0,x+nwx+dx)));
+				if (abs(center-v) > abs(center-value)) value = v;
+			}
+			}
+
+			return value;
+		}
+
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out, ushort2 size) {
+			short center = im(y, x);
+			uint8_t i = 0; // bit counter for *out
+
+			for (int wy = -WINY/2; wy <= WINY/2; wy++) {
+				for (int wx = -WINX/2; wx <= WINX/2; wx++) {
+					const int y_ = y + wy;
+					const int x_ = x + wx;
+
+					// zero if first value, otherwise shift to left
+					if (i % 64 == 0) { *out = 0; }
+					else             { *out = (*out << 1); }
+					*out |= (center < (subwindow(x,y,wx,wy,center,size)) ? 1 : 0);
+
+					i += 1;
+					// if all bits set, continue to next element
+					if (i % 64 == 0) { out++; }
+				}
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)), size);
+				}
+			}
+		}
+
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+		float scale;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+
 	/* W. S. Fife and J. K. Archibald,
 	   "Improved Census Transforms for Resource-Optimized Stereo Vision,"
 	  in IEEE Transactions on Circuits and Systems for Video Technology,
@@ -224,6 +286,39 @@ void CensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ExpandingCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+	parallel2D<algorithms::ExCensusTransformRowMajor<9,7>>({l.data(), ct_l_.data(),1.5f}, l.width, l.height);
+	parallel2D<algorithms::ExCensusTransformRowMajor<9,7>>({r.data(), ct_r_.data(),1.5f}, r.width, r.height);
+}
+
+/*void ExpandingCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr) {
+	parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({l.data(), hl.data(), ct_l_.data(), {ushort(hl.width), ushort(hl.height)}}, l.width, l.height);
+	parallel2D<algorithms::HCensusTransformRowMajor<9,7>>({r.data(), hr.data(), ct_r_.data(), {ushort(hr.width), ushort(hr.height)}}, r.width, r.height);
+}*/
+
+void ExpandingCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
 void MiniCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
 	if (pattern_ == CensusPattern::STANDARD) {
 		parallel2D<algorithms::CensusTransformRowMajor<5,3>>({l.data(), ct_l_.data()}, l.width, l.height);
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
index 3b05156a4565b2dd6979803f749d9d403bbc4158..af569b69d3f47fc1c443061c846c6d4bf1e7e204 100644
--- a/lib/libstereo/src/costs/census.hpp
+++ b/lib/libstereo/src/costs/census.hpp
@@ -203,6 +203,33 @@ protected:
 	CensusPattern pattern_ = CensusPattern::STANDARD;
 };
 
+class ExpandingCensusMatchingCost : public DSBase<impl::CensusMatchingCost<9,7,1>> {
+public:
+	typedef impl::CensusMatchingCost<9,7,1> DataType;
+	typedef unsigned short Type;
+
+	ExpandingCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	ExpandingCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
+		{
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
+		}
+
+	inline void setPattern(CensusPattern p) { pattern_ = p; }
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	//void set(const Array2D<uchar> &l, const Array2D<uchar> &r, const Array2D<uchar> &hl, const Array2D<uchar> &hr);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	CensusPattern pattern_ = CensusPattern::STANDARD;
+};
+
 class WeightedCensusMatchingCost : public DSBase<impl::WeightedCensusMatchingCost<11, 5>> {
 public:
 	typedef impl::WeightedCensusMatchingCost<11, 5> DataType;