diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index bdd9a91cc2e247929b09a95206db659615d643a2..f781444a5d80024579d2fc17c699df157d6c20a9 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -33,6 +33,7 @@ if (LIBSTEREO_SHARED)
     add_library(libstereo SHARED
                 src/stereo_gradientstree.cu
                 src/stereo_misgm.cu
+                src/stereo_misgm2.cu
                 src/stereo_censussgm.cu
                 src/stereo_sgmp.cpp
                 src/costs/census.cu
@@ -47,6 +48,7 @@ else()
     add_library(libstereo
                 src/stereo_gradientstree.cu
                 src/stereo_misgm.cu
+                src/stereo_misgm2.cu
                 src/stereo_censussgm.cu
                 src/stereo_adcensussgm.cu
                 src/stereo_adsgm.cu
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index 48d58f9a0ddc34bafb1c4fa5ddd799605aed11eb..e22be88228115e2627879527a4a82ef5d8cad0e1 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -9,6 +9,7 @@ public:
 	~StereoADCensusSgm();
 
 	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
 
 	struct Parameters {
 		int d_min = 0;
@@ -35,6 +36,7 @@ public:
 	~StereoADSgm();
 
 	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
 
 	struct Parameters {
 		int d_min = 0;
@@ -61,6 +63,7 @@ public:
 	~StereoCensusSgm();
 
 	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
 
 	struct Parameters {
 		int d_min = 0;
@@ -108,7 +111,13 @@ private:
 	Impl *impl_;
 };
 
-
+/**
+ * Work in progress
+ *
+ * Mutual Information and Census cost with SGM. Cost calculation uses intensity
+ * variance to weight MI and Census costs (low weight for census cost on
+ * low variance areas). Window size for local variance based on census mask size.
+ */
 class StereoMiSgm2 {
 public:
 	StereoMiSgm2();
@@ -127,9 +136,10 @@ public:
 		int paths = AggregationDirections::HORIZONTAL |
 					AggregationDirections::VERTICAL |
 					AggregationDirections::DIAGONAL;
+
+		float alpha = 0.2; /** alpha: minimum weight for census cost */
+		float beta = 0.8; /** 1-beta: minimum weight for MI cost */
 		bool debug = false;
-		float w1 = 1.0;
-		float w2 = 1.0;
 	};
 	Parameters params;
 
@@ -175,6 +185,7 @@ public:
 	~StereoGradientStree();
 
 	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
 
 	struct Parameters {
 		int d_min = 0;
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
index 6821cacef28e9f6a35126eae4601546f54a40a98..66f407e53069592a4141fdb059c26f1b0386c234 100644
--- a/lib/libstereo/middlebury/main.cpp
+++ b/lib/libstereo/middlebury/main.cpp
@@ -21,8 +21,8 @@ void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
 	disp.convertTo(dispf, CV_32FC1);
 	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
 
-	//cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
-	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+	cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
+	//cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -63,17 +63,17 @@ int main(int argc, char* argv[]) {
 	int ndisp = calib.vmax - calib.vmin;
 
 	auto stereo = StereoCensusSgm();
-	stereo.params.P1 = 7;
-	stereo.params.P2 = 60;
+	stereo.params.P1 = 4;
+	stereo.params.P2 = 25;
 
 	stereo.params.d_min = calib.vmin;
 	stereo.params.d_max = calib.vmax;
 	stereo.params.subpixel = 1;
 	stereo.params.debug = true;
 	//stereo.params.paths = AggregationDirections::ALL;
-	//stereo.params.uniqueness = 200;
+	stereo.params.uniqueness = 40;
 
-	int i_max = 1;
+	int i_max = 5;
 	float t = 4.0f;
 
 	if (imL.empty() || imR.empty() || gtL.empty()) {
@@ -87,8 +87,6 @@ int main(int argc, char* argv[]) {
 	}
 
 	Mat disp(imL.size(), CV_32FC1, cv::Scalar(0.0f));
-	//stereo.setPrior(gtL);
-
 	std::cout << "resolution: " << imL.cols << "x" << imL.rows << ", " << ndisp << " disparities [" << calib.vmin << "," << calib.vmax << "]\n";
 
 	std::vector<cv::Mat> disp_color;
@@ -102,9 +100,8 @@ int main(int argc, char* argv[]) {
 		cv::cuda::GpuMat gpu_disp(disp);
 		stereo.compute(gpu_imL, gpu_imR, gpu_disp);
 		gpu_disp.download(disp);
+		stereo.setPrior(disp);
 
-		//stereo.compute(imL, imR, disp);
-		//stereo.setPrior(disp);
 		auto stop = std::chrono::high_resolution_clock::now();
 		std::cout	<< "disparity complete in "
 					<< std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count()
@@ -153,7 +150,7 @@ int main(int argc, char* argv[]) {
 	Mat gt_color;
 	colorize(gtL, gt_color, calib.ndisp);
 	cv::imshow("gt", gt_color);
-	cv::waitKey();
+	while(cv::waitKey() != 27);
 
 	return 0;
 }
diff --git a/lib/libstereo/src/costs/dual.hpp b/lib/libstereo/src/costs/dual.hpp
index 07bc49de96ec46059df4924e58b3367d4f1c6547..3cbeb7eeb85d702762c309eaf16862dcd8888f4d 100644
--- a/lib/libstereo/src/costs/dual.hpp
+++ b/lib/libstereo/src/costs/dual.hpp
@@ -17,23 +17,40 @@ namespace impl {
 			cost_a(w,h,dmin,dmax), cost_b(w,h,dmin,dmax) {}
 
 		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
-			return cost_a(y,x,d)*wa + cost_b(y,x,d)*wb;
+			return cost_a(y,x,d) + cost_b(y,x,d);
 		}
 
 		A cost_a;
 		B cost_b;
-		float wa = 1.0;
-		float wb = 1.0;
+	};
+
+	template <typename A, typename B>
+	struct DualCostsWeighted : DSImplBase<typename A::Type> {
+		static_assert(std::is_same<typename A::Type, typename B::Type>::value, "Cost types must be the same");
+		typedef unsigned short Type;
+
+		DualCostsWeighted(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<unsigned short>({w,h,dmin,dmax}),
+			cost_a(w,h,dmin,dmax), cost_b(w,h,dmin,dmax) {}
+
+		__host__ __device__ inline unsigned short operator()(const int y, const int x, const int d) const {
+			const float w = weights_l(y,x);
+			return cost_a(y,x,d)*w + cost_b(y,x,d)*(1.0f - w);
+		}
+
+		A cost_a;
+		B cost_b;
+		Array2D<float>::Data weights_l;
+		Array2D<float>::Data weights_r;
 	};
 }
 
+
 template <typename A, typename B>
 class DualCosts : public DSBase<impl::DualCosts<typename A::DataType,typename B::DataType>> {
 public:
 	typedef impl::DualCosts<typename A::DataType,typename B::DataType> DataType;
 	typedef unsigned short Type;
 
-	//DualCosts() : DSBase<DataType>(0, 0, 0, 0) {};
 	DualCosts(int width, int height, int disp_min, int disp_max, A &a, B &b)
 		: DSBase<DataType>(width, height, disp_min, disp_max),
 			cost_a(a), cost_b(b) {};
@@ -43,9 +60,34 @@ public:
 		this->data().cost_b = cost_b.data();
 	}
 
-	void setWeights(float a, float b) {
-		this->data().wa = a;
-		this->data().wb = b;
+	static constexpr short COST_MAX = A::COST_MAX + B::COST_MAX;
+
+protected:
+	A &cost_a;
+	B &cost_b;
+};
+
+/**
+ * Combine two cost functions with weight. Weight in Array2D<float> same size
+ * as input. Weight values assumed in range [0,1] and cost is calculated as
+ *
+ *    a(y,x)*w(y,x) + b(y,x)*(1.0-w(y,x))
+ */
+template <typename A, typename B>
+class DualCostsWeighted : public DSBase<impl::DualCostsWeighted<typename A::DataType,typename B::DataType>> {
+public:
+	typedef impl::DualCostsWeighted<typename A::DataType,typename B::DataType> DataType;
+	typedef unsigned short Type;
+
+	DualCostsWeighted(int width, int height, int disp_min, int disp_max, A &a, B &b, Array2D<float> &weightsl, Array2D<float> &weightsr)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			cost_a(a), cost_b(b), weights_l(weightsl), weights_r(weightsr) {};
+
+	void set() {
+		this->data().cost_a = cost_a.data();
+		this->data().cost_b = cost_b.data();
+		this->data().weights_l = weights_l.data();
+		this->data().weights_r = weights_r.data();
 	}
 
 	static constexpr short COST_MAX = A::COST_MAX + B::COST_MAX;
@@ -53,6 +95,8 @@ public:
 protected:
 	A &cost_a;
 	B &cost_b;
+	Array2D<float> &weights_l;
+	Array2D<float> &weights_r;
 };
 
 #endif
diff --git a/lib/libstereo/src/stereo_misgm2.cu b/lib/libstereo/src/stereo_misgm2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e23ab955b9c2733a25f2c65f9b8ee45ec8c7df6d
--- /dev/null
+++ b/lib/libstereo/src/stereo_misgm2.cu
@@ -0,0 +1,201 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/cudafilters.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/mutual_information.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+#include "median_filter.hpp"
+
+#include "costs/dual.hpp"
+#include "costs/census.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize=3) {
+	if (in.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+
+		if (in.type() != CV_32FC1) {
+			in.getGpuMat().convertTo(im, CV_32FC1);
+		}
+		else {
+			im = in.getGpuMat();
+		}
+		std::cout << im.type() << "\n";
+		std::cout << mean.type() << "\n";
+		cv::cuda::multiply(im, im, im2);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean);   // E[X]
+		filter->apply(im2, mean2); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean); // (E[X])^2
+
+		// NOTE: floating point accuracy in subtraction
+		// (cv::cuda::createBoxFilter only supports 8 bit integer types)
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef()); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+struct StereoMiSgm2::Impl {
+	CensusMatchingCost census;
+	MutualInformationMatchingCost mi;
+	Array2D<float> variance;
+	Array2D<float> variance_r;
+	DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost> cost;
+
+	Array2D<unsigned short> cost_min;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> confidence;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Mat prior; // used only to calculate MI
+
+	PathAggregator<StandardSGM<DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost>::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		census(width, height, min_disp, max_disp),
+		mi(width, height, min_disp, max_disp),
+		variance(width, height),
+		variance_r(width, height),
+		cost(width, height, min_disp, max_disp, census, mi, variance, variance_r),
+		cost_min(width, height),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		confidence(width, height),
+		disparity_r(width, height),
+		l(width, height), r(width, height) {}
+};
+
+StereoMiSgm2::StereoMiSgm2() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoMiSgm2::setPrior(cv::InputArray prior) {
+	if (prior.rows() != impl_->cost.height() || prior.cols() != impl_->cost.width()) {
+		return;
+	}
+	if (prior.isGpuMat()) {
+		prior.getGpuMat().download(impl_->prior);
+	}
+	else {
+		prior.getMat().copyTo(impl_->prior);
+	}
+}
+
+#include <opencv2/highgui.hpp>
+
+void StereoMiSgm2::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	if (impl_->prior.empty() || impl_->prior.size() != l.size()) {
+		// if prior disparity is missing, use random values from uniform
+		// distribution
+		impl_->prior.create(l.rows(), l.cols(), CV_32FC1);
+		cv::randu(impl_->prior, params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	timer_set();
+
+	cv::cuda::GpuMat var_l = impl_->variance.toGpuMat();
+	variance_mask(impl_->l.toGpuMat(), var_l, 9);
+	cv::cuda::GpuMat var_r = impl_->variance_r.toGpuMat();
+	variance_mask(impl_->r.toGpuMat(), var_r, 9);
+
+	cv::cuda::normalize(var_l, var_l, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+	cv::cuda::normalize(var_r, var_r, params.alpha, params.beta, cv::NORM_MINMAX, -1);
+
+	if (l.isMat()) {
+		impl_->mi.set(l.getMat(), r.getMat(), impl_->prior);
+	}
+	else if (l.isGpuMat()) {
+		Mat l_;
+		Mat r_;
+		l.getGpuMat().download(l_);
+		r.getGpuMat().download(r_);
+		impl_->mi.set(l_, r_, impl_->prior);
+	}
+	impl_->census.set( impl_->l, impl_->r);
+	impl_->cost.set();
+	if (params.debug) { timer_print("Matching cost"); }
+
+	cudaSafeCall(cudaDeviceSynchronize());
+
+	StandardSGM<DualCostsWeighted<CensusMatchingCost, MutualInformationMatchingCost>::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, AggregationDirections::ALL);  // params.paths
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	median_filter(impl_->wta.disparity, disparity);
+	if (params.debug) { timer_print("median filter"); }
+}
+
+StereoMiSgm2::~StereoMiSgm2() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}