diff --git a/lib/libstereo/src/aggregations/standard_sgm.hpp b/lib/libstereo/src/aggregations/standard_sgm.hpp
index e05d06f2e793506fb5e72aa7b35f1f11550db381..2830351bcf327efda973a854029f63cec5c59bb4 100644
--- a/lib/libstereo/src/aggregations/standard_sgm.hpp
+++ b/lib/libstereo/src/aggregations/standard_sgm.hpp
@@ -2,6 +2,7 @@
 #define _FTL_LIBSTEREO_AGGREGATIONS_STANDARD_HPP_
 
 #include "../dsi.hpp"
+#include "../array2d.hpp"
 
 namespace ftl {
 namespace stereo {
@@ -14,6 +15,8 @@ struct StandardSGM {
 
 	// Provided externally
 	const DSIIN in;
+	typename Array2D<costtype_t>::Data min_cost_all;
+
 	const int P1;
 	const int P2;
 
@@ -102,9 +105,12 @@ struct StandardSGM {
 		// Each thread then obtains thread global minimum
 		#ifdef __CUDA_ARCH__
 		min_cost = warpMin(min_cost);
+		#else
+		// TODO: add an assert for this branch (compiled when __CUDA_ARCH__ is not defined)
 		#endif
 
 		data.previous_cost_min = min_cost;
+		min_cost_all(pixel.y,pixel.x) += min_cost; // TODO: verify whether this accumulation needs to be atomic
 
 		// Swap current and previous cost buffers
 		costtype_t *tmp_ptr = const_cast<costtype_t *>(data.previous);
diff --git a/lib/libstereo/src/stereo_adcensussgm.cu b/lib/libstereo/src/stereo_adcensussgm.cu
index 3baecec535598f673b20c5c201934f52732638d5..ddb33d34d0d710755b64aa812ef45daf5cf22944 100644
--- a/lib/libstereo/src/stereo_adcensussgm.cu
+++ b/lib/libstereo/src/stereo_adcensussgm.cu
@@ -47,9 +47,6 @@ using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::StandardSGM;
 
-static int ct_windows_w = 9;
-static int ct_windows_h = 7;
-
 struct StereoADCensusSgm::Impl {
 	DisparitySpaceImage<unsigned short> dsi;
 	AbsDiffBT ad_cost;
@@ -108,7 +105,7 @@ void StereoADCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputAr
 	if (params.debug) { timer_print("census transform"); }
 
 	// cost aggregation
-	StandardSGM<DualCosts<AbsDiffBT,CensusMatchingCost>::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	StandardSGM<DualCosts<AbsDiffBT,CensusMatchingCost>::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out = impl_->aggr(func, params.paths);
 
 	cudaSafeCall(cudaDeviceSynchronize());
diff --git a/lib/libstereo/src/stereo_adsgm.cu b/lib/libstereo/src/stereo_adsgm.cu
index a79abcf346104a2ec3f5e01ef737d1aaa043a3c6..736ec2a4b95024955748d4d967308a8712acf24c 100644
--- a/lib/libstereo/src/stereo_adsgm.cu
+++ b/lib/libstereo/src/stereo_adsgm.cu
@@ -45,9 +45,6 @@ using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::StandardSGM;
 
-static int ct_windows_w = 9;
-static int ct_windows_h = 7;
-
 struct StereoADSgm::Impl {
 	DisparitySpaceImage<unsigned short> dsi;
 	AbsDiffBT cost;
@@ -99,7 +96,7 @@ void StereoADSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray di
 	if (params.debug) { timer_print("census transform"); }
 
 	// cost aggregation
-	StandardSGM<AbsDiffBT::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	StandardSGM<AbsDiffBT::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out = impl_->aggr(func, params.paths);
 
 	cudaSafeCall(cudaDeviceSynchronize());
diff --git a/lib/libstereo/src/stereo_censussgm.cu b/lib/libstereo/src/stereo_censussgm.cu
index 3712a87fa7c267b23cc22c997d749be5f67f7065..be11a5dc0503f21cc3164e03c38f1868a8f90f93 100644
--- a/lib/libstereo/src/stereo_censussgm.cu
+++ b/lib/libstereo/src/stereo_censussgm.cu
@@ -1,6 +1,8 @@
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
+
 #include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
 
 #include "stereo.hpp"
 
@@ -47,15 +49,11 @@ using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::StandardSGM;
 
-static int ct_windows_w = 9;
-static int ct_windows_h = 7;
-
 struct StereoCensusSgm::Impl {
 	//DisparitySpaceImage<unsigned short> dsi;
 	CensusMatchingCost cost;
 	Array2D<unsigned short> cost_min;
 	Array2D<unsigned short> cost_min_paths;
-	Array2D<unsigned short> uncertainty;
 	Array2D<float> confidence;
 	Array2D<float> disparity_r;
 	Array2D<uchar> l;
@@ -68,7 +66,6 @@ struct StereoCensusSgm::Impl {
 		cost(width, height, min_disp, max_disp),
 		cost_min(width, height),
 		cost_min_paths(width, height),
-		uncertainty(width, height),
 		confidence(width, height),
 		disparity_r(width, height), l(width, height), r(width, height) {}
 
@@ -86,9 +83,6 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
 		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
 	}
 
-	//impl_->dsi.clear();
-	impl_->uncertainty.toMat().setTo(0);
-
 	mat2gray(l, impl_->l);
 	mat2gray(r, impl_->r);
 	timer_set();
@@ -100,7 +94,7 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
 	if (params.debug) { timer_print("census transform"); }
 
 	// cost aggregation
-	StandardSGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	StandardSGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out = impl_->aggr(func, params.paths);
 
 	cudaSafeCall(cudaDeviceSynchronize());
@@ -120,11 +114,14 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
 	// message passing. Lecture Notes in Computer Science (Including Subseries
 	// Lecture Notes in Artificial Intelligence and Lecture Notes in
 	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
-	//cv::Mat uncertainty;
-	//uncertainty = impl_->cost_min.toMat() - impl_->cost_min_paths.toMat();
-	// confidence threshold
-	// TODO: estimate confidence from uncertainty and plot ROC curve.
-	//disparity.setTo(0.0f, uncertainty > params.uniqueness);
+
+	if (disparity.isGpuMat()) {
+		// TODO: extract cost_min in WTA
+		cv::cuda::GpuMat uncertainty;
+		cv::cuda::subtract(impl_->cost_min.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+		cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+		disparity.getGpuMatRef().setTo(0, uncertainty);
+	}
 }
 
 StereoCensusSgm::~StereoCensusSgm() {
diff --git a/lib/libstereo/src/stereo_gradientstree.cu b/lib/libstereo/src/stereo_gradientstree.cu
index 74ec61ed8b21628c6297c1420fe3fd8cb288a7cc..272d6f4b2dee0d38e3f751aaa593c8057fdf2120 100644
--- a/lib/libstereo/src/stereo_gradientstree.cu
+++ b/lib/libstereo/src/stereo_gradientstree.cu
@@ -45,9 +45,6 @@ using cv::Mat;
 using cv::Size;
 using ftl::stereo::aggregations::StandardSGM;
 
-static int ct_windows_w = 9;
-static int ct_windows_h = 7;
-
 struct StereoGradientStree::Impl {
 	GradientMatchingCostL2 cost;
 	Array2D<unsigned short> cost_min;
@@ -95,13 +92,13 @@ void StereoGradientStree::compute(cv::InputArray l, cv::InputArray r, cv::Output
 	cudaSafeCall(cudaDeviceSynchronize());
 
 	//AggregationParameters aggr_params = {impl_->cost_min_paths.data(), params};
-	StandardSGM<GradientMatchingCostL2::DataType> func1 = {impl_->cost.data(), params.P1, params.P2};
+	StandardSGM<GradientMatchingCostL2::DataType> func1 = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out1 = impl_->aggr1(func1, AggregationDirections::HORIZONTAL);
 
 	cudaSafeCall(cudaDeviceSynchronize());
 	if (params.debug) { timer_print("Aggregation 1"); }
 
-	StandardSGM<DisparitySpaceImage<unsigned short>::DataType> func2 = { out1.data(), params.P1, params.P2};
+	StandardSGM<DisparitySpaceImage<unsigned short>::DataType> func2 = {out1.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out2 = impl_->aggr2(func2, AggregationDirections::VERTICAL);
 
 	cudaSafeCall(cudaDeviceSynchronize());
diff --git a/lib/libstereo/src/stereo_misgm.cu b/lib/libstereo/src/stereo_misgm.cu
index 6c54cce0caa8195551577f3db7233cb9b25ddc60..ccc6005d9de566b1e1e19dacfe73042274258d01 100644
--- a/lib/libstereo/src/stereo_misgm.cu
+++ b/lib/libstereo/src/stereo_misgm.cu
@@ -122,7 +122,7 @@ void StereoMiSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray di
 	cudaSafeCall(cudaDeviceSynchronize());
 	// cost aggregation
 	//AggregationParameters aggr_params = {impl_->cost_min_paths.data(), params};
-	StandardSGM<MutualInformationMatchingCost::DataType> func = {impl_->cost.data(), params.P1, params.P2};
+	StandardSGM<MutualInformationMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
 	auto &out = impl_->aggr(func, AggregationDirections::ALL);  // params.paths
 
 	cudaSafeCall(cudaDeviceSynchronize());