From c6c54391883ad044c50601e345a88892e2856cca Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Tue, 21 Apr 2020 11:16:24 +0300
Subject: [PATCH] Add an adaptive penalty path aggregation

---
 lib/libstereo/CMakeLists.txt                  |   5 +
 lib/libstereo/include/stereo.hpp              |  26 +++
 lib/libstereo/include/stereo_types.hpp        |   2 +-
 lib/libstereo/middlebury/main.cpp             |  17 +-
 .../src/aggregations/adaptive_penalty.hpp     | 129 +++++++++++++++
 lib/libstereo/src/array2d.hpp                 |   8 +
 lib/libstereo/src/cost_aggregation.hpp        |  46 ++++--
 lib/libstereo/src/costs/absolute_diff.hpp     |   2 +
 lib/libstereo/src/costs/census.hpp            |   2 +
 lib/libstereo/src/costs/dual.hpp              |   2 +-
 lib/libstereo/src/costs/gradient.hpp          |   2 +
 .../src/costs/mutual_information.hpp          |   2 +
 lib/libstereo/src/dsbase.hpp                  |   2 +-
 lib/libstereo/src/stereo_census_adaptive.cu   | 154 ++++++++++++++++++
 lib/libstereo/src/stereo_gradientstree.cu     |   4 +-
 15 files changed, 376 insertions(+), 27 deletions(-)
 create mode 100644 lib/libstereo/src/aggregations/adaptive_penalty.hpp
 create mode 100644 lib/libstereo/src/stereo_census_adaptive.cu

diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index f781444a5..175a3c670 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -5,6 +5,7 @@ project(libstereo)
 option(BUILD_MIDDLEBURY     "Build Middlebury evaluation" OFF)
 option(BUILD_TESTS          "Build unit tests" ON)
 option(LIBSTEREO_SHARED     "Build a shared library" OFF)
+option(USE_GPU              "Use the GPU instead of CPU" ON)
 
 find_package(OpenCV REQUIRED)
 find_package(OpenMP REQUIRED)
@@ -21,6 +22,9 @@ set(CMAKE_CXX_FLAGS_RELEASE)
 if (CMAKE_COMPILER_IS_GNUCXX)
     set(CMAKE_CUDA_FLAGS "--gpu-architecture=compute_61 -std=c++14 -Xcompiler -fPIC -Xcompiler ${OpenMP_CXX_FLAGS}")
     set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
+else()
+    set(CMAKE_CUDA_FLAGS "--gpu-architecture=compute_61 -std=c++14")
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
 endif()
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@@ -50,6 +54,7 @@ else()
                 src/stereo_misgm.cu
                 src/stereo_misgm2.cu
                 src/stereo_censussgm.cu
+                src/stereo_census_adaptive.cu
                 src/stereo_adcensussgm.cu
                 src/stereo_adsgm.cu
                 #src/stereo_sgmp.cpp
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index e22be8822..d210d3688 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -84,6 +84,32 @@ private:
 	Impl *impl_;
 };
 
+class StereoCensusAdaptive {
+public:
+	StereoCensusAdaptive();
+	~StereoCensusAdaptive();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
 class StereoMiSgm {
 public:
 	StereoMiSgm();
diff --git a/lib/libstereo/include/stereo_types.hpp b/lib/libstereo/include/stereo_types.hpp
index c4fa54de4..fc57ddcf9 100644
--- a/lib/libstereo/include/stereo_types.hpp
+++ b/lib/libstereo/include/stereo_types.hpp
@@ -1,7 +1,7 @@
 #ifndef _FTL_LIBSTEREO_TYPES_HPP_
 #define _FTL_LIBSTEREO_TYPES_HPP_
 
-enum AggregationDirections {
+enum AggregationDirections : int {
 	LEFTRIGHT = 1,
 	RIGHTLEFT = 2,
 	HORIZONTAL = 1+2,
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
index 66f407e53..1c9409cb5 100644
--- a/lib/libstereo/middlebury/main.cpp
+++ b/lib/libstereo/middlebury/main.cpp
@@ -6,6 +6,8 @@
 #include "middlebury.hpp"
 #include "stereo.hpp"
 
+#include "../../components/common/cpp/include/ftl/config.h"
+
 /**
  * @param   disp    disparity map
  * @param   out     output parameter
@@ -21,8 +23,11 @@ void colorize(const cv::Mat &disp, cv::Mat &out, int ndisp=-1) {
 	disp.convertTo(dispf, CV_32FC1);
 	dispf.convertTo(dispc, CV_8UC1, (1.0f / (ndisp > 0 ? (float) ndisp : dmax)) * 255.0f);
 
+	#if OPENCV_VERSION >= 40102
 	cv::applyColorMap(dispc, out, cv::COLORMAP_TURBO);
-	//cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+	#else
+	cv::applyColorMap(dispc, out, cv::COLORMAP_INFERNO);
+	#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -62,18 +67,18 @@ int main(int argc, char* argv[]) {
 
 	int ndisp = calib.vmax - calib.vmin;
 
-	auto stereo = StereoCensusSgm();
-	stereo.params.P1 = 4;
-	stereo.params.P2 = 25;
+	auto stereo = StereoCensusAdaptive();
+	stereo.params.P1 = 8;
+	//stereo.params.P2 = 25;
 
 	stereo.params.d_min = calib.vmin;
 	stereo.params.d_max = calib.vmax;
 	stereo.params.subpixel = 1;
 	stereo.params.debug = true;
 	//stereo.params.paths = AggregationDirections::ALL;
-	stereo.params.uniqueness = 40;
+	stereo.params.uniqueness = 80;
 
-	int i_max = 5;
+	int i_max = 1;
 	float t = 4.0f;
 
 	if (imL.empty() || imR.empty() || gtL.empty()) {
diff --git a/lib/libstereo/src/aggregations/adaptive_penalty.hpp b/lib/libstereo/src/aggregations/adaptive_penalty.hpp
new file mode 100644
index 000000000..a978d8e36
--- /dev/null
+++ b/lib/libstereo/src/aggregations/adaptive_penalty.hpp
@@ -0,0 +1,129 @@
+#ifndef _FTL_LIBSTEREO_AGGREGATIONS_ADAPTIVE_HPP_
+#define _FTL_LIBSTEREO_AGGREGATIONS_ADAPTIVE_HPP_
+
+#include "../dsi.hpp"
+#include "../array2d.hpp"
+
+namespace ftl {
+namespace stereo {
+namespace aggregations {
+
+template <typename DSIIN>
+struct AdaptivePenaltySGM {
+	typedef typename DSIIN::Type Type;
+	typedef typename DSIIN::Type costtype_t;
+
+	// Provided externally
+	const DSIIN in;
+	typename Array2D<costtype_t>::Data min_cost_all;
+
+	const int P1;
+
+	// Provided internally
+	typename DisparitySpaceImage<costtype_t>::DataType out;
+	typename DisparitySpaceImage<costtype_t>::DataType previous;
+	typename DisparitySpaceImage<costtype_t>::DataType updates;
+	typename Array2D<uint8_t>::Data penalties;
+
+	struct PathData : BasePathData<costtype_t> {
+		// Custom path data goes here...
+		costtype_t previous_cost_min;
+		costtype_t *previous;
+		costtype_t *current;
+	};
+
+	struct DirectionData {
+		DisparitySpaceImage<costtype_t> previous;
+		DisparitySpaceImage<costtype_t> updates;
+		Array2D<uint8_t> penalties;
+	};
+
+	/* Initialise buffers for a new path direction. */
+	void direction(DirectionData &data, DisparitySpaceImage<costtype_t> &buffer) {
+		out = buffer.data();
+		data.previous.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		data.updates.create(out.width+out.height, 1, out.disp_min, out.disp_max);
+		previous = data.previous.data();
+		updates = data.updates.data();
+		data.penalties.create(out.width, out.height);  // Note: should already exist
+		penalties = data.penalties.data();
+	}
+
+	/* Reset buffers to start a new path */
+	__host__ __device__ inline void startPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		data.previous = &previous(0,data.pathix,previous.disp_min);
+		data.current = &updates(0,data.pathix,updates.disp_min);
+
+		for (int d=thread; d<=previous.disp_min; d+=stride) {
+			data.previous[d] = 0;
+		}
+
+		// To ensure all threads have finished clearing the buffer
+		#ifdef __CUDA_ARCH__
+		__syncwarp();
+		#endif
+	}
+
+	__host__ __device__ inline void endPath(ushort2 pixel, ushort thread, ushort stride, PathData &data) {}
+
+	/* Main SGM cost aggregation function */
+	__host__ __device__ inline costtype_t calculateCost(ushort2 pixel, int d, costtype_t *previous, int size, costtype_t previous_cost_min) {
+		const costtype_t L_min =
+			min(previous[d],
+				min(costtype_t(previous_cost_min + penalties(pixel.y, pixel.x)),
+						min(costtype_t(previous[min(d+1,size)] + P1),
+									costtype_t(previous[max(d-1,0)] + P1))
+			)
+		);
+
+		// Note: This clamping to min 0 does make a tiny difference
+		auto cost = L_min + in(pixel.y,pixel.x,d+in.disp_min);
+		return (cost > previous_cost_min) ? cost - previous_cost_min : 0;
+	}
+
+	/* Stride over each disparity and calculate minimum cost */
+	__host__ __device__ inline void operator()(ushort2 pixel, ushort thread, ushort stride, PathData &data) {
+		#ifndef __CUDA_ARCH__
+		using std::min;
+		using std::max;
+		#endif
+
+		const int d_stop = int(out.disp_max)-int(out.disp_min);
+
+		costtype_t min_cost = 255;
+
+		// For each disparity (striding the threads)
+		for (int d=thread; d<=d_stop; d+=stride) {
+			auto c = calculateCost(pixel, d, data.previous, d_stop, data.previous_cost_min);
+
+			out(pixel.y,pixel.x,d+in.disp_min) += c;
+			data.current[d] = c;
+
+			// Record the smallest disparity cost for this pixel
+			min_cost = (c < min_cost) ? c : min_cost;
+		}
+
+		// WARP Aggregate on GPU only (assumes stride = 1 on CPU)
+		// Min cost from each thread must be combined for overall minimum
+		// Each thread then obtains thread global minimum
+		#ifdef __CUDA_ARCH__
+		min_cost = warpMin(min_cost);
+		#else
+		// add assert
+		#endif
+
+		data.previous_cost_min = min_cost;
+		min_cost_all(pixel.y,pixel.x) += min_cost; // atomic?
+
+		// Swap current and previous cost buffers
+		costtype_t *tmp_ptr = const_cast<costtype_t *>(data.previous);
+		data.previous = data.current;
+		data.current = tmp_ptr;
+	}
+};
+
+}
+}
+}
+
+#endif
diff --git a/lib/libstereo/src/array2d.hpp b/lib/libstereo/src/array2d.hpp
index f8385ca2a..1dbb8a034 100644
--- a/lib/libstereo/src/array2d.hpp
+++ b/lib/libstereo/src/array2d.hpp
@@ -48,6 +48,14 @@ public:
 		if (needs_free_ && data_.data) freeMemory(data_.data);
 	}
 
+	Array2D<T> &operator=(const Array2D<T> &c) {
+		data_ = c.data_;
+		width = c.width;
+		height = c.height;
+		needs_free_ = false;
+		return *this;
+	}
+
 	struct Data {
 		__host__ __device__ inline T& operator() (const int y, const int x) {
 			return data[x + y*pitch];
diff --git a/lib/libstereo/src/cost_aggregation.hpp b/lib/libstereo/src/cost_aggregation.hpp
index d59613a9e..5f3d9dec3 100644
--- a/lib/libstereo/src/cost_aggregation.hpp
+++ b/lib/libstereo/src/cost_aggregation.hpp
@@ -83,22 +83,6 @@ template <typename F>
 class PathAggregator {
 	public:
 
-	void queuePath(int id, F &f) {
-		F f1 = f;
-		f1.direction(path_data[id], out);
-
-		switch (id) {
-		case 0	:	parallel1DWarp<algorithms::Aggregator<F, 1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
-		case 1	:	parallel1DWarp<algorithms::Aggregator<F,-1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
-		case 2	:	parallel1DWarp<algorithms::Aggregator<F, 0, 1>>({f1}, f.in.width, f.in.disparityRange()); break;
-		case 3	:	parallel1DWarp<algorithms::Aggregator<F, 0,-1>>({f1}, f.in.width, f.in.disparityRange()); break;
-		case 4	:	parallel1DWarp<algorithms::Aggregator<F, 1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
-		case 5	:	parallel1DWarp<algorithms::Aggregator<F,-1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
-		case 6	:	parallel1DWarp<algorithms::Aggregator<F, 1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
-		case 7	:	parallel1DWarp<algorithms::Aggregator<F,-1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
-		}
-	}
-
 	DisparitySpaceImage<typename F::Type> &operator()(F &f, int paths) {
 		out.create(f.in.width, f.in.height, f.in.disp_min, f.in.disp_max);
 		out.clear();
@@ -113,8 +97,38 @@ class PathAggregator {
 		return out;
 	}
 
+	typename F::DirectionData &getDirectionData(AggregationDirections d) {
+		switch (d) {
+		case AggregationDirections::LEFTRIGHT			: return path_data[0];
+		case AggregationDirections::RIGHTLEFT			: return path_data[1];
+		case AggregationDirections::UPDOWN				: return path_data[2];
+		case AggregationDirections::DOWNUP				: return path_data[3];
+		case AggregationDirections::TOPLEFTBOTTOMRIGHT	: return path_data[4];
+		case AggregationDirections::BOTTOMRIGHTTOPLEFT	: return path_data[5];
+		case AggregationDirections::BOTTOMLEFTTOPRIGHT	: return path_data[6];
+		case AggregationDirections::TOPRIGHTBOTTOMLEFT	: return path_data[7];
+		default: throw std::exception();
+		}
+	}
+
 	private:
 	DisparitySpaceImage<typename F::Type> out;
 	typename F::DirectionData path_data[8];
 	DisparitySpaceImage<typename F::Type> buffers[8];
+
+	void queuePath(int id, F &f) {
+		F f1 = f;
+		f1.direction(path_data[id], out);
+
+		switch (id) {
+		case 0	:	parallel1DWarp<algorithms::Aggregator<F, 1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
+		case 1	:	parallel1DWarp<algorithms::Aggregator<F,-1, 0>>({f1}, f.in.height, f.in.disparityRange()); break;
+		case 2	:	parallel1DWarp<algorithms::Aggregator<F, 0, 1>>({f1}, f.in.width, f.in.disparityRange()); break;
+		case 3	:	parallel1DWarp<algorithms::Aggregator<F, 0,-1>>({f1}, f.in.width, f.in.disparityRange()); break;
+		case 4	:	parallel1DWarp<algorithms::Aggregator<F, 1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 5	:	parallel1DWarp<algorithms::Aggregator<F,-1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 6	:	parallel1DWarp<algorithms::Aggregator<F, 1,-1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		case 7	:	parallel1DWarp<algorithms::Aggregator<F,-1, 1>>({f1}, f.in.height+f.in.width, f.in.disparityRange()); break;
+		}
+	}
 };
diff --git a/lib/libstereo/src/costs/absolute_diff.hpp b/lib/libstereo/src/costs/absolute_diff.hpp
index 460eb42e5..d60394e70 100644
--- a/lib/libstereo/src/costs/absolute_diff.hpp
+++ b/lib/libstereo/src/costs/absolute_diff.hpp
@@ -25,6 +25,8 @@ namespace impl {
 			return ((r_val > min(l_val_n,l_val) && r_val < max(l_val_n,l_val)) || (r_val > min(l_val_p,l_val) && r_val < max(l_val_p,l_val))) ? 0 : abs(l_val-r_val);
 		}
 
+		static constexpr unsigned short COST_MAX = 255;
+
 		Array2D<uchar>::Data l;
 		Array2D<uchar>::Data r;
 	};
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
index c8467b491..fc7be10d3 100644
--- a/lib/libstereo/src/costs/census.hpp
+++ b/lib/libstereo/src/costs/census.hpp
@@ -28,6 +28,8 @@ namespace impl {
 			#endif
 		}
 
+		static constexpr unsigned short COST_MAX = 64;
+
 		Array2D<uint64_t>::Data ct_l;
 		Array2D<uint64_t>::Data ct_r;
 	};
diff --git a/lib/libstereo/src/costs/dual.hpp b/lib/libstereo/src/costs/dual.hpp
index 3cbeb7eeb..04d0831d2 100644
--- a/lib/libstereo/src/costs/dual.hpp
+++ b/lib/libstereo/src/costs/dual.hpp
@@ -90,7 +90,7 @@ public:
 		this->data().weights_r = weights_r.data();
 	}
 
-	static constexpr short COST_MAX = A::COST_MAX + B::COST_MAX;
+	static constexpr short COST_MAX = A::DataType::COST_MAX + B::DataType::COST_MAX;
 
 protected:
 	A &cost_a;
diff --git a/lib/libstereo/src/costs/gradient.hpp b/lib/libstereo/src/costs/gradient.hpp
index 4e6ea23d8..46b3e1b87 100644
--- a/lib/libstereo/src/costs/gradient.hpp
+++ b/lib/libstereo/src/costs/gradient.hpp
@@ -25,6 +25,8 @@ namespace impl {
 			#endif
 		}
 
+		static constexpr unsigned short COST_MAX = 255;
+
 		Array2D<short>::Data l_dx;
 		Array2D<short>::Data r_dx;
 		Array2D<short>::Data l_dy;
diff --git a/lib/libstereo/src/costs/mutual_information.hpp b/lib/libstereo/src/costs/mutual_information.hpp
index 722692ce5..05f526f0a 100644
--- a/lib/libstereo/src/costs/mutual_information.hpp
+++ b/lib/libstereo/src/costs/mutual_information.hpp
@@ -23,6 +23,8 @@ namespace impl {
 			return -(H1+H2-H12);
 		}
 
+		static constexpr unsigned short COST_MAX = 255;
+
 		Array2D<unsigned char>::Data l;
 		Array2D<unsigned char>::Data r;
 		Array2D<float>::Data h1;
diff --git a/lib/libstereo/src/dsbase.hpp b/lib/libstereo/src/dsbase.hpp
index d5f99241d..d2f6fc463 100644
--- a/lib/libstereo/src/dsbase.hpp
+++ b/lib/libstereo/src/dsbase.hpp
@@ -18,7 +18,7 @@ namespace impl {
 		uint16_t disp_min;
 		uint16_t disp_max;
 
-		static constexpr T COST_MAX = std::numeric_limits<T>::max();
+		//static constexpr T COST_MAX = std::numeric_limits<T>::max();
 
 		__host__ __device__ inline uint16_t disparityRange() const { return disp_max-disp_min+1; }
 		__host__ __device__ inline uint32_t size() const { return width * disparityRange() * height; }
diff --git a/lib/libstereo/src/stereo_census_adaptive.cu b/lib/libstereo/src/stereo_census_adaptive.cu
new file mode 100644
index 000000000..d0c6f105c
--- /dev/null
+++ b/lib/libstereo/src/stereo_census_adaptive.cu
@@ -0,0 +1,154 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <opencv2/highgui.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/census.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/adaptive_penalty.hpp"
+
+#include "median_filter.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::AdaptivePenaltySGM;
+
+struct StereoCensusAdaptive::Impl {
+	//DisparitySpaceImage<unsigned short> dsi;
+	CensusMatchingCost cost;
+	Array2D<unsigned short> cost_min_paths;
+	Array2D<unsigned short> uncertainty;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+    Array2D<uchar> r;
+    Array2D<uchar> penalty;
+
+	PathAggregator<AdaptivePenaltySGM<CensusMatchingCost::DataType>> aggr;
+	WinnerTakesAll<DSImage16U,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+        disparity_r(width, height), l(width, height), r(width, height),
+        penalty(width, height) {}
+
+};
+
+StereoCensusAdaptive::StereoCensusAdaptive() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoCensusAdaptive::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	timer_set();
+
+	// CT
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+    if (params.debug) { timer_print("census transform"); }
+    
+    impl_->penalty.toGpuMat().setTo(params.P1*4);
+    auto canny = cv::cuda::createCannyEdgeDetector(30,120);
+    cv::cuda::GpuMat edges;
+    canny->detect(impl_->l.toGpuMat(), edges);
+    impl_->penalty.toGpuMat().setTo(int(float(params.P1)*1.5f), edges);
+
+    cv::Mat penalties;
+    impl_->penalty.toGpuMat().download(penalties);
+    cv::imshow("Penalties", penalties);
+
+	// cost aggregation
+	AdaptivePenaltySGM<CensusMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1};
+	impl_->aggr.getDirectionData(AggregationDirections::LEFTRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::RIGHTLEFT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::UPDOWN).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::DOWNUP).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::TOPLEFTBOTTOMRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::BOTTOMRIGHTTOPLEFT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::BOTTOMLEFTTOPRIGHT).penalties = impl_->penalty;
+	impl_->aggr.getDirectionData(AggregationDirections::TOPRIGHTBOTTOMLEFT).penalties = impl_->penalty;
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+
+	if (disparity.isGpuMat()) {
+		auto uncertainty = impl_->uncertainty.toGpuMat();
+		cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+		cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+		impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	}
+	else {
+		auto uncertainty = impl_->uncertainty.toMat();
+		cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
+		cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+		impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	}
+
+	median_filter(impl_->wta.disparity, disparity);
+	if (params.debug) { timer_print("median filter"); }
+}
+
+StereoCensusAdaptive::~StereoCensusAdaptive() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/stereo_gradientstree.cu b/lib/libstereo/src/stereo_gradientstree.cu
index 272d6f4b2..1312a13e4 100644
--- a/lib/libstereo/src/stereo_gradientstree.cu
+++ b/lib/libstereo/src/stereo_gradientstree.cu
@@ -92,13 +92,13 @@ void StereoGradientStree::compute(cv::InputArray l, cv::InputArray r, cv::Output
 	cudaSafeCall(cudaDeviceSynchronize());
 
 	//AggregationParameters aggr_params = {impl_->cost_min_paths.data(), params};
-	StandardSGM<GradientMatchingCostL2::DataType> func1 = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
+	StandardSGM<GradientMatchingCostL2::DataType> func1 = {impl_->cost.data(), impl_->cost_min_paths.data(), int(params.P1), int(params.P2)};
 	auto &out1 = impl_->aggr1(func1, AggregationDirections::HORIZONTAL);
 
 	cudaSafeCall(cudaDeviceSynchronize());
 	if (params.debug) { timer_print("Aggregation 1"); }
 
-	StandardSGM<DisparitySpaceImage<unsigned short>::DataType> func2 = {out1.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
+	StandardSGM<DisparitySpaceImage<unsigned short>::DataType> func2 = {out1.data(), impl_->cost_min_paths.data(), int(params.P1), int(params.P2)};
 	auto &out2 = impl_->aggr2(func2, AggregationDirections::VERTICAL);
 
 	cudaSafeCall(cudaDeviceSynchronize());
-- 
GitLab