diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index 2ccab32d659c3ccce9acc9c8b0c3f17f14948190..16e0648e93711f6dc4dfdcece8b6d0109d13dd14 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -174,6 +174,36 @@ private:
 	Impl *impl_;
 };
 
+/**
+ * SGM aggregation on ground truth cost
+ */
+class StereoGtSgm {
+public:
+	StereoGtSgm();
+	~StereoGtSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp);
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		float P1 = 0.1f;
+		float P2 = 1.0f;
+		float uniqueness = std::numeric_limits<float>::max();
+		int subpixel = 1; // subpixel interpolation method
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
 
 /**
  * Census + SGM + prior
diff --git a/lib/libstereo/middlebury/main.cpp b/lib/libstereo/middlebury/main.cpp
index 60dfee546f55e6cd2411227a61b5e5fffeefee85..1d03ebb146b7afcd4f6d68082b6b989425915db2 100644
--- a/lib/libstereo/middlebury/main.cpp
+++ b/lib/libstereo/middlebury/main.cpp
@@ -73,9 +73,23 @@ static void run_misgm(MiddleburyData &data, cv::Mat &disparity) {
 	stereo.compute(data.imL, data.imR, disparity);
 }
 
+
+static void run_gtsgm(MiddleburyData &data, cv::Mat &disparity) {
+	auto stereo = StereoGtSgm();
+	stereo.params.P1 = 0.1;
+	stereo.params.P2 = 1.0;
+
+	stereo.params.d_min = data.calib.vmin;
+	stereo.params.d_max = data.calib.vmax;
+	stereo.params.subpixel = 1;
+	stereo.params.debug = true;
+	stereo.setPrior(data.gtL);
+	stereo.compute(data.imL, data.imR, disparity);
+}
+
 static const std::map<std::string, std::function<void(MiddleburyData&, cv::Mat&)>> algorithms = {
 	{ "censussgm", run_censussgm },
-	{ "misgm", run_misgm }
+	{ "misgm", run_misgm },
 };
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/libstereo/src/costs/gt.hpp b/lib/libstereo/src/costs/gt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0d531475d937fb90d4b64d5e5fd129a0c93754f
--- /dev/null
+++ b/lib/libstereo/src/costs/gt.hpp
@@ -0,0 +1,68 @@
+#ifndef _FTL_LIBSTEREO_COSTS_GT_HPP_
+#define _FTL_LIBSTEREO_COSTS_GT_HPP_
+
+#include <opencv2/core/core.hpp>
+#include "array2d.hpp"
+#include "dsbase.hpp"
+#include <cuda_runtime.h>
+
+namespace impl {
+	struct GtMatchingCost : DSImplBase<float> {
+		typedef float Type;
+
+		GtMatchingCost(ushort w, ushort h, ushort dmin, ushort dmax) : DSImplBase<float>({w,h,dmin,dmax}) {}
+
+		__host__ __device__ inline float operator()(const int y, const int x, const int d) const {
+			if (round(gt(y,x)) == float(d)) { return 0.0f; }
+			else { return 1.0f; }
+		}
+
+		static constexpr float COST_MAX = 1.0f;
+
+		Array2D<float>::Data gt;
+	};
+}
+
+class GtMatchingCost : public DSBase<impl::GtMatchingCost> {
+public:
+	typedef impl::GtMatchingCost DataType;
+	typedef float Type;
+
+	GtMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	GtMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max) {}
+
+	void set(cv::InputArray gt) {
+		#if USE_GPU
+		if (gt.isGpuMat()) {
+			auto m = gt.getGpuMat();
+			gt_ = Array2D<float>(m);
+		}
+		else if (gt.isMat()) {
+			gt_.create(gt.cols(), gt.rows());
+			gt_.toGpuMat().upload(gt.getMat());
+		}
+		#else
+		if (gt.isGpuMat()) {
+			gt_.create(gt.cols(), gt.rows());
+			gt.getGpuMat().download(gt_.getMat());
+		}
+		else if (gt.isMat()) {
+			auto m = gt.getMat();
+			gt_ = Array2D<float>(m);
+		}
+		#endif
+
+		data().gt = gt_.data();
+	}
+
+	void set(const Array2D<float>& gt) {
+		gt_ = gt;
+		data().gt = gt_.data();
+	}
+
+protected:
+	Array2D<float> gt_;
+};
+
+#endif
diff --git a/lib/libstereo/src/stereo_gtsgm.cu b/lib/libstereo/src/stereo_gtsgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0826c5861fd27ebc2d38e38fbd25b22e96da0995
--- /dev/null
+++ b/lib/libstereo/src/stereo_gtsgm.cu
@@ -0,0 +1,132 @@
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudaarithm.hpp>
+
+#include "stereo.hpp"
+
+#include "util_opencv.hpp"
+#include "costs/gt.hpp"
+#include "dsi.hpp"
+
+#include "wta.hpp"
+#include "cost_aggregation.hpp"
+#include "aggregations/standard_sgm.hpp"
+
+#include "median_filter.hpp"
+
+#ifdef __GNUG__
+
+#include <chrono>
+#include <iostream>
+
+static std::chrono::time_point<std::chrono::system_clock> start;
+
+static void timer_set() {
+		start = std::chrono::high_resolution_clock::now();
+}
+
+static void timer_print(const std::string &msg, const bool reset=true) {
+	auto stop = std::chrono::high_resolution_clock::now();
+
+	char buf[24];
+	snprintf(buf, sizeof(buf), "%5i ms  ",
+				(int) std::chrono::duration_cast<std::chrono::milliseconds>(stop-start).count());
+
+	std::cout << buf <<  msg << "\n" << std::flush;
+	if (reset) { timer_set(); }
+}
+
+#else
+
+static void timer_set() {}
+static void timer_print(const std::string &msg, const bool reset=true) {}
+
+#endif
+
+using cv::Mat;
+using cv::Size;
+using ftl::stereo::aggregations::StandardSGM;
+
+struct StereoGtSgm::Impl {
+	GtMatchingCost cost;
+	Array2D<float> cost_min_paths;
+	Array2D<float> uncertainty;
+	Array2D<float> disparity_r;
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	PathAggregator<StandardSGM<GtMatchingCost::DataType>> aggr;
+	WinnerTakesAll<DSImageFloat,float> wta;
+
+	Impl(int width, int height, int min_disp, int max_disp) :
+		cost(width, height, min_disp, max_disp),
+		cost_min_paths(width, height),
+		uncertainty(width, height),
+		disparity_r(width, height), l(width, height), r(width, height) {}
+
+};
+
+StereoGtSgm::StereoGtSgm() : impl_(nullptr) {
+	impl_ = new Impl(0, 0, 0, 0);
+}
+
+void StereoGtSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+	cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || l.cols() != impl_->cost.width()) {
+		throw std::exception();
+	}
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	timer_set();
+
+	// cost aggregation
+	StandardSGM<GtMatchingCost::DataType> func = {impl_->cost.data(), impl_->cost_min_paths.data(), params.P1, params.P2};
+	auto &out = impl_->aggr(func, params.paths);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("Aggregation"); }
+
+	impl_->wta(out, 0);
+	cudaSafeCall(cudaDeviceSynchronize());
+	if (params.debug) { timer_print("WTA"); }
+
+	// Drory, A., Haubold, C., Avidan, S., & Hamprecht, F. A. (2014).
+	// Semi-global matching: A principled derivation in terms of
+	// message passing. Lecture Notes in Computer Science (Including Subseries
+	// Lecture Notes in Artificial Intelligence and Lecture Notes in
+	// Bioinformatics). https://doi.org/10.1007/978-3-319-11752-2_4
+
+	if (disparity.isGpuMat()) {
+		auto uncertainty = impl_->uncertainty.toGpuMat();
+		cv::cuda::subtract(impl_->wta.min_cost.toGpuMat(), impl_->cost_min_paths.toGpuMat(), uncertainty);
+		cv::cuda::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+		impl_->wta.disparity.toGpuMat().setTo(0, uncertainty);
+	}
+	else {
+		auto uncertainty = impl_->uncertainty.toMat();
+		cv::subtract(impl_->wta.min_cost.toMat(), impl_->cost_min_paths.toMat(), uncertainty);
+		cv::compare(uncertainty, params.uniqueness, uncertainty, cv::CMP_GT);
+		impl_->wta.disparity.toMat().setTo(0, uncertainty);
+	}
+
+	median_filter(impl_->wta.disparity, disparity);
+	if (params.debug) { timer_print("median filter"); }
+}
+
+void StereoGtSgm::setPrior(cv::InputArray disp) {
+	if (disp.rows() != impl_->cost.height() || disp.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(disp.cols(), disp.rows(), params.d_min, params.d_max);
+	}
+	impl_->cost.set(disp);
+}
+
+StereoGtSgm::~StereoGtSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}