diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index cc1b10aefcaa83ff2fd2e563bb42f6dfb669a44b..35d2e7b4b9ba952c7f2f14cfbd88c5b6695f3d71 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -67,6 +67,8 @@ if (LIBSTEREO_SHARED)
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
                 src/dsi_tools.cu
+                src/costs/gct.cu
+                src/algorithms/gct.cu
     )
     set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp)
 
@@ -103,6 +105,8 @@ else()
                 src/median_filter.cu
                 src/median_filter_fixstars.cu
                 src/dsi_tools.cu
+                src/costs/gct.cu
+                src/algorithms/gct.cu
     )
 endif()
 
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index c596986b02bf3243f15a4f00db4b919a2fa062db..303d81509aefc7b6cb033f8fda4414315fa06f47 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -3,6 +3,36 @@
 #include <opencv2/core/mat.hpp>
 #include <stereo_types.hpp>
 
+class StereoGTCensusSgm {
+public:
+	StereoGTCensusSgm();
+	~StereoGTCensusSgm();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+	void setEdges();
+
+	struct Parameters {
+		int d_min = 0;
+		int d_max = 0;
+		unsigned short P1 = 5;
+		unsigned short P2 = 25;
+		float uniqueness = std::numeric_limits<unsigned short>::max();
+		int subpixel = 1; // subpixel interpolation method
+		bool lr_consistency = true;
+		int paths = AggregationDirections::HORIZONTAL |
+					AggregationDirections::VERTICAL |
+					AggregationDirections::DIAGONAL;
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
+
 class StereoADCensusSgm {
 public:
 	StereoADCensusSgm();
@@ -190,7 +220,7 @@ private:
 
 /**
  * STABLE Binary descriptor. This is a general implementation.
- * 
+ *
  * @see K. Valentín, R. Huber-Mörk, and S. Štolc, “Binary descriptor-based dense
  *      line-scan stereo matching,” J. Electron. Imaging, vol. 26, no. 1, 2017.
  */
@@ -410,7 +440,7 @@ private:
  * Ternary census, or 3 moded, where there is a noise threshold and where
  * pixels can be identified as no luminance change in addition to above or
  * below.
- * 
+ *
  * @see "TEXTURE-AWARE DENSE IMAGE MATCHING USING TERNARY CENSUS TRANSFORM" (2016)
  * @see "Local disparity estimation with three-moded cross census and advanced support weight" (2013)
  */
diff --git a/lib/libstereo/middlebury/algorithms.hpp b/lib/libstereo/middlebury/algorithms.hpp
index 604f0f57c73529ffa5ae6b88b9a42465f6e51a3f..f3878fe0ce8e14a6ab9d62a2ef6ef26ffda8382d 100644
--- a/lib/libstereo/middlebury/algorithms.hpp
+++ b/lib/libstereo/middlebury/algorithms.hpp
@@ -337,11 +337,30 @@ namespace Impl {
 		}
 	};
 
+	struct GCTSgm : public Algorithm {
+		GCTSgm() { P1 = 12.0f; P2 = 52.0f; }  // Tuned to total error 2.0
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoGTCensusSgm stereo;
+			stereo.params.P1 = P1;
+			stereo.params.P2 = P2;
+			stereo.params.subpixel = subpixel;
+			stereo.params.lr_consistency = lr_consistency;
+
+			stereo.params.debug = false;
+			stereo.params.d_min = data.calib.vmin;
+			stereo.params.d_max = data.calib.vmax;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
 }
 
 static const std::map<std::string, Algorithm*> algorithms = {
 	{ "censussgm", new Impl::CensusSGM() },
-	{ "mcensussgm", new Impl::MeanCensusSGM() },
+	{ "gctsgm", new Impl::GCTSgm() },
+
+/*	{ "mcensussgm", new Impl::MeanCensusSGM() },
 	{ "gcensussgm", new Impl::GCensusSGM() },
 	{ "ecensussgm", new Impl::ECensusSGM() },
 	{ "stablesgm", new Impl::StableSGM() },
@@ -354,5 +373,5 @@ static const std::map<std::string, Algorithm*> algorithms = {
 	{ "tcensussgm",  new Impl::TCensusSGM() },
 	{ "wcensussgm",  new Impl::WCensusSGM() },
 	{ "misgm",  new Impl::MiSGM() },
-	{ "varcensus",  new Impl::VarCensusSGM() },
+	{ "varcensus",  new Impl::VarCensusSGM() },*/
 };
diff --git a/lib/libstereo/src/algorithms/gct.cu b/lib/libstereo/src/algorithms/gct.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ebe8d906cbcd555ca561f3fa5ddeb9719d3917df
--- /dev/null
+++ b/lib/libstereo/src/algorithms/gct.cu
@@ -0,0 +1,67 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../costs/gct.hpp"
+
+struct StereoGTCensusSgm::Impl : public StereoSgm<GeneralizedCensusMatchingCost, StereoGTCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoGTCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoGTCensusSgm::StereoGTCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoGTCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	impl_->cost.setEdges({
+		{{-1,-1}, {1,1}},
+		{{1,-1}, {-1,1}},
+		{{0,-1}, {0,1}},
+		{{-1,0}, {1,0}},
+		{{-2,-2}, {2,2}},
+		{{2,-2}, {-2,2}},
+		{{0,-2}, {0,2}},
+		{{-2,0}, {2,0}}
+	});
+	/* wx * wy square window
+	std::vector<std::pair<std::pair<int, int>,std::pair<int, int>>> edges;
+	{
+		int wx = 7;
+		int wy = 7;
+
+		for (int x = -wx/2; x <= wx/2; x++) {
+			for (int y = -wy/2; y <= wy/2; y++) {
+				edges.push_back({{0,0},{y,x}});
+			}
+		}
+	}*/
+
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+	/*WinnerTakesAll<GeneralizedCensusMatchingCost> wta;
+	wta(impl_->cost, 0, true);
+
+	median_filter(wta.disparity, disparity);*/
+}
+
+StereoGTCensusSgm::~StereoGTCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
diff --git a/lib/libstereo/src/costs/census.cu b/lib/libstereo/src/costs/census.cu
index f56cedf608d3eb87ccbdf078f563dccd1f85d688..f2262e985361a5bd80eb272e67b2c8480a7629d1 100644
--- a/lib/libstereo/src/costs/census.cu
+++ b/lib/libstereo/src/costs/census.cu
@@ -127,6 +127,7 @@ namespace algorithms {
 					// zero if first value, otherwise shift to left
 					if (i % 64 == 0) { *out = 0; }
 					else             { *out = (*out << 1); }
+					// symmetric pattern? also redundant
 					*out |= (im(y-wy,x-wx) < (im(y+wy,x+wx)) ? 1 : 0);
 
 					i += 1;
@@ -249,7 +250,7 @@ void CensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
 		parallel2D<algorithms::GCensusTransformRowMajor<9,7>>({l.data(), ct_l_.data()}, l.width, l.height);
 		parallel2D<algorithms::GCensusTransformRowMajor<9,7>>({r.data(), ct_r_.data()}, r.width, r.height);
 	} else {
-		// TODO: 
+		// TODO:
 	}
 }
 
@@ -327,7 +328,7 @@ void MiniCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &
 		parallel2D<algorithms::GCensusTransformRowMajor<5,3>>({l.data(), ct_l_.data()}, l.width, l.height);
 		parallel2D<algorithms::GCensusTransformRowMajor<5,3>>({r.data(), ct_r_.data()}, r.width, r.height);
 	} else {
-		// TODO: 
+		// TODO:
 	}
 }
 
diff --git a/lib/libstereo/src/costs/census.hpp b/lib/libstereo/src/costs/census.hpp
index 4600b64a26674b5a23a9d77b65b210e618eabd63..4a74e2e82d629aca81af6955119628f9bd16c25f 100644
--- a/lib/libstereo/src/costs/census.hpp
+++ b/lib/libstereo/src/costs/census.hpp
@@ -5,8 +5,8 @@
 #include "array2d.hpp"
 #include "dsbase.hpp"
 #include <stereo_types.hpp>
-
 #include <cuda_runtime.h>
+
 namespace impl {
 	__host__ __device__ static inline uint64_t popcount(const uint64_t bits) {
 		#if defined(__CUDA_ARCH__)
diff --git a/lib/libstereo/src/costs/gct.cu b/lib/libstereo/src/costs/gct.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5d76340b969d16e4c0a37ce6d44820dc2ac7caae
--- /dev/null
+++ b/lib/libstereo/src/costs/gct.cu
@@ -0,0 +1,171 @@
+#include "gct.hpp"
+#include "../util.hpp"
+
+static const int WX = 11;
+static const int WY = 11;
+
+namespace algorithms {
+	/** Fife, W. S., & Archibald, J. K. (2012). Improved census transforms for
+	* resource-optimized stereo vision. IEEE Transactions on Circuits and
+	* Systems for Video Technology, 23(1), 60-73.
+	*/
+	template<int WINX, int WINY>
+	struct GeneralizedCensusTransform {
+		__host__ __device__ inline void window(const int y, const int x, uint64_t* __restrict__ out) {
+
+			uint8_t i = 0; // bit counter for *out
+
+			for (int e = 0; e < nedges; e++) {
+
+				// edges contain window indices, calculate window coordinates
+				const int y1 = y + edges(e,0) % WINY - WINY/2;
+				const int x1 = x + edges(e,0) / WINY - WINX/2;
+
+				const int y2 = y + edges(e,1) % WINY - WINY/2;
+				const int x2 = x + edges(e,1) / WINY - WINX/2;
+
+				// zero if first value, otherwise shift to left
+				if (i % 64 == 0) { *out = 0; }
+				else             { *out = (*out << 1); }
+				*out |= (im(y1,x1) < (im(y2,x2)) ? 1 : 0);
+
+				i += 1;
+				// if all bits set, continue to next element
+				if (i % 64 == 0) { out++; }
+			}
+		}
+
+		__host__ __device__  void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+			for (int y = thread.y+WINY/2; y<size.y-WINY/2-1; y+=stride.y) {
+				for (int x = thread.x+WINX/2; x<size.x-WINX/2-1; x+=stride.x) {
+					window(y, x, &(out(y, x*WSTEP)));
+				}
+			}
+		}
+
+		int nedges;
+		Array2D<uchar>::Data edges;
+		Array2D<uchar>::Data im;
+		Array2D<uint64_t>::Data out;
+
+		// number of uint64_t values for each window
+		static constexpr int WSTEP = (WINX*WINY - 1)/(sizeof(uint64_t)*8) + 1;
+	};
+}
+
+void GeneralizedCensusMatchingCost::set(const Array2D<uchar> &l, const Array2D<uchar> &r) {
+
+	parallel2D<algorithms::GeneralizedCensusTransform<WX,WY>>({edges_.height, edges_.data(), l.data(), ct_l_.data()}, l.width, l.height);
+	parallel2D<algorithms::GeneralizedCensusTransform<WX,WY>>({edges_.height, edges_.data(), r.data(), ct_r_.data()}, r.width, r.height);
+}
+
+void GeneralizedCensusMatchingCost::setEdges(const std::vector<std::pair<int, int>> &edges) {
+	if (edges.size() >= (WX * WY)) {
+		throw std::exception(); // too many edges
+	}
+
+	cv::Mat data(cv::Size(2, edges.size()), CV_8UC1);
+	for (size_t i = 0; i < edges.size(); i++) {
+		if (edges[i].first < 0 || edges[0].second < 0) {
+			throw std::exception(); // indices must be positive
+		}
+
+		data.at<uchar>(i,0) = edges[i].first;
+		data.at<uchar>(i,1) = edges[i].second;
+	}
+
+	edges_.create(2, edges.size());
+	#ifdef USE_GPU
+	edges_.toGpuMat().upload(data);
+	#else
+	data.copyTo(edges_.toMat());
+	#endif
+}
+
+void GeneralizedCensusMatchingCost::setEdges(const std::vector<std::pair<std::pair<int, int>,std::pair<int, int>>> &edges) {
+	std::vector<std::pair<int, int>> edges_idx;
+	for (const auto& p : edges) {
+		const auto& p1 = p.first;
+		const auto& p2 = p.second;
+
+		int i1 = p1.first * WX + p1.second + (WX*WY-1)/2;
+		int i2 = p2.first * WX + p2.second + (WX*WY-1)/2;
+		printf("i1: %i, i2: %i\n", i1, i2);
+		edges_idx.push_back({i1, i2});
+	}
+
+	/* TODO: move to unit test
+	for (int i = 0; i < edges.size(); i++) {
+		auto p = edges_idx[i];
+
+		const int y1 = p.first % WY - WY/2;
+		const int x1 = p.first / WY - WX/2;
+
+		const int y2 = p.second % WY - WY/2;
+		const int x2 = p.second / WY - WX/2;
+		printf("(%i,%i), (%i,%i)\n", y1, x1, y2, x2);
+	}*/
+
+	setEdges(edges_idx);
+}
+
+void GeneralizedCensusMatchingCost::set(cv::InputArray l, cv::InputArray r) {
+	if (l.type() != CV_8UC1 || r.type() != CV_8UC1) { throw std::exception(); }
+	if (l.rows() != r.rows() || l.cols() != r.cols() || l.rows() != height() || l.cols() != width()) {
+		throw std::exception();
+	}
+
+	if (l.isGpuMat() && r.isGpuMat()) {
+		auto ml = l.getGpuMat();
+		auto mr = r.getGpuMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else if (l.isMat() && r.isMat()) {
+		auto ml = l.getMat();
+		auto mr = r.getMat();
+		set(Array2D<uchar>(ml), Array2D<uchar>(mr));
+	}
+	else {
+		throw std::exception();
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/*
+struct StereoCensusSgm::Impl : public StereoSgm<CensusMatchingCost, StereoCensusSgm::Parameters> {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+
+	Impl(StereoCensusSgm::Parameters &params, int width, int height, int dmin, int dmax) :
+		StereoSgm(params, width, height, dmin, dmax), l(width, height), r(width, height) {}
+};
+
+StereoCensusSgm::StereoCensusSgm() : impl_(nullptr) {
+	impl_ = new Impl(params, 0, 0, 0, 0);
+}
+
+void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->cost.height() || r.cols() != impl_->cost.width()) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(params, l.cols(), l.rows(), params.d_min, params.d_max);
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+	impl_->cost.setPattern(params.pattern);
+	impl_->cost.set(impl_->l, impl_->r);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	impl_->compute(disparity);
+}
+
+StereoCensusSgm::~StereoCensusSgm() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
+*/
diff --git a/lib/libstereo/src/costs/gct.hpp b/lib/libstereo/src/costs/gct.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..90f7d855556e4cb386fcf9fd0eb3b23af89c4f64
--- /dev/null
+++ b/lib/libstereo/src/costs/gct.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "../dsbase.hpp"
+#include "../array2d.hpp"
+#include "census.hpp"
+
+namespace impl {
+	template<uint8_t WMAX>
+	using GeneralizedCensusMatchingCost = HammingCost<WMAX*WMAX>;
+}
+
+class GeneralizedCensusMatchingCost : public DSBase<impl::GeneralizedCensusMatchingCost<11>> {
+public:
+	typedef impl::GeneralizedCensusMatchingCost<11> DataType;
+	typedef unsigned short Type;
+
+	GeneralizedCensusMatchingCost() : DSBase<DataType>(0, 0, 0, 0) {};
+	GeneralizedCensusMatchingCost(int width, int height, int disp_min, int disp_max)
+		: DSBase<DataType>(width, height, disp_min, disp_max),
+			ct_l_(width*data().WSTEP, height), ct_r_(width*data().WSTEP,height)
+		{
+			data().l = ct_l_.data();
+			data().r = ct_r_.data();
+		}
+
+	// pairs of indices (window size 11x11, origin top left)
+	void setEdges(const std::vector<std::pair<int, int>> &edges);
+	// pairs of (y,x) coordinates (relative to window, origin at center)
+	void setEdges(const std::vector<std::pair<std::pair<int, int>,std::pair<int, int>>> &edges);
+
+	void set(cv::InputArray l, cv::InputArray r);
+	void set(const Array2D<uchar>& l, const Array2D<uchar>& r);
+	static constexpr Type COST_MAX = DataType::COST_MAX;
+
+protected:
+	Array2D<uint64_t> ct_l_;
+	Array2D<uint64_t> ct_r_;
+	Array2D<uchar> edges_;
+};
diff --git a/lib/libstereo/src/dsbase.hpp b/lib/libstereo/src/dsbase.hpp
index d2f6fc4636d1be471aaa6f4e2762ff4a9e6c6756..bec96148cab3ab2553214927fbc2631b59ae8f79 100644
--- a/lib/libstereo/src/dsbase.hpp
+++ b/lib/libstereo/src/dsbase.hpp
@@ -1,6 +1,7 @@
 #ifndef _FTL_LIBSTEREO_DSBASE_HPP_
 #define _FTL_LIBSTEREO_DSBASE_HPP_
 
+#include <cstdint>
 #include <cuda_runtime.h>
 #include <type_traits>