diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt
index f60cc79a1b7f55d237c911fc97833b8db465da4c..a5f3255d2ca0f72a0df3e04c71b639de6984e946 100644
--- a/lib/libstereo/CMakeLists.txt
+++ b/lib/libstereo/CMakeLists.txt
@@ -49,7 +49,8 @@ if (LIBSTEREO_SHARED)
                 src/algorithms/hwcensussgm.cu
                 src/algorithms/brefcensussgm.cu
                 src/algorithms/meancensussgm.cu
-                src/algorithms/hstablesgm.cu
+                src/algorithms/hstablesgm.cu
+                src/algorithms/clustersf.cu
                 #src/stereo_hier_census.cu
                 src/stereo_wcensussgm.cu
                 src/stereo_census_adaptive.cu
@@ -87,7 +88,8 @@ else()
                 src/algorithms/hwcensussgm.cu
                 src/algorithms/brefcensussgm.cu
                 src/algorithms/meancensussgm.cu
-                src/algorithms/hstablesgm.cu
+                src/algorithms/hstablesgm.cu
+                src/algorithms/clustersf.cu
                 #src/stereo_hier_census.cu
                 src/stereo_wcensussgm.cu
                 src/stereo_census_adaptive.cu
diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp
index e5680c0f997146902c58aaebd8a434b15a820ef6..df9ccdd291b28dd924fd40144998b6e807c82431 100644
--- a/lib/libstereo/include/stereo.hpp
+++ b/lib/libstereo/include/stereo.hpp
@@ -240,6 +240,25 @@ private:
 	Impl *impl_;
 };
 
+/* Clustered Salient Features with focal point */
+class StereoCSF {
+public:
+	StereoCSF();
+	~StereoCSF();
+
+	void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity);
+	void setPrior(cv::InputArray disp) {};
+
+	struct Parameters {
+		bool debug = false;
+	};
+	Parameters params;
+
+private:
+	struct Impl;
+	Impl *impl_;
+};
+
 /**
  * STABLE Binary descriptor. This is a general implementation.
  *
diff --git a/lib/libstereo/middlebury/algorithms.hpp b/lib/libstereo/middlebury/algorithms.hpp
index 0b510df4a46349397b8b738884f570e7c1740df6..094976d0cceac076bca68ce884d3e60321ef5d8e 100644
--- a/lib/libstereo/middlebury/algorithms.hpp
+++ b/lib/libstereo/middlebury/algorithms.hpp
@@ -32,6 +32,15 @@ namespace Impl {
 		}
 	};
 
+	struct ClusterSF : public Algorithm {
+		ClusterSF() {  }
+
+		virtual void run(const MiddleburyData &data, cv::Mat &disparity) override {
+			StereoCSF stereo;
+			stereo.compute(data.imL, data.imR, disparity);
+		}
+	};
+
 	struct ECensusSGM : public Algorithm {
 		ECensusSGM() { P1 = 8.0f; P2 = 40.0f; }  // Tuned to total error 2.0
 
@@ -361,6 +370,8 @@ namespace Impl {
 
 static const std::map<std::string, Algorithm*> algorithms = {
 	{ "censussgm", new Impl::CensusSGM() },
+	{ "cluster", new Impl::ClusterSF() },
+	{ "mcensussgm", new Impl::MeanCensusSGM() },
 	{ "gctsgm", new Impl::GCTSgm() },
 
 /*	{ "mcensussgm", new Impl::MeanCensusSGM() },
diff --git a/lib/libstereo/src/algorithms/clustersf.cu b/lib/libstereo/src/algorithms/clustersf.cu
new file mode 100644
index 0000000000000000000000000000000000000000..332a8e27b1737796dd0eeecb3760d5c460f32b6a
--- /dev/null
+++ b/lib/libstereo/src/algorithms/clustersf.cu
@@ -0,0 +1,143 @@
+#include "stereo.hpp"
+#include "stereosgm.hpp"
+#include "../filters/salient_gradient.hpp"
+#include "../filters/focal_cluster.hpp"
+
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/cudaarithm.hpp>
+
+struct StereoCSF::Impl {
+	Array2D<uchar> l;
+	Array2D<uchar> r;
+	Array2D<uchar> gl;
+	Array2D<uchar> gr;
+	Array2D<uchar> temp;
+	Bucket1D<short2, 64> buckets_l;
+	Bucket2D<ushort, 64> buckets_r;
+	Array1D<int> focal;
+
+	Impl(int width, int height) :
+		l(width, height), r(width, height),
+		gl(width, height), gr(width, height), temp(width, height),
+		buckets_l(height), buckets_r(16, height), focal(1024) {}
+};
+
+StereoCSF::StereoCSF() : impl_(nullptr) {
+	impl_ = new Impl(0, 0);
+}
+
+void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) {
+
+	//cudaSetDevice(0);
+
+	if (l.rows() != impl_->l.height || l.cols() != impl_->l.width) {
+		delete impl_; impl_ = nullptr;
+		impl_ = new Impl(l.cols(), l.rows());
+	}
+
+	mat2gray(l, impl_->l);
+	mat2gray(r, impl_->r);
+
+	Array2D<float> disp_array(l.cols(), l.rows());
+	disp_array.toGpuMat().setTo(cv::Scalar(0.0f));
+
+	Array2D<float> conf_array(l.cols(), l.rows());
+	conf_array.toGpuMat().setTo(cv::Scalar(0.0f));
+	
+	//int fx=1000;
+	//int fy=800;
+	for (int fx = 200; fx < l.cols()-200; fx += 50) {
+	for (int fy = 200; fy < l.rows()-200; fy += 50) {
+		short2 focal_pt_est = {short(l.cols()/2), short(l.rows()/2)};
+		int focal_radius_est = 100000;
+		int focal_radius = 1000;
+
+		for (int i=0; i<3; ++i) {
+			SalientGradientGrouped sgr = {focal_pt_est, focal_radius_est, impl_->r.data(), impl_->gr.data(), impl_->temp.data(), impl_->buckets_r.data(), impl_->r.width, impl_->r.height};
+			parallel1DWarpSM(sgr, r.rows(), r.cols());
+
+			short2 focal_pt = {short(fx), short(fy)};
+			SalientGradient sgl = {focal_pt, focal_radius, impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->buckets_l.data(), impl_->l.width, impl_->l.height};
+			parallel1DWarpSM(sgl, l.rows(), l.cols());
+			impl_->focal.toGpuMat().setTo(cv::Scalar(0));
+
+			FocalCluster fc = {focal_pt, impl_->buckets_l.data(), impl_->buckets_r.data(), impl_->focal.data(), 1024};
+			parallel1DWarp(fc, l.rows(), 1);
+
+			cv::Point max_disp;
+			cv::cuda::minMaxLoc(impl_->focal.toGpuMat(), nullptr, nullptr, nullptr, &max_disp);
+
+			FocalSelector fs = {focal_pt, int(max_disp.x), impl_->buckets_l.data(), impl_->buckets_r.data(), impl_->focal.data(), disp_array.data(), conf_array.data(), 1024};
+			parallel1DWarp(fs, l.rows(), 1);
+
+			// Update right focal point estimate and radius
+			focal_pt_est.y = focal_pt.y;
+			focal_pt_est.x = focal_pt.x-max_disp.x;
+			focal_radius_est = focal_radius;
+			focal_radius /= 2;
+			//std::cout << "Focal disparity = " << max_disp.x << std::endl;
+		}
+	}
+	}
+
+	disp_array.toGpuMat().download(disparity);
+
+	cv::Mat gradtmp;
+	conf_array.toGpuMat().download(gradtmp);
+	gradtmp.convertTo(gradtmp, CV_8UC1, 255.0);
+	cv::applyColorMap(gradtmp, gradtmp, cv::COLORMAP_INFERNO);
+	cv::resize(gradtmp,gradtmp, cv::Size(gradtmp.cols/2, gradtmp.rows/2));
+	cv::imshow("Confidence", gradtmp);
+
+	cv::Mat tmp;
+	impl_->focal.toGpuMat().download(tmp);
+
+	double minval;
+	double maxval;
+	int minloc[2];
+	int maxloc[2];
+	cv::minMaxIdx(tmp, &minval, &maxval, minloc, maxloc);
+
+	std::cout << "Focal Disparity = " << maxloc[1] << std::endl;
+
+	tmp.convertTo(tmp, CV_8UC1, 255.0/maxval);
+	cv::resize(tmp,tmp, cv::Size(tmp.cols, 100));
+
+	//#if OPENCV_VERSION >= 40102
+	//cv::applyColorMap(tmp, tmp, cv::COLORMAP_TURBO);
+	//#else
+	cv::applyColorMap(tmp, tmp, cv::COLORMAP_INFERNO);
+	//#endif
+	cv::imshow("Disparity Hist", tmp);
+
+	cv::Mat imgleft, imgright;
+	impl_->l.toGpuMat().download(imgleft);
+	impl_->r.toGpuMat().download(imgright);
+
+	cv::cvtColor(imgleft,imgleft, cv::COLOR_GRAY2BGR);
+	cv::cvtColor(imgright,imgright, cv::COLOR_GRAY2BGR);
+	cv::drawMarker(imgleft, cv::Point(1000,800), cv::Scalar(0,0,255));
+	cv::drawMarker(imgright, cv::Point(1000-maxloc[1],800), cv::Scalar(0,0,255));
+
+	cv::resize(imgleft,imgleft, cv::Size(imgleft.cols/2, imgleft.rows/2));
+	cv::resize(imgright,imgright, cv::Size(imgright.cols/2, imgright.rows/2));
+	cv::imshow("Left Focal", imgleft);
+	cv::imshow("Right Focal", imgright);
+
+	//impl_->gr.toGpuMat().download(tmp);
+	//cv::resize(tmp,tmp, cv::Size(tmp.cols/2, tmp.rows/2));
+	//cv::imshow("Gradients Right", tmp);
+
+	cudaSafeCall(cudaDeviceSynchronize());
+	disparity.create(l.rows(), l.cols(), CV_32F);
+	cv::waitKey(10000);
+}
+
+StereoCSF::~StereoCSF() {
+	if (impl_) {
+		delete impl_;
+		impl_ = nullptr;
+	}
+}
+
diff --git a/lib/libstereo/src/array1d.hpp b/lib/libstereo/src/array1d.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2bdb80e3aa295df98505fe32814493e1e67ed55
--- /dev/null
+++ b/lib/libstereo/src/array1d.hpp
@@ -0,0 +1,137 @@
+#ifndef _FTL_LIBSTEREO_ARRAY1D_HPP_
+#define _FTL_LIBSTEREO_ARRAY1D_HPP_
+
+#include "memory.hpp"
+
+template<typename T>
+class Array1D {
+public:
+	Array1D() : width(0), needs_free_(false) {
+		data_.data = nullptr;
+	}
+
+	Array1D(int w) : width(w), needs_free_(true) {
+		data_.data = allocateMemory<T>(w);
+	}
+
+	/*explicit Array1D(cv::Mat &m) : needs_free_(false) {
+		#ifdef USE_GPU
+		create(m.cols, m.rows);
+		cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyHostToDevice));
+		#else
+		needs_free_ = false;
+		data_.data = (T*)m.data;
+		data_.pitch = m.step / sizeof(T);
+		width = m.cols;
+		height = m.rows;
+		#endif
+	}
+
+	explicit Array1D(cv::cuda::GpuMat &m) : needs_free_(false) {
+		#ifdef USE_GPU
+		needs_free_ = false;
+		data_.data = (T*)m.data;
+		data_.pitch = m.step / sizeof(T);
+		width = m.cols;
+		height = m.rows;
+		#else
+		create(m.cols, m.rows);
+		cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyDeviceToHost));
+		#endif
+	}*/
+
+	~Array1D() {
+		free();
+	}
+
+	void free() {
+		if (needs_free_ && data_.data) freeMemory(data_.data);
+	}
+
+	Array1D<T> &operator=(const Array1D<T> &c) {
+		data_ = c.data_;
+		width = c.width;
+		needs_free_ = false;
+		return *this;
+	}
+
+	struct Data {
+		__host__ __device__ inline T& operator() (const int x) {
+			return data[x];
+		}
+
+		__host__ __device__ inline const T& operator() (const int x) const {
+			return data[x];
+		}
+
+		T *data;
+	};
+
+	void create(int w) {
+		if (w == width) return;
+		width = w;
+		free();
+		needs_free_ = true;
+		data_.data = allocateMemory<T>(w);
+	}
+
+	inline Data &data() { return data_; }
+	inline const Data &data() const { return data_; }
+
+	void toMat(cv::Mat &m) {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		gm.download(m);
+		#else
+		m = cv::Mat(1, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	cv::Mat toMat() {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(1, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	const cv::Mat toMat() const {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm(1, width, cv::traits::Type<T>::value, (void*)data_.data);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(1, width, cv::traits::Type<T>::value, data_.data);
+		#endif
+	}
+
+	void toGpuMat(cv::cuda::GpuMat &m) {
+		#ifdef USE_GPU
+		m = cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value, (void*)data_.data);
+		#else
+		// TODO
+		#endif
+	}
+
+	cv::cuda::GpuMat toGpuMat() {
+		#ifdef USE_GPU
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value, (void*)data_.data);
+		#else
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value);
+		#endif
+	}
+
+	int width;
+
+private:
+	Data data_;
+	bool needs_free_;
+};
+
+#endif
diff --git a/lib/libstereo/src/bucket1d.hpp b/lib/libstereo/src/bucket1d.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..77f8ff9621112c0adbfb19eb1fbb093ac4e768c8
--- /dev/null
+++ b/lib/libstereo/src/bucket1d.hpp
@@ -0,0 +1,130 @@
+#ifndef _FTL_LIBSTEREO_BUCKET1D_HPP_
+#define _FTL_LIBSTEREO_BUCKET1D_HPP_
+
+#include "memory.hpp"
+
+template<typename T, int DEPTH>
+class Bucket1D {
+public:
+	Bucket1D() : width(0), needs_free_(false) {
+		data_.buckets = nullptr;
+		data_.counters = nullptr;
+	}
+
+	Bucket1D(int w) : width(w), needs_free_(true) {
+		data_.counters = allocateMemory<uint>(w);
+		data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch);
+	}
+
+	~Bucket1D() {
+		free();
+	}
+
+	void free() {
+		if (needs_free_ && data_.buckets) {
+			freeMemory(data_.counters);
+			freeMemory(data_.buckets);
+		}
+	}
+
+	Bucket1D<T, DEPTH> &operator=(const Bucket1D<T, DEPTH> &c) {
+		data_ = c.data_;
+		width = c.width;
+		needs_free_ = false;
+		return *this;
+	}
+
+	struct Data {
+		__host__ __device__ inline uint& operator() (const int y) {
+			return counters[y];
+		}
+
+		__host__ __device__ inline const uint& operator() (const int y) const {
+			return counters[y];
+		}
+
+		__host__ __device__ inline const T& operator() (const int y, int d) const {
+			return buckets[d + y*bucket_pitch];
+		}
+
+		__host__ __device__ inline void add (const int y, T v) {
+			uint ix = atomicInc(&counters[y], DEPTH);
+			if (ix < DEPTH) buckets[ix + y*bucket_pitch] = v;  // atomicInc returns old==DEPTH once before wrapping; guard against OOB write
+		}
+
+		__host__ __device__ inline const T *ptr(const int y) const { return &buckets[y*bucket_pitch]; }
+
+		uint bucket_pitch;
+		T *buckets;
+		uint *counters;
+	};
+
+	void create(int w) {
+		if (w == width) return;
+		width = w;
+		free();
+		needs_free_ = true;
+		data_.counters = allocateMemory<uint>(w);
+		data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch);
+	}
+
+	inline Data &data() { return data_; }
+	inline const Data &data() const { return data_; }
+
+	void toMat(cv::Mat &m) {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		gm.download(m);
+		#else
+		m = cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	cv::Mat toMat() {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	const cv::Mat toMat() const {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	void toGpuMat(cv::cuda::GpuMat &m) {
+		#ifdef USE_GPU
+		m = cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		#else
+		// TODO
+		#endif
+	}
+
+	cv::cuda::GpuMat toGpuMat() {
+		#ifdef USE_GPU
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		#else
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value);
+		#endif
+	}
+
+	int width;
+
+private:
+	Data data_;
+	bool needs_free_;
+};
+
+#endif
diff --git a/lib/libstereo/src/bucket2d.hpp b/lib/libstereo/src/bucket2d.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6eb71d42e3cd74a62fd38f05bd836c23a1f3cd56
--- /dev/null
+++ b/lib/libstereo/src/bucket2d.hpp
@@ -0,0 +1,136 @@
+#ifndef _FTL_LIBSTEREO_BUCKET2D_HPP_
+#define _FTL_LIBSTEREO_BUCKET2D_HPP_
+
+#include "memory.hpp"
+
+template<typename T, int DEPTH>
+class Bucket2D {
+public:
+	Bucket2D() : width(0), height(0), needs_free_(false) {
+		data_.buckets = nullptr;
+		data_.counters = nullptr;
+	}
+
+	Bucket2D(int w, int h) : width(w), height(h), needs_free_(true) {
+		data_.counters = allocateMemory2D<uint>(w, h, data_.counter_pitch);
+		data_.buckets = allocateMemory2D<T>(w*DEPTH, h, data_.bucket_pitch);
+	}
+
+	~Bucket2D() {
+		free();
+	}
+
+	void free() {
+		if (needs_free_ && data_.buckets) {
+			freeMemory(data_.counters);
+			freeMemory(data_.buckets);
+		}
+	}
+
+	Bucket2D<T, DEPTH> &operator=(const Bucket2D<T, DEPTH> &c) {
+		data_ = c.data_;
+		width = c.width;
+		height = c.height;
+		needs_free_ = false;
+		return *this;
+	}
+
+	struct Data {
+		__host__ __device__ inline uint& operator() (const int y, const int x) {
+			return counters[x + y*counter_pitch];
+		}
+
+		__host__ __device__ inline const uint& operator() (const int y, const int x) const {
+			return counters[x + y*counter_pitch];
+		}
+
+		__host__ __device__ inline const T& operator() (const int y, const int x, int d) const {
+			return buckets[d + x*DEPTH + y*bucket_pitch];
+		}
+
+		__host__ __device__ inline void add (const int y, const int x, T v) {
+			uint ix = atomicInc(&counters[x + y*counter_pitch], DEPTH);
+			if (ix < DEPTH) buckets[ix + x*DEPTH + y*bucket_pitch] = v;  // atomicInc returns old==DEPTH once before wrapping; guard against OOB write
+		}
+
+		__host__ __device__ inline uint *ptr(const int y) { return &counters[y*counter_pitch]; }
+		__host__ __device__ inline const uint *ptr(const int y) const { return &counters[y*counter_pitch]; }
+		__host__ __device__ inline const T *ptr(const int y, const int x) const { return &buckets[x*DEPTH+y*bucket_pitch]; }
+
+		uint bucket_pitch;
+		uint counter_pitch;
+		T *buckets;
+		uint *counters;
+	};
+
+	void create(int w, int h) {
+		if (w == width && h == height) return;
+		width = w;
+		height = h;
+		free();
+		needs_free_ = true;
+		data_.counters = allocateMemory2D<uint>(w, h, data_.counter_pitch);
+		data_.buckets = allocateMemory2D<T>(w*DEPTH, h, data_.bucket_pitch);
+	}
+
+	inline Data &data() { return data_; }
+	inline const Data &data() const { return data_; }
+
+	void toMat(cv::Mat &m) {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		gm.download(m);
+		#else
+		m = cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	cv::Mat toMat() {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	const cv::Mat toMat() const {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint)));
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	void toGpuMat(cv::cuda::GpuMat &m) {
+		#ifdef USE_GPU
+		m = cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint)));
+		#else
+		// TODO
+		#endif
+	}
+
+	cv::cuda::GpuMat toGpuMat() {
+		#ifdef USE_GPU
+		return cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint)));
+		#else
+		return cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value);
+		#endif
+	}
+
+	int width;
+	int height;
+
+private:
+	Data data_;
+	bool needs_free_;
+};
+
+#endif
diff --git a/lib/libstereo/src/filters/focal_cluster.hpp b/lib/libstereo/src/filters/focal_cluster.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..32cd5be1a0cf9f016bc2dd9d8a2c07dfa45126bb
--- /dev/null
+++ b/lib/libstereo/src/filters/focal_cluster.hpp
@@ -0,0 +1,90 @@
+#ifndef _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_
+#define _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_
+
+#include "../util.hpp"
+#include "../array1d.hpp"
+#include "../array2d.hpp"
+#include "../bucket1d.hpp"
+#include "../bucket2d.hpp"
+
+struct FocalCluster {
+	short2 focal_pt;
+	Bucket1D<short2, 64>::Data left;
+	Bucket2D<ushort, 64>::Data right;
+	Array1D<int>::Data histogram;
+
+	int max_disparity = 1024;
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			int count = left(y);
+			// Stride a warp of threads over the features
+			for (int f=thread.x; f<count; f+=stride.x) {
+				// For each feature or features near to focal point
+				short2 feature = left(y,f);
+				int distx = feature.x - focal_pt.x;
+				
+				// - For each feature in right image that matches
+				const ushort *ptr = right.ptr(y, feature.y);
+				int count2 = right(y,feature.y);
+				for (int i=0; i<count2; ++i) {
+					//   - Add focal dist to feature X and add to histogram
+					int disparity = max(0,focal_pt.x - int(ptr[i]) + distx);
+					if (disparity < max_disparity && disparity > 0) atomicAdd(&histogram(disparity), 1);
+				}
+			}
+		}
+	}
+};
+
+struct FocalSelector {
+	short2 focal_pt;
+	int focal_disp;
+	Bucket1D<short2, 64>::Data left;
+	Bucket2D<ushort, 64>::Data right;
+	Array1D<int>::Data histogram;
+	Array2D<float>::Data disparity;
+	Array2D<float>::Data confidence;
+
+	int max_disparity = 1024;
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			int count = left(y);
+			// Stride a warp of threads over the features
+			for (int f=thread.x; f<count; f+=stride.x) {
+				// For each feature or features near to focal point
+				short2 feature = left(y,f);
+				int distx = feature.x - focal_pt.x;
+
+				int best_disp = 0;
+				int max_v = 0;
+				
+				// - For each feature in right image that matches
+				const ushort *ptr = right.ptr(y, feature.y);
+				int count2 = right(y,feature.y);
+				for (int i=0; i<count2; ++i) {
+					//   - Add focal dist to feature X and add to histogram
+					int d = max(0,focal_pt.x - int(ptr[i]) + distx);
+					
+					int v = histogram(d);
+					if (v > max_v) {
+						max_v = v;
+						best_disp = d;
+					}
+				}
+
+				if (max_v > 0) {
+					//float conf = 1.0f - min(1.0f, float(abs(best_disp-focal_disp)) / 10.0f);
+					float conf = 1.0f - min(1.0f, float(abs(distx)) / 500.0f);
+					if (conf > confidence(y,feature.x)) {
+						disparity(y,feature.x) = float(best_disp);
+						confidence(y,feature.x) = conf;
+					}
+				}
+			}
+		}
+	}
+};
+
+#endif
diff --git a/lib/libstereo/src/filters/salient_gradient.hpp b/lib/libstereo/src/filters/salient_gradient.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..43454264f3bf455bee1913718b13343c9c6ffd8e
--- /dev/null
+++ b/lib/libstereo/src/filters/salient_gradient.hpp
@@ -0,0 +1,225 @@
+#ifndef _FTL_LIBSTEREO_FILTERS_SALIENT_GRADIENT_HPP_
+#define _FTL_LIBSTEREO_FILTERS_SALIENT_GRADIENT_HPP_
+
+#include "../util.hpp"
+#include "../array2d.hpp"
+#include "../bucket2d.hpp"
+#include "../bucket1d.hpp"
+
+/**
+ * Select salient gradient features and gather into scanline buckets that
+ * includes the x-coordinate and feature orientation. Not grouped by orientation.
+ * 
+ * TODO: This needs to be a focal point modified version. Radius from focal
+ * point determines threshold, but the radius itself is determined by feature
+ * density around the focal point ... ultimately keeping the number of features
+ * within a reasonable very small level per scanline (say 2-8).
+ */
+struct SalientGradient {
+	short2 focal_pt;
+	int radius;
+	Array2D<uchar>::Data image;
+	Array2D<uchar>::Data angle;
+	Array2D<uchar>::Data magnitude;
+	Bucket1D<short2, 64>::Data buckets;
+
+	int width, height;
+
+	__cuda__ inline float2 calculateGradient(int x, int y) {
+		if (x < 1 || y < 1 || x >= width-1 || y >= height-1) return make_float2(0,0);
+
+		float dx = -1.0f*float(image(y-1,x-1)) + -2.0f*float(image(y, x-1)) + -1*float(image(y+1, x-1)) +
+             float(image(y-1, x+1)) + 2.0f*float(image(y, x+1)) + float(image(y+1, x+1));
+        float dy = float(image(y-1, x-1)) + 2.0f*float(image(y-1, x)) + float(image(y-1, x+1)) +
+             -1.0f*float(image(y+1, x-1)) + -2.0f*float(image(y+1, x)) + -1.0f*float(image(y+1, x+1));
+
+        float g = sqrt( (dx*dx) + (dy*dy) );
+		float a = atan2(dy, dx);
+		return make_float2(g,a);
+	}
+
+	struct WarpSharedMemory {
+		int gradient_histogram[32];
+	};
+
+	inline __device__ int scan(volatile int *s_Data, int thread, int threshold) {
+		for (uint offset = 1; offset < 32; offset <<= 1) {
+			__syncwarp();
+			uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
+			__syncwarp();
+			s_Data[thread] = t;
+		}
+
+		uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold);
+		return __ffs(t);
+	}
+
+	__device__ inline float weighting(float r, float h) {
+		if (r >= h) return 0.0f;
+		float rh = r / h;
+		rh = 1.0f - rh*rh;
+		return rh*rh*rh*rh;
+	}
+
+	__device__ inline float calcWeight(int x, int y) {
+		float dx = focal_pt.x - x;
+		float dy = focal_pt.y - y;
+		float dist = sqrt(dx*dx + dy*dy);
+		float weight = weighting(dist, float(radius));
+		return weight;
+	}
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) {
+		static constexpr float PI = 3.14f;
+		static constexpr float PI2 = PI/2.0f;
+
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			// Reset histogram
+			//for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0;
+			wsm.gradient_histogram[thread.x] = 0;
+			//int maxmag = 0;
+
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				auto g = calculateGradient(x,y);
+				//float weight = calcWeight(x,y);
+				angle(y,x) = uchar(min(15,int((g.y+PI2) / PI * 16.0f)));
+				magnitude(y,x) = uchar(g.x);
+
+				//maxmag = max(maxmag,int(g.x));
+				atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1);
+				//atomicAdd(&wsm.gradient_histogram[0], 1);
+			}
+
+			//maxmag = int(float(warpMax(maxmag)) * 0.8f);
+
+			uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32));
+
+			// Apply threshold
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				float weight = calcWeight(x,y);
+				float thresh = gthresh; //max(gthresh, focal_thresh);
+				//for (int u=-3; u<=3; ++u) {
+				//	thresh = (x+u >= 0 && x+u < width) ? max(thresh, weight*float(magnitude(y,x+u))) : thresh;
+				//}
+
+				float m = float(magnitude(y,x)) * weight;
+				if (m < thresh) angle(y,x) = 0;
+				//output(y,x) = (m < thresh) ? 0 : 255;
+				if (m >= thresh) {
+					int a = angle(y,x);
+					buckets.add(y, make_short2(x, a));
+				}
+
+				angle(y,x) = uchar(thresh*weight);
+			}
+		}
+	}
+};
+
+/**
+ * Find salient gradient features and gather in orientation groups scanline
+ * buckets. Adds features to orientations either side of actual orientation for
+ * a degree of tolerance to exact orientation. This allows fast search for
+ * features based upon scanline and orientation.
+ */
+struct SalientGradientGrouped {
+	short2 focal_pt;
+	int radius;
+	Array2D<uchar>::Data image;
+	Array2D<uchar>::Data angle;
+	Array2D<uchar>::Data magnitude;
+	Bucket2D<ushort, 64>::Data buckets;
+
+	int width, height;
+
+	__cuda__ inline float2 calculateGradient(int x, int y) {
+		if (x < 1 || y < 1 || x >= width-1 || y >= height-1) return make_float2(0,0);
+
+		float dx = -1.0f*float(image(y-1,x-1)) + -2.0f*float(image(y, x-1)) + -1*float(image(y+1, x-1)) +
+             float(image(y-1, x+1)) + 2.0f*float(image(y, x+1)) + float(image(y+1, x+1));
+        float dy = float(image(y-1, x-1)) + 2.0f*float(image(y-1, x)) + float(image(y-1, x+1)) +
+             -1.0f*float(image(y+1, x-1)) + -2.0f*float(image(y+1, x)) + -1.0f*float(image(y+1, x+1));
+
+        float g = sqrt( (dx*dx) + (dy*dy) );
+		float a = atan2(dy, dx);
+		return make_float2(g,a);
+	}
+
+	struct WarpSharedMemory {
+		int gradient_histogram[32];
+	};
+
+	inline __device__ int scan(volatile int *s_Data, int thread, int threshold) {
+		for (uint offset = 1; offset < 32; offset <<= 1) {
+			__syncwarp();
+			uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
+			__syncwarp();
+			s_Data[thread] = t;
+		}
+
+		uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold);
+		return __ffs(t);
+	}
+
+	__device__ inline float weighting(float r, float h) {
+		if (r >= h) return 0.0f;
+		float rh = r / h;
+		rh = 1.0f - rh*rh;
+		return rh*rh*rh*rh;
+	}
+
+	__device__ inline float calcWeight(int x, int y) {
+		float dx = focal_pt.x - x;
+		float dy = focal_pt.y - y;
+		float dist = sqrt(dx*dx + dy*dy);
+		float weight = weighting(dist, float(radius));
+		return weight;
+	}
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) {
+		static constexpr float PI = 3.14f;
+		static constexpr float PI2 = PI/2.0f;
+
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			// Reset histogram
+			//for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0;
+			wsm.gradient_histogram[thread.x] = 0;
+			//int maxmag = 0;
+
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				auto g = calculateGradient(x,y);
+				angle(y,x) = uchar(min(15,int((g.y+PI2) / PI * 16.0f)));
+				magnitude(y,x) = uchar(g.x);
+
+				//maxmag = max(maxmag,int(g.x));
+				atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1);
+				//atomicAdd(&wsm.gradient_histogram[0], 1);
+			}
+
+			//maxmag = int(float(warpMax(maxmag)) * 0.8f);
+
+			uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32));
+
+			// Apply threshold
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				float weight = calcWeight(x,y);
+				float thresh = gthresh;
+				//for (int u=-3; u<=3; ++u) {
+				//	thresh = (x+u >= 0 && x+u < width) ? max(thresh, weight*float(magnitude(y,x+u))) : thresh;
+				//}
+
+				float m = float(magnitude(y,x))*weight;
+				//if (m < thresh) angle(y,x) = 0;
+				//output(y,x) = (m < thresh) ? 0 : 255;
+				if (m >= thresh) {
+					int a = angle(y,x);
+					buckets.add(y, a, ushort(x));
+					buckets.add(y, (a > 0) ? a-1 : 15, ushort(x));
+					buckets.add(y, (a < 15) ? a+1 : 0, ushort(x));
+				}
+			}
+		}
+	}
+};
+
+#endif
diff --git a/lib/libstereo/src/util.hpp b/lib/libstereo/src/util.hpp
index 4c50f082b7f44dffd85634e8e6b1101193542fca..92d9c3ee97ba3a714b2eb2adb42d83e5ffc3d6c8 100644
--- a/lib/libstereo/src/util.hpp
+++ b/lib/libstereo/src/util.hpp
@@ -21,6 +21,15 @@ __device__ inline T warpMin(T e) {
 	return e;
 }
 
+template <typename T>
+__device__ inline T warpMax(T e) {
+	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e = max(e, other);
+	}
+	return e;
+}
+
 #ifdef USE_GPU
 
 template <typename FUNCTOR>
@@ -30,6 +39,16 @@ __global__ void kernel2D(FUNCTOR f, ushort2 size) {
 	f(thread, stride, size);
 }
 
+template <typename FUNCTOR, int WARPS>
+__global__ void kernel2DWarpSM(FUNCTOR f, ushort2 size) {
+	const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)};
+	const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)};
+
+	__shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS];
+
+	f(thread, stride, size, sm[threadIdx.y]);
+}
+
 template <typename FUNCTOR>
 __global__ void kernel1D(FUNCTOR f, ushort size) {
 	const ushort thread = threadIdx.x+blockIdx.x*blockDim.x;
@@ -99,6 +118,27 @@ void parallel1DWarp(const FUNCTOR &f, int size, int size2) {
 #endif
 }
 
+template <typename FUNCTOR>
+void parallel1DWarpSM(const FUNCTOR &f, int size, int size2) {
+#if __cplusplus >= 201703L
+	//static_assert(std::is_invocable<FUNCTOR,int,int,int>::value, "Parallel1D requires a valid functor: void()(int,int,int)");
+	// SM functors take an extra WarpSharedMemory& argument (see kernel2DWarpSM), so it must be part of the invocability check.
+	static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2,typename FUNCTOR::WarpSharedMemory&>::value, "Parallel1DWarpSM requires a valid functor: void()(ushort2,ushort2,ushort2,WarpSharedMemory&)");
+#endif
+
+#ifdef USE_GPU
+	cudaSafeCall(cudaGetLastError());
+	const dim3 gridSize(1, (size+8-1) / 8);
+	const dim3 blockSize(32, 8);
+	ushort2 tsize{ushort(size2), ushort(size)};
+	kernel2DWarpSM<FUNCTOR, 8><<<gridSize, blockSize, 0, 0>>>(f,tsize);
+	cudaSafeCall(cudaGetLastError());
+	//cudaSafeCall(cudaDeviceSynchronize());
+#else
+
+#endif
+}
+
 /**
  * Wrapper to initiate a parallel processing job using a given functor. The
  * width and height parameters are used to determine the number of threads.