diff --git a/lib/libstereo/src/algorithms/clustersf.cu b/lib/libstereo/src/algorithms/clustersf.cu
index cacbe1e40cbb7da52d90590cc249aef191971adb..273025d75485fdf9a535445762633de9e53c35b6 100644
--- a/lib/libstereo/src/algorithms/clustersf.cu
+++ b/lib/libstereo/src/algorithms/clustersf.cu
@@ -11,12 +11,13 @@ struct StereoCSF::Impl {
 	Array2D<uchar> gl;
 	Array2D<uchar> gr;
 	Array2D<uchar> temp;
-	Bucket2D<ushort, 64> buckets_l;
+	Bucket1D<short2, 64> buckets_l;
+	Bucket2D<ushort, 64> buckets_r;
 
 	Impl(int width, int height) :
 		l(width, height), r(width, height),
 		gl(width, height), gr(width, height), temp(width, height),
-		buckets_l(16, height) {}
+		buckets_l(height), buckets_r(16, height) {}
 };
 
 StereoCSF::StereoCSF() : impl_(nullptr) {
@@ -37,15 +38,15 @@ void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disp
 
 	SalientGradient sgl = {impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->buckets_l.data(), impl_->l.width, impl_->l.height};
 	parallel1DWarpSM(sgl, l.rows(), l.cols());
-	//SalientGradient sgr = {impl_->r.data(), impl_->gr.data(), impl_->temp.data(), impl_->r.width, impl_->r.height};
-	//parallel1DWarpSM(sgr, r.rows(), r.cols());
+	SalientGradientGrouped sgr = {impl_->r.data(), impl_->gr.data(), impl_->temp.data(), impl_->buckets_r.data(), impl_->r.width, impl_->r.height};
+	parallel1DWarpSM(sgr, r.rows(), r.cols());
 
 	cv::Mat tmp;
-	impl_->buckets_l.toGpuMat().download(tmp);
+	impl_->buckets_r.toGpuMat().download(tmp);
 	tmp.convertTo(tmp, CV_8UC1, 4.0);
 	cv::resize(tmp,tmp, cv::Size(tmp.cols*40, tmp.rows/2));
 	cv::applyColorMap(tmp, tmp, cv::COLORMAP_TURBO);
-	cv::imshow("Gradients Left", tmp);
+	cv::imshow("Gradients Right", tmp);
 
 	//impl_->gr.toGpuMat().download(tmp);
 	//cv::resize(tmp,tmp, cv::Size(tmp.cols/2, tmp.rows/2));
@@ -53,6 +54,7 @@ void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disp
 
 	cudaSafeCall(cudaDeviceSynchronize());
 	disparity.create(l.rows(), l.cols(), CV_32F);
+	cv::waitKey(10000);
 }
 
 StereoCSF::~StereoCSF() {
diff --git a/lib/libstereo/src/bucket1d.hpp b/lib/libstereo/src/bucket1d.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..77f8ff9621112c0adbfb19eb1fbb093ac4e768c8
--- /dev/null
+++ b/lib/libstereo/src/bucket1d.hpp
@@ -0,0 +1,130 @@
+#ifndef _FTL_LIBSTEREO_BUCKET1D_HPP_
+#define _FTL_LIBSTEREO_BUCKET1D_HPP_
+
+#include "memory.hpp"
+
+template<typename T, int DEPTH>
+class Bucket1D {
+public:
+	Bucket1D() : width(0), needs_free_(false) {
+		data_.buckets = nullptr;
+		data_.counters = nullptr;
+	}
+
+	Bucket1D(int w) : width(w), needs_free_(true) {
+		data_.counters = allocateMemory<uint>(w);
+		data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch);
+	}
+
+	~Bucket1D() {
+		free();
+	}
+
+	void free() {
+		if (needs_free_ && data_.buckets) {
+			freeMemory(data_.counters);
+			freeMemory(data_.buckets);
+		}
+	}
+
+	Bucket1D<T, DEPTH> &operator=(const Bucket1D<T, DEPTH> &c) {
+		data_ = c.data_;
+		width = c.width;
+		needs_free_ = false;
+		return *this;
+	}
+
+	struct Data {
+		__host__ __device__ inline uint& operator() (const int y) {
+			return counters[y];
+		}
+
+		__host__ __device__ inline const uint& operator() (const int y) const {
+			return counters[y];
+		}
+
+		__host__ __device__ inline const T& operator() (const int y, int d) const {
+			return buckets[d + y*bucket_pitch];
+		}
+
+		__host__ __device__ inline void add (const int y, T v) {
+			uint ix = atomicInc(&counters[y], DEPTH);
+			buckets[ix + y*bucket_pitch] = v;
+		}
+
+		__host__ __device__ inline const T *ptr(const int y) const { return &buckets[y*bucket_pitch]; }
+
+		uint bucket_pitch;
+		T *buckets;
+		uint *counters;
+	};
+
+	void create(int w) {
+		if (w == width) return;
+		width = w;
+		free();
+		needs_free_ = true;
+		data_.counters = allocateMemory<uint>(w);
+		data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch);
+	}
+
+	inline Data &data() { return data_; }
+	inline const Data &data() const { return data_; }
+
+	void toMat(cv::Mat &m) {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		gm.download(m);
+		#else
+		m = cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	cv::Mat toMat() {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm;
+		toGpuMat(gm);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	const cv::Mat toMat() const {
+		#ifdef USE_GPU
+		cv::cuda::GpuMat gm(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		cv::Mat m;
+		gm.download(m);
+		return m;
+		#else
+		return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters);
+		#endif
+	}
+
+	void toGpuMat(cv::cuda::GpuMat &m) {
+		#ifdef USE_GPU
+		m = cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		#else
+		// TODO
+		#endif
+	}
+
+	cv::cuda::GpuMat toGpuMat() {
+		#ifdef USE_GPU
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters);
+		#else
+		return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value);
+		#endif
+	}
+
+	int width;
+
+private:
+	Data data_;
+	bool needs_free_;
+};
+
+#endif
diff --git a/lib/libstereo/src/filters/focal_cluster.hpp b/lib/libstereo/src/filters/focal_cluster.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6aec3128a2f8db6ef94158fd9bfbba9219d5ad06
--- /dev/null
+++ b/lib/libstereo/src/filters/focal_cluster.hpp
@@ -0,0 +1,23 @@
+#ifndef _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_
+#define _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_
+
+struct FocalCluster {
+	short2 focal_pt;
+	Bucket2D<ushort, 64>::Data left;
+	Bucket2D<ushort, 64>::Data right;
+	Array2D<float>::Data histogram;
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			for (int f=thread.x; f<)
+			// For each feature or features near to focal point
+
+			
+			// - Calc distance to focal in X
+			// - For each feature in right image that matches
+			//   - Add focal dist to feature X and add to histogram
+		}
+	}
+};
+
+#endif
diff --git a/lib/libstereo/src/filters/salient_gradient.hpp b/lib/libstereo/src/filters/salient_gradient.hpp
index 872e8c9a658b4511348952e4028a857180aff3e6..c473de6217a7f3e601b9734b9173fd5f4f2c401e 100644
--- a/lib/libstereo/src/filters/salient_gradient.hpp
+++ b/lib/libstereo/src/filters/salient_gradient.hpp
@@ -4,8 +4,99 @@
 #include "../util.hpp"
 #include "../array2d.hpp"
 #include "../bucket2d.hpp"
+#include "../bucket1d.hpp"
 
+/**
+ * Select salient gradient features and gather into scanline buckets that
+ * includes the x-coordinate and feature orientation. Not grouped by orientation.
+ */
 struct SalientGradient {
+	Array2D<uchar>::Data image;
+	Array2D<uchar>::Data angle;
+	Array2D<uchar>::Data magnitude;
+	Bucket1D<short2, 64>::Data buckets;
+
+	int width, height;
+
+	__cuda__ inline float2 calculateGradient(int x, int y) {
+		if (x < 1 || y < 1 || x >= width-1 || y >= height-1) return make_float2(0,0);
+
+		float dx = -1.0f*float(image(y-1,x-1)) + -2.0f*float(image(y, x-1)) + -1*float(image(y+1, x-1)) +
+             float(image(y-1, x+1)) + 2.0f*float(image(y, x+1)) + float(image(y+1, x+1));
+        float dy = float(image(y-1, x-1)) + 2.0f*float(image(y-1, x)) + float(image(y-1, x+1)) +
+             -1.0f*float(image(y+1, x-1)) + -2.0f*float(image(y+1, x)) + -1.0f*float(image(y+1, x+1));
+
+        float g = sqrt( (dx*dx) + (dy*dy) );
+		float a = atan2(dy, dx);
+		return make_float2(g,a);
+	}
+
+	struct WarpSharedMemory {
+		int gradient_histogram[32];
+	};
+
+	inline __device__ int scan(volatile int *s_Data, int thread, int threshold) {
+		for (uint offset = 1; offset < 32; offset <<= 1) {
+			__syncwarp();
+			uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
+			__syncwarp();
+			s_Data[thread] = t;
+		}
+
+		uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold);
+		return __ffs(t);
+	}
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) {
+		static const float PI = 3.14f;
+		static constexpr float PI2 = PI/2.0f;
+
+		for (int y=thread.y; y<size.y; y+=stride.y) {
+			// Reset histogram
+			//for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0;
+			wsm.gradient_histogram[thread.x] = 0;
+			//int maxmag = 0;
+
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				auto g = calculateGradient(x,y);
+				angle(y,x) = uchar(min(15,int((g.y+PI2) / PI * 16.0f)));
+				magnitude(y,x) = uchar(g.x);
+
+				//maxmag = max(maxmag,int(g.x));
+				atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1);
+				//atomicAdd(&wsm.gradient_histogram[0], 1);
+			}
+
+			//maxmag = int(float(warpMax(maxmag)) * 0.8f);
+
+			uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32));
+
+			// Apply threshold
+			for (int x=thread.x; x<size.x; x+=stride.x) {
+				uchar thresh = gthresh;
+				for (int u=-3; u<=3; ++u) {
+					thresh = (x+u >= 0 && x+u < width) ? max(thresh, magnitude(y,x+u)) : thresh;
+				}
+
+				uchar m = magnitude(y,x);
+				//if (m < thresh) angle(y,x) = 0;
+				//output(y,x) = (m < thresh) ? 0 : 255;
+				if (m >= thresh) {
+					int a = angle(y,x);
+					buckets.add(y, make_short2(x, a));
+				}
+			}
+		}
+	}
+};
+
+/**
+ * Find salient gradient features and gather in orientation groups scanline
+ * buckets. Adds features to orientations either side of actual orientation for
+ * a degree of tolerance to exact orientation. This allows fast search for
+ * features based upon scanline and orientation.
+ */
+struct SalientGradientGrouped {
 	Array2D<uchar>::Data image;
 	Array2D<uchar>::Data angle;
 	Array2D<uchar>::Data magnitude;
@@ -83,8 +174,6 @@ struct SalientGradient {
 					buckets.add(y, (a < 15) ? a+1 : 0, ushort(x));
 				}
 			}
-
-			// Next step would be to bucket the results
 		}
 	}
 };