diff --git a/lib/libstereo/src/algorithms/clustersf.cu b/lib/libstereo/src/algorithms/clustersf.cu
index 454a42b3cb3db9b5f579160b3255c2b638dee260..db488112343fb0c76b0d15af0487b266b1340586 100644
--- a/lib/libstereo/src/algorithms/clustersf.cu
+++ b/lib/libstereo/src/algorithms/clustersf.cu
@@ -33,10 +33,12 @@ void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disp
 	mat2gray(r, impl_->r);
 
 	SalientGradient sg = {impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->l.width, impl_->l.height};
-	parallel1DWarp(sg, l.rows(), l.cols());
+	parallel1DWarpSM(sg, l.rows(), l.cols());
 
 	cv::Mat tmp;
 	impl_->gl.toGpuMat().download(tmp);
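+	// Show the gradient debug image at half resolution.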
+	cv::resize(tmp, tmp, cv::Size(tmp.cols/2, tmp.rows/2));
 	cv::imshow("Gradients", tmp);
 
 	cudaSafeCall(cudaDeviceSynchronize());
diff --git a/lib/libstereo/src/filters/salient_gradient.hpp b/lib/libstereo/src/filters/salient_gradient.hpp
index fd521d1742d886c2a26ce7552dd9473ea72e8d2d..093319518e75d51d5b220eb48709815be167199d 100644
--- a/lib/libstereo/src/filters/salient_gradient.hpp
+++ b/lib/libstereo/src/filters/salient_gradient.hpp
@@ -24,28 +24,61 @@ struct SalientGradient {
 		return make_float2(g,a);
 	}
 
-	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {
+	struct WarpSharedMemory {
+		int gradient_histogram[32];
+	};
+
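+	// Warp-level inclusive scan (Hillis-Steele) over the 32 histogram bins,
+	// followed by a ballot to locate the first bin whose cumulative count
+	// exceeds the threshold. Returns that bin's 1-based index (0 if none).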
+	inline __device__ int scan(volatile int *s_Data, int thread, int threshold) {
+		for (uint offset = 1; offset < 32; offset <<= 1) {
+			__syncwarp();
+			uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
+			__syncwarp();
+			s_Data[thread] = t;
+		}
+
+		uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold);
+		return __ffs(t);
+	}
+
+	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) {
 		static const float PI = 3.14f;
 		static constexpr float PI2 = PI/2.0f;
 
 		for (int y=thread.y; y<size.y; y+=stride.y) {
+			// Reset the shared histogram: one bin per lane (assumes blockDim.x == 32).
+			wsm.gradient_histogram[thread.x] = 0;
+			__syncwarp();
+
 			for (int x=thread.x; x<size.x; x+=stride.x) {
 				auto g = calculateGradient(x,y);
-				output(y,x) = uchar((g.y+PI2) / PI * 255.0f);
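+				// Quantise gradient angle into 64 bins (0..63).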
+				output(y,x) = uchar((g.y+PI2) / PI * 63.0f);
 				magnitude(y,x) = uchar(g.x);
+
+				// Histogram the gradient magnitude into 32 bins
+				// (361 ~ 255*sqrt(2), the assumed maximum magnitude).
+				atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1);
 			}
 
 			__syncwarp();
 
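+			// Per-row global threshold: the magnitude below which ~95% of this
+			// row's pixels fall, mapped from bin index to 0..255 (256/32 per bin).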
+			uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, int(float(width)*0.95f)) * (256/32));
+
 			// Apply threshold
 			for (int x=thread.x; x<size.x; x+=stride.x) {
-				uchar thresh = 0;
-				for (int u=-15; u<=15; ++u) {
+				uchar thresh = gthresh;
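+				// Raise the threshold near stronger neighbours (±5 px, with a
+				// distance falloff) so only locally salient gradients survive.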
+				for (int u=-5; u<=5; ++u) {
 					thresh = (x+u >= 0 && x+u < width) ? max(thresh, magnitude(y,x+u)-abs(u)) : thresh;
 				}
 
 				uchar m = magnitude(y,x);
-				if (m < thresh) output(y,x) = 0;
+				output(y,x) = (m < thresh) ? 0 : 255;
 			}
 
 			// Next step would be to bucket the results
diff --git a/lib/libstereo/src/util.hpp b/lib/libstereo/src/util.hpp
index 4c50f082b7f44dffd85634e8e6b1101193542fca..92d9c3ee97ba3a714b2eb2adb42d83e5ffc3d6c8 100644
--- a/lib/libstereo/src/util.hpp
+++ b/lib/libstereo/src/util.hpp
@@ -21,6 +21,19 @@ __device__ inline T warpMin(T e) {
 	return e;
 }
 
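+/**
+ * Warp-wide max reduction. A butterfly exchange via __shfl_xor_sync leaves
+ * every lane holding the maximum value from the whole warp.
+ */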
+template <typename T>
+__device__ inline T warpMax(T e) {
+	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
+		e = max(e, other);
+	}
+	return e;
+}
+
 #ifdef USE_GPU
 
 template <typename FUNCTOR>
@@ -30,6 +43,18 @@ __global__ void kernel2D(FUNCTOR f, ushort2 size) {
 	f(thread, stride, size);
 }
 
+template <typename FUNCTOR, int WARPS>
+__global__ void kernel2DWarpSM(FUNCTOR f, ushort2 size) {
+	const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)};
+	const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)};
+
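+	// One WarpSharedMemory instance per warp: assumes blockDim.x == 32, so
+	// threadIdx.y identifies the warp and must be < WARPS.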
+	__shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS];
+
+	f(thread, stride, size, sm[threadIdx.y]);
+}
+
 template <typename FUNCTOR>
 __global__ void kernel1D(FUNCTOR f, ushort size) {
 	const ushort thread = threadIdx.x+blockIdx.x*blockDim.x;
@@ -99,6 +124,31 @@ void parallel1DWarp(const FUNCTOR &f, int size, int size2) {
 #endif
 }
 
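+/**
+ * As parallel1DWarp, but each warp additionally receives a
+ * FUNCTOR::WarpSharedMemory instance in shared memory (e.g. the per-row
+ * gradient histogram in SalientGradient).
+ */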
+template <typename FUNCTOR>
+void parallel1DWarpSM(const FUNCTOR &f, int size, int size2) {
+#if __cplusplus >= 201703L
+	static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2,typename FUNCTOR::WarpSharedMemory&>::value,
+		"parallel1DWarpSM requires a valid functor: void()(ushort2,ushort2,ushort2,WarpSharedMemory&)");
+#endif
+
+#ifdef USE_GPU
+	cudaSafeCall(cudaGetLastError());
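+	// One 32-lane warp per image row, 8 warps (rows) per block; the grid spans all rows.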
+	const dim3 gridSize(1, (size+8-1) / 8);
+	const dim3 blockSize(32, 8);
+	ushort2 tsize{ushort(size2), ushort(size)};
+	kernel2DWarpSM<FUNCTOR, 8><<<gridSize, blockSize, 0, 0>>>(f,tsize);
+	cudaSafeCall(cudaGetLastError());
+	//cudaSafeCall(cudaDeviceSynchronize());
+#else
+	// CPU fallback not implemented; this helper currently requires USE_GPU.
+#endif
+}
+
 /**
  * Wrapper to initiate a parallel processing job using a given functor. The
  * width and height parameters are used to determine the number of threads.