diff --git a/components/rgbd-sources/include/ftl/offilter.hpp b/components/rgbd-sources/include/ftl/offilter.hpp
index c63b4ca5486d7b6698d6a9933d762093999d344a..194ee20fb39bf759afc1ba2804d16c074c4fd32d 100644
--- a/components/rgbd-sources/include/ftl/offilter.hpp
+++ b/components/rgbd-sources/include/ftl/offilter.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ftl/config.h>
+#include <ftl/rgbd/frame.hpp>
 
 #ifdef HAVE_OPTFLOW
 #include <opencv2/core.hpp>
@@ -14,23 +15,14 @@ class OFDisparityFilter {
 public:
 	OFDisparityFilter() : n_max_(0), threshold_(0.0), size_(0, 0) {} // TODO: invalid state
 	OFDisparityFilter(cv::Size size, int n_frames, float threshold);
-	void filter(cv::Mat &disp, const cv::Mat &rgb);
-	void filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow);
+	void filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream);
 
 private:
 	int n_;
 	int n_max_;
 	float threshold_;
-	cv::Size size_;
 
-	cv::Mat disp_;
 	cv::cuda::GpuMat disp_old_;
-	cv::Mat gray_;
-
-	cv::Mat flowxy_;
-	cv::Mat flowxy_up_;
-
-	cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof_;
 };
 
 }
diff --git a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
index 782338fc2be41d56a4d6fcd4f4920f6d920e663f..de0ae29326cb8bd1d0e738e049abf40389b52040 100644
--- a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
+++ b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
@@ -131,11 +131,7 @@ void FixstarsSGM::compute(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 	dispt_scaled.convertTo(disp, CV_32F, 1.0f / 16.0f, stream);
 
 #ifdef HAVE_OPTFLOW
-	if (use_off_)
-	{
-		frame.getChannel<Mat>(ftl::rgbd::kChanDisparity);
-		off_.filter(frame.setChannel<Mat>(ftl::rgbd::kChanDisparity), Mat(lbw_));
-	}
+	if (use_off_) { off_.filter(frame, stream); }
 #endif
 }
 
diff --git a/components/rgbd-sources/src/algorithms/offilter.cu b/components/rgbd-sources/src/algorithms/offilter.cu
index 80e48663beb626aeaea8af4d0cabe2f8c9926bf0..546f94c1102c4bd6adb6a7d9e0ab96fb46787e26 100644
--- a/components/rgbd-sources/src/algorithms/offilter.cu
+++ b/components/rgbd-sources/src/algorithms/offilter.cu
@@ -11,39 +11,48 @@ __device__ void quicksort(float A[], size_t n)
 	QSORT(n, LESS, SWAP);
 }
 
-static const int max_history = 32;
-
 template<typename T>
 __device__  static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
 
+static const int max_history = 32; // TODO dynamic shared memory
+
 __global__ void temporal_median_filter_kernel(
 	cv::cuda::PtrStepSz<float> disp,
-	cv::cuda::PtrStepSz<int> optflow,
+	cv::cuda::PtrStepSz<int16_t> optflow,
 	cv::cuda::PtrStepSz<float> history,
-	int n_max, float threshold)
+	int n_max,
+	int16_t threshold // fixed point 10.5
+	uint granularity  // 4 for Turing
+)
 {
-	float sorted[max_history];
+	float sorted[max_history]; // TODO: dynamic shared memory
 	for (STRIDE_Y(y, disp.rows)) {
 	for (STRIDE_X(x, disp.cols)) {
 
 		int flowx = optflow(y, 2 * x);
 		int flowy = optflow(y, 2 * x + 1);
 
-		if ((abs(flowx) + abs(flowy)) > 32) // TODO float <-> fixed 10.5
+		int flowx = optflow(round(y / granularity), 2 * round(x / granularity));
+		int flowy = optflow(round(y/ granularity), 2 * round(x / granularity) + 1);
+
+		if ((abs(flowx) + abs(flowy)) > threshold)
 		{
+			// last element in history[x][y][t]
 			history(y, (x + 1) * n_max - 1) = 0.0;
 			return;
 		}
 
-		int n = history(y, (x + 1) * n_max - 1);
+		int count = history(y, (x + 1) * n_max - 1);
+		int n = count % (n_max - 1);
 
 		if (isValidDisparity(disp(y, x)))
 		{
 			history(y, (x + 1) * n_max - 1) += 1.0;
-			history(y, x * n_max + n % (n_max - 1)) = disp(y, x);
+			count++;
+			history(y, x * n_max + n) = disp(y, x);
 		}
 
-		int n_end = n;
+		int n_end = count;
 		if (n_end >= n_max)	{ n_end = n_max - 1; }
 
 		if (n_end != 0)
@@ -70,8 +79,13 @@ void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
 	dim3 threads(128, 1, 1);
 	grid.x = cv::cuda::device::divUp(disp.cols, 128);
 	grid.y = cv::cuda::device::divUp(disp.rows, 1);
+
+	// TODO: dynamic shared memory
 	temporal_median_filter_kernel<<<grid, threads, 0, cv::cuda::StreamAccessor::getStream(stream)>>>
-		(disp, optflow, history, n, threshold);
+		(	disp, optflow, history, n,
+			round(threshold * (1 << 5)) // TODO: documentation; 10.5 format
+			4,							// TODO: (4 pixels granularity for Turing)
+		);
 
 	cudaSafeCall(cudaGetLastError());
 }
diff --git a/components/rgbd-sources/src/offilter.cpp b/components/rgbd-sources/src/offilter.cpp
index bf1a5414dd5973452332e558ae375011cacbc56d..2fb6d162317d0d5563a8307061556e8e685562dc 100644
--- a/components/rgbd-sources/src/offilter.cpp
+++ b/components/rgbd-sources/src/offilter.cpp
@@ -1,7 +1,6 @@
 #include "ftl/offilter.hpp"
 #include "cuda_algorithms.hpp"
 
-
 #ifdef HAVE_OPTFLOW
 
 #include <loguru.hpp>
@@ -16,42 +15,25 @@ using std::vector;
 template<typename T> static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
 
 OFDisparityFilter::OFDisparityFilter(Size size, int n_frames, float threshold) :
-	n_(0), n_max_(n_frames), threshold_(threshold), size_(size)
+	n_(0), n_max_(n_frames + 1), threshold_(threshold)
 {
-	disp_ = Mat::zeros(cv::Size(size.width * n_frames, size.height), CV_64FC1);
-	disp_old_ = cv::cuda::GpuMat(cv::Size(size.width * n_frames, size.height), CV_32FC1);
-	gray_ = Mat::zeros(size, CV_8UC1);
+	CHECK((n_max_ > 1) && (n_max_ >= 32)) << "History length must be between 0 and 31!";
+	disp_old_ = cv::cuda::GpuMat(cv::Size(size.width * n_max_, size.height), CV_32FC1);
 	
-	nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
+	/*nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
 													cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
-													true, false, false, 0);
+													true, false, false, 0);*/
 	
 }
 
-void OFDisparityFilter::filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow)
+void OFDisparityFilter::filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 {
-	ftl::cuda::optflow_filter(disp, optflow, disp_old_, n_max_, threshold_, cv::cuda::Stream::Null());
-	cv::cuda::Stream::Null().waitForCompletion();
-}
+	const cv::cuda::GpuMat &optflow = frame.getChannel<cv::cuda::GpuMat>(kChanFlow, stream);
+	frame.getChannel<cv::cuda::GpuMat>(kChanDisparity, stream);
+	stream.waitForCompletion();
+	cv::cuda::GpuMat &disp = frame.setChannel<cv::cuda::GpuMat>(kChanDisparity);
 
-void OFDisparityFilter::filter(Mat &disp, const Mat &gray)
-{
-	const int n = n_;
-	n_ = (n_ + 1) % n_max_;
-	nvof_->calc(gray, gray_, flowxy_);
-	nvof_->upSampler(	flowxy_, size_.width, size_.height,
-						nvof_->getGridSize(), flowxy_up_);
-
-	CHECK(disp.type() == CV_32FC1);
-	CHECK(gray.type() == CV_8UC1);
-	CHECK(flowxy_up_.type() == CV_32FC2);
-
-	gray.copyTo(gray_);
-
-	using cv::cuda::GpuMat;
-	GpuMat disp_gpu(disp);
-	filter(disp_gpu, GpuMat(flowxy_up_));
-	disp_gpu.download(disp);
+	ftl::cuda::optflow_filter(disp, optflow, disp_old_, n_max_, threshold_, stream);
 }
 
 #endif  // HAVE_OPTFLOW