diff --git a/components/rgbd-sources/CMakeLists.txt b/components/rgbd-sources/CMakeLists.txt
index ff7d290981c1a84c5ee211219ae0651f85c58c77..3ac2e2de5f173a12bd7107d8d40b5d2eca3b4b5d 100644
--- a/components/rgbd-sources/CMakeLists.txt
+++ b/components/rgbd-sources/CMakeLists.txt
@@ -37,6 +37,7 @@ endif (LIBSGM_FOUND)
 if (CUDA_FOUND)
 	list(APPEND RGBDSRC
 		src/algorithms/disp2depth.cu
+		src/algorithms/offilter.cu
 #		"src/algorithms/opencv_cuda_bm.cpp"
 #		"src/algorithms/opencv_cuda_bp.cpp"
 #		"src/algorithms/rtcensus.cu"
diff --git a/components/rgbd-sources/include/ftl/offilter.hpp b/components/rgbd-sources/include/ftl/offilter.hpp
index 4c4fbbb9837ef39e80f9aad68c62ae5d82d4671e..c5e7a9cf7b02d11906043f8652ce615fe1ab5f52 100644
--- a/components/rgbd-sources/include/ftl/offilter.hpp
+++ b/components/rgbd-sources/include/ftl/offilter.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ftl/config.h>
+#include <ftl/rgbd/frame.hpp>
 
 #ifdef HAVE_OPTFLOW
 #include <opencv2/core.hpp>
@@ -12,23 +13,15 @@ namespace rgbd {
 
 class OFDisparityFilter {
 public:
-	OFDisparityFilter() : n_max_(0), threshold_(0.0), size_(0, 0) {} // TODO: invalid state
+	OFDisparityFilter() : n_max_(0), threshold_(0.0) {}
 	OFDisparityFilter(cv::Size size, int n_frames, float threshold);
-	void filter(cv::Mat &disp, const cv::Mat &rgb);
+	void filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream);
 
 private:
-	int n_;
 	int n_max_;
 	float threshold_;
-	cv::Size size_;
 
-	cv::Mat disp_;
-	cv::Mat gray_;
-
-	cv::Mat flowxy_;
-	cv::Mat flowxy_up_;
-
-	cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof_;
+	cv::cuda::GpuMat disp_old_;
 };
 
 }
diff --git a/components/rgbd-sources/include/qsort.h b/components/rgbd-sources/include/qsort.h
new file mode 100644
index 0000000000000000000000000000000000000000..62a76b836c1b13861fc8a5a12ff0fc0eb7936f8a
--- /dev/null
+++ b/components/rgbd-sources/include/qsort.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2013, 2017 Alexey Tourbin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This is a traditional Quicksort implementation which mostly follows
+ * [Sedgewick 1978].  Sorting is performed entirely on array indices,
+ * while actual access to the array elements is abstracted out with the
+ * user-defined `LESS` and `SWAP` primitives.
+ *
+ * Synopsis:
+ *	QSORT(N, LESS, SWAP);
+ * where
+ *	N - the number of elements in A[];
+ *	LESS(i, j) - compares A[i] to A[j];
+ *	SWAP(i, j) - exchanges A[i] with A[j].
+ */
+
+#ifndef QSORT_H
+#define QSORT_H
+
+/* Sort 3 elements. */
+#define Q_SORT3(q_a1, q_a2, q_a3, Q_LESS, Q_SWAP) \
+do {					\
+    if (Q_LESS(q_a2, q_a1)) {		\
+	if (Q_LESS(q_a3, q_a2))		\
+	    Q_SWAP(q_a1, q_a3);		\
+	else {				\
+	    Q_SWAP(q_a1, q_a2);		\
+	    if (Q_LESS(q_a3, q_a2))	\
+		Q_SWAP(q_a2, q_a3);	\
+	}				\
+    }					\
+    else if (Q_LESS(q_a3, q_a2)) {	\
+	Q_SWAP(q_a2, q_a3);		\
+	if (Q_LESS(q_a2, q_a1))		\
+	    Q_SWAP(q_a1, q_a2);		\
+    }					\
+} while (0)
+
+/* Partition [q_l,q_r] around a pivot.  After partitioning,
+ * [q_l,q_j] are the elements that are less than or equal to the pivot,
+ * while [q_i,q_r] are the elements greater than or equal to the pivot. */
+#define Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP)		\
+do {									\
+    /* The middle element, not to be confused with the median. */	\
+    Q_UINT q_m = q_l + ((q_r - q_l) >> 1);				\
+    /* Reorder the second, the middle, and the last items.		\
+     * As [Edelkamp Weiss 2016] explain, using the second element	\
+     * instead of the first one helps avoid bad behaviour for		\
+     * decreasingly sorted arrays.  This method is used in recent	\
+     * versions of gcc's std::sort, see gcc bug 58437#c13, although	\
+     * the details are somewhat different (cf. #c14). */		\
+    Q_SORT3(q_l + 1, q_m, q_r, Q_LESS, Q_SWAP);				\
+    /* Place the median at the beginning. */				\
+    Q_SWAP(q_l, q_m);							\
+    /* Partition [q_l+2, q_r-1] around the median which is in q_l.	\
+     * q_i and q_j are initially off by one, they get decremented	\
+     * in the do-while loops. */					\
+    q_i = q_l + 1; q_j = q_r;						\
+    while (1) {								\
+	do q_i++; while (Q_LESS(q_i, q_l));				\
+	do q_j--; while (Q_LESS(q_l, q_j));				\
+	if (q_i >= q_j) break; /* Sedgewick says "until j < i" */	\
+	Q_SWAP(q_i, q_j);						\
+    }									\
+    /* Compensate for the i==j case. */					\
+    q_i = q_j + 1;							\
+    /* Put the median to its final place. */				\
+    Q_SWAP(q_l, q_j);							\
+    /* The median is not part of the left subfile. */			\
+    q_j--;								\
+} while (0)
+
+/* Insertion sort is applied to small subfiles - this is contrary to
+ * Sedgewick's suggestion to run a separate insertion sort pass after
+ * the partitioning is done.  The reason I don't like a separate pass
+ * is that it triggers extra comparisons, because it can't see that the
+ * medians are already in their final positions and need not be rechecked.
+ * Since I do not assume that comparisons are cheap, I also do not try
+ * to eliminate the (q_j > q_l) boundary check. */
+#define Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP)		\
+do {									\
+    Q_UINT q_i, q_j;							\
+    /* For each item starting with the second... */			\
+    for (q_i = q_l + 1; q_i <= q_r; q_i++)				\
+    /* move it down the array so that the first part is sorted. */	\
+    for (q_j = q_i; q_j > q_l && (Q_LESS(q_j, q_j - 1)); q_j--)		\
+	Q_SWAP(q_j, q_j - 1);						\
+} while (0)
+
+/* When the size of [q_l,q_r], i.e. q_r-q_l+1, is greater than or equal to
+ * Q_THRESH, the algorithm performs recursive partitioning.  When the size
+ * drops below Q_THRESH, the algorithm switches to insertion sort.
+ * The minimum valid value is probably 5 (with 5 items, the second and
+ * the middle items, the middle itself being rounded down, are distinct). */
+#define Q_THRESH 16
+
+/* The main loop. */
+#define Q_LOOP(Q_UINT, Q_N, Q_LESS, Q_SWAP)				\
+do {									\
+    Q_UINT q_l = 0;							\
+    Q_UINT q_r = (Q_N) - 1;						\
+    Q_UINT q_sp = 0; /* the number of frames pushed to the stack */	\
+    struct { Q_UINT q_l, q_r; }						\
+	/* On 32-bit platforms, to sort a "char[3GB+]" array,		\
+	 * it may take full 32 stack frames.  On 64-bit CPUs,		\
+	 * though, the address space is limited to 48 bits.		\
+	 * The usage is further reduced if Q_N has a 32-bit type. */	\
+	q_st[sizeof(Q_UINT) > 4 && sizeof(Q_N) > 4 ? 48 : 32];		\
+    while (1) {								\
+	if (q_r - q_l + 1 >= Q_THRESH) {				\
+	    Q_UINT q_i, q_j;						\
+	    Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP);	\
+	    /* Now have two subfiles: [q_l,q_j] and [q_i,q_r].		\
+	     * Dealing with them depends on which one is bigger. */	\
+	    if (q_j - q_l >= q_r - q_i)					\
+		Q_SUBFILES(q_l, q_j, q_i, q_r);				\
+	    else							\
+		Q_SUBFILES(q_i, q_r, q_l, q_j);				\
+	}								\
+	else {								\
+	    Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP);		\
+	    /* Pop subfiles from the stack, until it gets empty. */	\
+	    if (q_sp == 0) break;					\
+	    q_sp--;							\
+	    q_l = q_st[q_sp].q_l;					\
+	    q_r = q_st[q_sp].q_r;					\
+	}								\
+    }									\
+} while (0)
+
+/* The missing part: dealing with subfiles.
+ * Assumes that the first subfile is not smaller than the second. */
+#define Q_SUBFILES(q_l1, q_r1, q_l2, q_r2)				\
+do {									\
+    /* If the second subfile is only a single element, it needs		\
+     * no further processing.  The first subfile will be processed	\
+     * on the next iteration (both subfiles cannot be only a single	\
+     * element, due to Q_THRESH). */					\
+    if (q_l2 == q_r2) {							\
+	q_l = q_l1;							\
+	q_r = q_r1;							\
+    }									\
+    else {								\
+	/* Otherwise, both subfiles need processing.			\
+	 * Push the larger subfile onto the stack. */			\
+	q_st[q_sp].q_l = q_l1;						\
+	q_st[q_sp].q_r = q_r1;						\
+	q_sp++;								\
+	/* Process the smaller subfile on the next iteration. */	\
+	q_l = q_l2;							\
+	q_r = q_r2;							\
+    }									\
+} while (0)
+
+/* And now, ladies and gentlemen, may I proudly present to you... */
+#define QSORT(Q_N, Q_LESS, Q_SWAP)					\
+do {									\
+    if ((Q_N) > 1)							\
+	/* We could check sizeof(Q_N) and use "unsigned", but at least	\
+	 * on x86_64, this has the performance penalty of up to 5%. */	\
+	Q_LOOP(unsigned long, Q_N, Q_LESS, Q_SWAP);			\
+} while (0)
+
+#endif
+
+/* ex:set ts=8 sts=4 sw=4 noet: */
\ No newline at end of file
diff --git a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
index 782338fc2be41d56a4d6fcd4f4920f6d920e663f..de0ae29326cb8bd1d0e738e049abf40389b52040 100644
--- a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
+++ b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
@@ -131,11 +131,7 @@ void FixstarsSGM::compute(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 	dispt_scaled.convertTo(disp, CV_32F, 1.0f / 16.0f, stream);
 
 #ifdef HAVE_OPTFLOW
-	if (use_off_)
-	{
-		frame.getChannel<Mat>(ftl::rgbd::kChanDisparity);
-		off_.filter(frame.setChannel<Mat>(ftl::rgbd::kChanDisparity), Mat(lbw_));
-	}
+	if (use_off_) { off_.filter(frame, stream); }
 #endif
 }
 
diff --git a/components/rgbd-sources/src/algorithms/offilter.cu b/components/rgbd-sources/src/algorithms/offilter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6feee5ae1daf9fc8b4b165370dbb8646b1d7b8a1
--- /dev/null
+++ b/components/rgbd-sources/src/algorithms/offilter.cu
@@ -0,0 +1,91 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/rgbd/camera.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#include <qsort.h>
+
+__device__ void quicksort(float A[], size_t n)
+{
+	float tmp;
+	#define LESS(i, j) A[i] < A[j]
+	#define SWAP(i, j) tmp = A[i], A[i] = A[j], A[j] = tmp
+	QSORT(n, LESS, SWAP);
+}
+
+template<typename T>
+__device__  static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
+
+static const int max_history = 32; // TODO dynamic shared memory
+
+__global__ void temporal_median_filter_kernel(
+	cv::cuda::PtrStepSz<float> disp,
+	cv::cuda::PtrStepSz<int16_t> optflow,
+	cv::cuda::PtrStepSz<float> history,
+	int n_max,
+	int16_t threshold, // fixed point 10.5
+	float granularity  // 4 for Turing
+)
+{
+	float sorted[max_history]; // TODO: dynamic shared memory
+	for (STRIDE_Y(y, disp.rows)) {
+	for (STRIDE_X(x, disp.cols)) {
+
+		int flowx = optflow(round(y / granularity), 2 * round(x / granularity));
+		int flowy = optflow(round(y / granularity), 2 * round(x / granularity) + 1);
+
+		if ((abs(flowx) + abs(flowy)) > threshold)
+		{
+			// last element in history[x][y][t]
+			history(y, (x + 1) * n_max - 1) = 0.0;
+			return;
+		}
+
+		int count = history(y, (x + 1) * n_max - 1);
+		int n = count % (n_max - 1);
+
+		if (isValidDisparity(disp(y, x)))
+		{
+			history(y, (x + 1) * n_max - 1) += 1.0;
+			count++;
+			history(y, x * n_max + n) = disp(y, x);
+		}
+
+		int n_end = count;
+		if (n_end >= n_max)	{ n_end = n_max - 1; }
+
+		if (n_end != 0)
+		{
+			for (size_t i = 0; i < n_end; i++)
+			{
+				sorted[i] = history(y, x * n_max + i);
+			}
+
+			quicksort(sorted, n_end);
+			disp(y, x) = sorted[n_end / 2];
+		}
+	}}
+}
+
+namespace ftl {
+namespace cuda {
+	
+void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
+					cv::cuda::GpuMat &history, int n, float threshold,
+					cv::cuda::Stream &stream)
+{
+	dim3 grid(1, 1, 1);
+	dim3 threads(128, 1, 1);
+	grid.x = cv::cuda::device::divUp(disp.cols, 128);
+	grid.y = cv::cuda::device::divUp(disp.rows, 1);
+
+	// TODO: dynamic shared memory
+	temporal_median_filter_kernel<<<grid, threads, 0, cv::cuda::StreamAccessor::getStream(stream)>>>
+		(	disp, optflow, history, n,
+			round(threshold * (1 << 5)),	// TODO: documentation; 10.5 format
+			4								// TODO: (4 pixels granularity for Turing)
+		);
+
+	cudaSafeCall(cudaGetLastError());
+}
+
+}
+}
\ No newline at end of file
diff --git a/components/rgbd-sources/src/cuda_algorithms.hpp b/components/rgbd-sources/src/cuda_algorithms.hpp
index 0aa7399c0a24035bdbbb0442b3c429ddd03d145c..439c16cfc21fef08086bb69b7e84b1c8b49fec74 100644
--- a/components/rgbd-sources/src/cuda_algorithms.hpp
+++ b/components/rgbd-sources/src/cuda_algorithms.hpp
@@ -41,6 +41,9 @@ namespace cuda {
 	void disparity_to_depth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat &depth,
 				const ftl::rgbd::Camera &c, cv::cuda::Stream &stream);
 
+	void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
+						cv::cuda::GpuMat &history, int n_max, float threshold,
+						cv::cuda::Stream &stream);
 
 }
 }
diff --git a/components/rgbd-sources/src/offilter.cpp b/components/rgbd-sources/src/offilter.cpp
index 03db4807a7f9fa045708d3cf17bf32556b7bf5b5..8344540b9f0dc1cea1ff62e694d7ed774201a5f5 100644
--- a/components/rgbd-sources/src/offilter.cpp
+++ b/components/rgbd-sources/src/offilter.cpp
@@ -1,4 +1,5 @@
 #include "ftl/offilter.hpp"
+#include "cuda_algorithms.hpp"
 
 #ifdef HAVE_OPTFLOW
 
@@ -14,101 +15,26 @@ using std::vector;
 template<typename T> static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
 
 OFDisparityFilter::OFDisparityFilter(Size size, int n_frames, float threshold) :
-	n_(0), n_max_(n_frames), threshold_(threshold), size_(size)
+	n_max_(n_frames + 1), threshold_(threshold)
 {
+	CHECK((n_max_ > 1) && (n_max_ <= 32)) << "History length must be between 0 and 31!";
+	disp_old_ = cv::cuda::GpuMat(cv::Size(size.width * n_max_, size.height), CV_32FC1);
 	
-	disp_ = Mat::zeros(cv::Size(size.width * n_frames, size.height), CV_64FC1);
-	gray_ = Mat::zeros(size, CV_8UC1);
-
-	nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
+	/*nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
 													cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
-													true, false, false, 0);
+													true, false, false, 0);*/
 	
 }
 
-void OFDisparityFilter::filter(Mat &disp, const Mat &gray)
+void OFDisparityFilter::filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 {
+	const cv::cuda::GpuMat &optflow = frame.getChannel<cv::cuda::GpuMat>(kChanFlow, stream);
+	frame.getChannel<cv::cuda::GpuMat>(kChanDisparity, stream);
+	stream.waitForCompletion();
+	if (optflow.empty()) { return; }
 
-	const int n = n_;
-	n_ = (n_ + 1) % n_max_;
-	
-	nvof_->calc(gray, gray_, flowxy_);
-	nvof_->upSampler(	flowxy_, size_.width, size_.height,
-						nvof_->getGridSize(), flowxy_up_);
-
-	CHECK(disp.type() == CV_32FC1);
-	CHECK(gray.type() == CV_8UC1);
-	CHECK(flowxy_up_.type() == CV_32FC2);
-
-	gray.copyTo(gray_);
-
-	vector<float> values(n_max_);
-
-	for (int y = 0; y < size_.height; y++)
-	{
-		float *D = disp_.ptr<float>(y);
-		float *d = disp.ptr<float>(y);
-		float *flow = flowxy_up_.ptr<float>(y);
-
-		for (int x = 0; x < size_.width; x++)
-		{
-			const float flow_l1 = abs(flow[2*x]) + abs(flow[2*x + 1]);
-
-			if (flow_l1 < threshold_)
-			{
-				values.clear();
-
-				if (isValidDisparity(d[x]))
-				{
-					bool updated = false;
-					for (int i = 0; i < n_max_; i++)
-					{
-						float &val = D[n_max_ * x + (n_max_ - i + n) % n_max_];
-						if (!isValidDisparity(val))
-						{
-							val = d[x];
-							updated = true;
-						}
-					}
-					if (!updated) { D[n_max_ * x + n] = d[x]; }
-				}
-
-				for (int i = 0; i < n_max_; i++)
-				{
-					float &val = D[n_max_ * x + i];
-					if (isValidDisparity(val)) { values.push_back(val); }
-				}
-
-				if (values.size() > 0) {
-					const auto median_it = values.begin() + values.size() / 2;
-					std::nth_element(values.begin(), median_it , values.end());
-					d[x] = *median_it;
-				}
-
-				/*
-				if (isValidDepth(d[x]) && isValidDepth(D[x]))
-				{
-					D[x] = D[x] * 0.66 + d[x] * (1.0 - 0.66);
-				}
-				if (isValidDepth(D[x]))
-				{
-					d[x] = D[x];
-				}
-				else
-				{
-					D[x] = d[x];
-				}
-				*/
-			}
-			else
-			{
-				for (int i = 0; i < n_max_; i++)
-				{
-					D[n_max_ * x + i] = 0.0;
-				}
-			}
-		}
-	}
+	cv::cuda::GpuMat &disp = frame.setChannel<cv::cuda::GpuMat>(kChanDisparity);
+	ftl::cuda::optflow_filter(disp, optflow, disp_old_, n_max_, threshold_, stream);
 }
 
 #endif  // HAVE_OPTFLOW
diff --git a/components/rgbd-sources/src/stereovideo.cpp b/components/rgbd-sources/src/stereovideo.cpp
index ebc72d854e8d35a480f3d4ef5873e8b58d5bb086..d718d1f7ed1f29bac101f639317671acf420a882 100644
--- a/components/rgbd-sources/src/stereovideo.cpp
+++ b/components/rgbd-sources/src/stereovideo.cpp
@@ -192,9 +192,9 @@ bool StereoVideoSource::retrieve() {
 
 		if (frames_[1].hasChannel(kChanLeftGray))
 		{
-			auto &left_gray_prev = frame.getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_);
+			auto &left_gray_prev = frames_[1].getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_);
 			auto &optflow = frame.setChannel<cv::cuda::GpuMat>(kChanFlow);
-			nvof_->calc(left_gray, left_gray_prev, optflow_, stream2_);
+			nvof_->calc(left_gray, left_gray_prev, optflow, stream2_);
 			// nvof_->upSampler() isn't implemented with CUDA
 			// cv::cuda::resize() does not work wiht 2-channel input
 			// cv::cuda::resize(optflow_, optflow, left.size(), 0.0, 0.0, cv::INTER_NEAREST, stream2_);