diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp
index c981c662c8ef2dae9729f239716a20e9f446f459..a2b9ef5b8c68ccc98e2a5ef29a19e8153180650b 100644
--- a/applications/gui/src/camera.cpp
+++ b/applications/gui/src/camera.cpp
@@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) {
 
 void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) {
 	if (!isVirtual() && ((1 << fsid) & fsmask_)) {
-		channels_ = c;
-		if (c.has(Channel::Depth)) {
+		channels_ += c;
+		//if (c.has(Channel::Depth)) {
 			//channels_ += Channel::ColourNormals;
-		}
+		//}
 	}
 }
 
@@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) {
 
 			if ((size_t)fid_ >= fs->frames.size()) return;
 			frame = &fs->frames[fid_];
+			channels_ = frame->getChannels();
 
 			if (frame->hasChannel(Channel::Messages)) {
 				msgs_.clear();
diff --git a/applications/gui/src/src_window.cpp b/applications/gui/src/src_window.cpp
index fe3763252538ec1a2ac2597afdb97035c90c8eca..b3b71fabfd77187444df545c8fe6d389e0c954ad 100644
--- a/applications/gui/src/src_window.cpp
+++ b/applications/gui/src/src_window.cpp
@@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) {
 
 		ftl::codecs::Channels<0> channels;
 		if (fromstream) channels = cstream->available(fs.id);
-		if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels();
+		//if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels();
 		cam.second.camera->update(fs.id, channels);
 	}
 	++cycle_;
diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp
index ef11ba02bed6c21f915f6e7706e68952bb7335a5..59b50a1fef9b525089b2a7d8f4504fdc9e24cbcf 100644
--- a/components/operators/src/depth.cpp
+++ b/components/operators/src/depth.cpp
@@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out,
 	const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
 	GpuMat &depth = in.get<GpuMat>(channel_);
 
-	ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_,
-			table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float),
-			radius_, edge_disc_, max_disc_, stream);
+	UNUSED(rgb);
+	UNUSED(depth);
+
+	// FIXME: Not working right now
+	//ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_,
+	//		table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float),
+	//		radius_, edge_disc_, max_disc_, stream);
 
 	//disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
 	//filter_->apply(disp_in, rgb, disp_out, cvstream);
@@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 	rbuf_.resize(in.frames.size());
 
 	for (size_t i=0; i<in.frames.size(); ++i) {
+		if (!in.hasFrame(i)) continue;
 		auto &f = in.frames[i];
 		if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) {
 			_createPipeline();
diff --git a/components/operators/src/disparity/bilateral_filter.cpp b/components/operators/src/disparity/bilateral_filter.cpp
index cc0285ecef06d3ea534aba5e6990c191cb04286d..694a23a9a0e9bab1b144ed9b279f4952dce022ff 100644
--- a/components/operators/src/disparity/bilateral_filter.cpp
+++ b/components/operators/src/disparity/bilateral_filter.cpp
@@ -3,6 +3,8 @@
 #include "opencv/joint_bilateral.hpp"
 #include "cuda.hpp"
 
+#include <opencv2/cudaimgproc.hpp>
+
 using cv::cuda::GpuMat;
 using cv::Size;
 
@@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) :
 	
 	scale_ = 16.0;
 	n_disp_ = cfg->value("n_disp", 256);
-	radius_ = cfg->value("radius", 7);
+	radius_ = cfg->value("radius", 4);
 	iter_ = cfg->value("iter", 13);
 	filter_ = nullptr;
 }
@@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out
 
 	if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_);
 
+	filter_->setNumIters(config()->value("iter", 13));
+
 	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
 	const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
 	GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity);
 	GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity);
-	disp_out.create(disp_in.size(), disp_in.type());
+	disp_int_.create(disp_in.size(), disp_in.type());
 
-	disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
-	filter_->apply(disp_int_, rgb, disp_int_result_, cvstream);
-	disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream);
+	//disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
+	//cv::cuda::cvtColor(rgb, bw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
+	filter_->apply(disp_in, rgb, disp_int_, cvstream);
+	cv::cuda::swap(disp_out, disp_int_);
+	//disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream);
 	return true;
 }
\ No newline at end of file
diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu
index 1e655e2d6429a3c11279dfd4423c0bfdbdff92e1..e39afe6f60e236530985874dba9c0f598fe6a5e3 100644
--- a/components/operators/src/disparity/disp2depth.cu
+++ b/components/operators/src/disparity/disp2depth.cu
@@ -6,13 +6,13 @@
 #define PINF __int_as_float(0x7f800000)
 #endif
 
-__global__ void d2d_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth,
+__global__ void d2d_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
 		ftl::rgbd::Camera cam) {
 
 	for (STRIDE_Y(v,disp.rows)) {
 	for (STRIDE_X(u,disp.cols)) {
-		float d = disp(v,u);
-		depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / (d - cam.doffs));
+		short d = disp(v,u);
+		depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / ((float(d)/16.0f) - cam.doffs));
 	}
 	}
 }
@@ -34,14 +34,14 @@ namespace cuda {
 
 //==============================================================================
 
-__global__ void d2drev_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth,
+__global__ void d2drev_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
 	ftl::rgbd::Camera cam) {
 
 for (STRIDE_Y(v,disp.rows)) {
 for (STRIDE_X(u,disp.cols)) {
 	float d = depth(v,u);
 	float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs;
-	disp(v,u) = disparity;
+	disp(v,u) = short(disparity*16.0f);
 }
 }
 }
diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index ac81a56ce8501df6c856483e0ca3ebea87b68a83..d5bfe2d6af94234466c0a0a8f00e6b1667afd990 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		if (!init()) { return false; }
 	}
 
-	auto &disp = out.create<GpuMat>(Channel::Disparity, Format<float>(l.size()));
+	auto &disp = out.create<GpuMat>(Channel::Disparity, Format<short>(l.size()));
 
 	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
 	cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
@@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
 	// left_pixels.setTo(0);
 
-	cv::cuda::threshold(disp_int_, disp_int_, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
+	cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
 	
-	disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream);
+	//disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream);
 	return true;
 }
diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp
index 05b928e9c8c3e5fb5e544517bb07bbda3796a8d1..c93d81331c80f0b451330cda3a4f558cafc589aa 100644
--- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp
+++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp
@@ -167,7 +167,11 @@ namespace
         if (dst.data != disp.data)
             disp.copyTo(dst, stream);
 
-        disp_bilateral_filter<T>(dst, img, img.channels(), iters, table_color.ptr<float>(), (float *)table_space.data, table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream));
+        if (img.channels() == 4) {
+            disp_bilateral_filter<T,uchar4>(disp, dst, img, iters, table_color.ptr<float>(), table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream));
+        } else {
+            // TODO: If we need other channels...
+        }
     }
 
     void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
@@ -184,7 +188,8 @@ namespace
         GpuMat img = _image.getGpuMat();
 
         CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
-        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 );
+        //CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 );
+        CV_Assert( img.type() == CV_8UC4 );  // Nick: We only need/allow 4 channel
         CV_Assert( disp.size() == img.size() );
 
         operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
index 7d2a90bacf865ef4d2010a8fdb6c97fa3f301c53..c1dd611c00e6830232a01d6f1eb86b643cd477cb 100644
--- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
+++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
@@ -47,15 +47,55 @@
 
 #include "disparity_bilateral_filter.hpp"
 
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda/weighting.hpp>
+
 using namespace cv::cuda::device;
 using namespace cv::cuda;
 using namespace cv;
 
+#define WARP_SIZE 32
+#define FULL_MASK 0xFFFFFFFFu
+
+#define PIXELS_PER_LOOP 16
+
 namespace ftl { namespace cuda { namespace device
 {
     namespace disp_bilateral_filter
     {
-        template <int channels>
+
+		template <typename C>
+		__device__ inline uchar distance(C a, C b);
+
+		template <>
+		__device__ inline uchar distance(uchar4 a, uchar4 b) {
+			uchar x = ::abs(a.x - b.x);
+            uchar y = ::abs(a.y - b.y);
+			uchar z = ::abs(a.z - b.z);
+			return (::max(::max(x, y), z));
+			/*union {
+				unsigned int v;
+				uchar d[4];
+			};
+			v = __vabsdiffs4(*(unsigned int*)&a, *(unsigned int*)&b);
+            return (::max(::max(d[0], d[1]), d[2]));*/
+		}
+
+		template <>
+		__device__ inline uchar distance(uchar3 a, uchar3 b) {
+			uchar x = ::abs(a.x - b.x);
+            uchar y = ::abs(a.y - b.y);
+            uchar z = ::abs(a.z - b.z);
+            return (::max(::max(x, y), z));
+		}
+
+		template <>
+		__device__ inline uchar distance(uchar a, uchar b) {
+			return abs(int(a)-int(b));
+		}
+
+
+        /*template <int channels>
         struct DistRgbMax
         {
             static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
@@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device
                 uchar z = ::abs(a[2] - b[2]);
                 return (::max(::max(x, y), z));
             }
+		};
+		
+		template <>
+        struct DistRgbMax<4>
+        {
+            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
+            {
+				const uchar4 aa = *(uchar4*)a;
+				const uchar4 bb = *(uchar4*)b;
+                uchar x = ::abs(aa.x - bb.x);
+                uchar y = ::abs(aa.y - bb.y);
+                uchar z = ::abs(aa.z - bb.z);
+                return (::max(::max(x, y), z));
+            }
         };
 
         template <>
@@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device
             {
                 return ::abs(a[0] - b[0]);
             }
-		};
+		};*/
+
+		__device__ inline float calc_colour_weight(int d) {
+			return exp(-float(d * d) / (2.0f * 10.0f * 10.0f));
+		}
 
 		template <typename T>
 		__device__ inline T Abs(T v) { return ::abs(v); }
@@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device
 		template <>
 		__device__ inline float Abs<float>(float v) { return fabsf(v); }
 
-        template <int channels, typename T>
-        __global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step,
-            const uchar* img, size_t img_step, int h, int w,
-            const float* ctable_color, const float * ctable_space, size_t ctable_space_step,
-            int cradius,
+        template <typename C, int CRADIUS, typename T>
+        __global__ void disp_bilateral_filter(int t, const T* __restrict__ disp, T* __restrict__ dispout, size_t disp_step,
+            const C* __restrict__ img, size_t img_step, int h, int w,
+            const float* __restrict__ ctable_color,
             T cedge_disc, T cmax_disc)
         {
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
+            __shared__ float s_space[(CRADIUS+1)*(CRADIUS+1)];
+            __shared__ short2 s_queue[4096];  // Depends on pixels per block
+            __shared__ int s_counter;
+
+			// Create gaussian lookup for spatial weighting
+			for (int i=threadIdx.x+threadIdx.y*blockDim.x; i<(CRADIUS+1)*(CRADIUS+1); ++i) {
+				const int y = i / (CRADIUS+1);
+				const int x = i % (CRADIUS+1);
+				s_space[i] = exp(-sqrt(float(y * y) + float(x * x)) / float(CRADIUS+1));
+            }
+            if (threadIdx.x == 0 && threadIdx.y == 0) s_counter = 0;
+			__syncthreads();
 
-            T dp[5];
+            // Check all pixels to see if they need processing
+            for (STRIDE_Y(y, h)) {
+            for (STRIDE_X(x, w)) {
+                bool todo_pixel = false;
+                if (y >= CRADIUS && y < h - CRADIUS && x >= CRADIUS && x < w - CRADIUS) {
+                    T dp[5];
+                    dp[0] = *(disp + (y  ) * disp_step + x + 0);
+                    dp[1] = *(disp + (y-1) * disp_step + x + 0);
+                    dp[2] = *(disp + (y  ) * disp_step + x - 1);
+                    dp[3] = *(disp + (y+1) * disp_step + x + 0);
+					dp[4] = *(disp + (y  ) * disp_step + x + 1);
+					
+					*(dispout + y * disp_step + x) = dp[0];
 
-            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-            {
-                dp[0] = *(disp + (y  ) * disp_step + x + 0);
-                dp[1] = *(disp + (y-1) * disp_step + x + 0);
-                dp[2] = *(disp + (y  ) * disp_step + x - 1);
-                dp[3] = *(disp + (y+1) * disp_step + x + 0);
-                dp[4] = *(disp + (y  ) * disp_step + x + 1);
-
-                if(Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc)
-                {
-                    const int ymin = ::max(0, y - cradius);
-                    const int xmin = ::max(0, x - cradius);
-                    const int ymax = ::min(h - 1, y + cradius);
-                    const int xmax = ::min(w - 1, x + cradius);
-
-                    float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-                    const uchar* ic = img + y * img_step + channels * x;
-
-                    for(int yi = ymin; yi <= ymax; yi++)
-                    {
-                        const T* disp_y = disp + yi * disp_step;
-
-                        for(int xi = xmin; xi <= xmax; xi++)
-                        {
-                            const uchar* in = img + yi * img_step + channels * xi;
-
-                            uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
-
-                            const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
-
-                            const T disp_reg = disp_y[xi];
-
-                            cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
-                            cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
-                            cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
-                            cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
-                            cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
-                        }
-                    }
-
-                    float minimum = numeric_limits<float>::max();
-                    int id = 0;
-
-                    if (cost[0] < minimum)
-                    {
-                        minimum = cost[0];
-                        id = 0;
-                    }
-                    if (cost[1] < minimum)
-                    {
-                        minimum = cost[1];
-                        id = 1;
-                    }
-                    if (cost[2] < minimum)
-                    {
-                        minimum = cost[2];
-                        id = 2;
-                    }
-                    if (cost[3] < minimum)
-                    {
-                        minimum = cost[3];
-                        id = 3;
-                    }
-                    if (cost[4] < minimum)
-                    {
-                        minimum = cost[4];
-                        id = 4;
-                    }
-
-                    *(disp + y * disp_step + x) = dp[id];
+                    todo_pixel = (Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc);
+                }
+
+                // Count valid pixels and warp and allocate space for them
+                const uint bal = __ballot_sync(0xFFFFFFFF, todo_pixel);
+                int index = 0;
+                if (threadIdx.x%32 == 0) {
+                    index = atomicAdd(&s_counter, __popc(bal));
                 }
+                index = __shfl_sync(0xFFFFFFFF, index, 0, 32);
+				index += __popc(bal >> (threadIdx.x%32)) - 1;
+				if (todo_pixel) s_queue[index] = make_short2(x,y);
+            }
             }
+
+            // Switch to processing mode
+            __syncthreads();
+
+			const int counter = s_counter;
+
+			// Stride the queue to reduce bank conflicts
+			// Each thread takes a pixel that needs processing
+			for (int ix=(threadIdx.x + threadIdx.y*blockDim.x); ix<counter; ix+=(blockDim.x*blockDim.y)) {
+				const short2 pt = s_queue[ix];
+				const int x = pt.x;
+				const int y = pt.y;
+
+				T dp[5];
+				dp[0] = *(disp + (y  ) * disp_step + x + 0);
+				dp[1] = *(disp + (y-1) * disp_step + x + 0);
+				dp[2] = *(disp + (y  ) * disp_step + x - 1);
+				dp[3] = *(disp + (y+1) * disp_step + x + 0);
+				dp[4] = *(disp + (y  ) * disp_step + x + 1);
+
+				float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+
+				const C ic = *(img + y * img_step + x);
+
+				//#pragma unroll
+				// Note: Don't unroll this one!
+				for(int yi = -CRADIUS; yi <= CRADIUS; ++yi)
+				{
+					const T* disp_y = disp + (y + yi) * disp_step;
+
+					#pragma unroll
+					for(int xi = -CRADIUS; xi <= CRADIUS; ++xi) {
+						const C in = *(img + (y+yi) * img_step + (xi+x));
+
+						uchar dist_rgb = distance(ic,in);
+
+						// The bilateral part of the filter
+						const float weight = ctable_color[dist_rgb] * s_space[::abs(yi)*(CRADIUS+1) + ::abs(xi)];
+
+						const T disp_reg = disp_y[x+xi];
+
+						// The "joint" part checking for depth similarity
+						cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
+						cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
+						cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
+						cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
+						cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
+					}
+				}
+
+				float minimum = cost[0];
+				int id = 0;
+
+				if (cost[1] < minimum)
+				{
+					minimum = cost[1];
+					id = 1;
+				}
+				if (cost[2] < minimum)
+				{
+					minimum = cost[2];
+					id = 2;
+				}
+				if (cost[3] < minimum)
+				{
+					minimum = cost[3];
+					id = 3;
+				}
+				if (cost[4] < minimum)
+				{
+					minimum = cost[4];
+					id = 4;
+				}
+
+				*(dispout + y * disp_step + x) = dp[id];
+			}
         }
 
-        template <typename T>
-        void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float* table_space, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream)
+        template <typename T, typename C>
+        void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *table_color, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream)
         {
             dim3 threads(32, 8, 1);
             dim3 grid(1, 1, 1);
-            grid.x = divUp(disp.cols, threads.x << 1);
-            grid.y = divUp(disp.rows, threads.y);
+			grid.x = (disp.cols + 64 - 1) / 64;  // 64*64 = 4096, max pixels in block
+			grid.y = (disp.rows + 64 - 1) / 64;
 
-            switch (channels)
-            {
-            case 1:
-                for (int i = 0; i < iters; ++i)
-                {
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-                break;
-            case 3:
-                for (int i = 0; i < iters; ++i)
-                {
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-				break;
-			case 4:  // Nick: Support 4 channel
-                for (int i = 0; i < iters; ++i)
-                {
-                    disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
-                    cudaSafeCall( cudaGetLastError() );
+            T *in_ptr = disp.data;
+            T *out_ptr = dispout.data;
+
+            // Iters must be odd.
+            if (iters & 0x1 == 0) iters += 1;
+
+            switch (radius) {
+                case 1  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,1><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 2  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,2><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 3  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,3><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 4  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,4><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 5  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,5><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 6  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,6><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                case 7  :
+                    for (int i = 0; i < iters; ++i) {
+                        disp_bilateral_filter<C,7><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
+                        cudaSafeCall( cudaGetLastError() );
+                        std::swap(in_ptr, out_ptr);
+                    } break;
+                default:
+                    CV_Error(cv::Error::BadTileSize, "Unsupported kernel radius");
                 }
-                break;
-            default:
-                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
+                
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
-        template void disp_bilateral_filter<uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
-		template void disp_bilateral_filter<short>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream);
-		template void disp_bilateral_filter<float>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, float, float, cudaStream_t stream);
+        // These are commented out since we don't use them and it slows compile
+        //template void disp_bilateral_filter<uchar,uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
+		//template void disp_bilateral_filter<short,uchar>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
+        //template void disp_bilateral_filter<float,uchar>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
+
+        //template void disp_bilateral_filter<uchar,uchar3>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
+		//template void disp_bilateral_filter<short,uchar3>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
+        //template void disp_bilateral_filter<float,uchar3>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
+
+        template void disp_bilateral_filter<uchar,uchar4>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
+		template void disp_bilateral_filter<short,uchar4>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
+        template void disp_bilateral_filter<float,uchar4>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
+        
     } // namespace bilateral_filter
 }}} // namespace ftl { namespace cuda { namespace cudev
 
diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp b/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp
index b6c0d79fa3342dda8a6674d1290e07b9737c055a..80d9799ad2c51977efa2fddfcb8eba6b7c97aac4 100644
--- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp
+++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp
@@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device
 {
     namespace disp_bilateral_filter
     {
-        template<typename T>
-        void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream);
+        template<typename T, typename C>
+        void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream);
     }
 }}}