diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp index c981c662c8ef2dae9729f239716a20e9f446f459..a2b9ef5b8c68ccc98e2a5ef29a19e8153180650b 100644 --- a/applications/gui/src/camera.cpp +++ b/applications/gui/src/camera.cpp @@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) { void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) { if (!isVirtual() && ((1 << fsid) & fsmask_)) { - channels_ = c; - if (c.has(Channel::Depth)) { + channels_ += c; + //if (c.has(Channel::Depth)) { //channels_ += Channel::ColourNormals; - } + //} } } @@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) { if ((size_t)fid_ >= fs->frames.size()) return; frame = &fs->frames[fid_]; + channels_ = frame->getChannels(); if (frame->hasChannel(Channel::Messages)) { msgs_.clear(); diff --git a/applications/gui/src/src_window.cpp b/applications/gui/src/src_window.cpp index fe3763252538ec1a2ac2597afdb97035c90c8eca..b3b71fabfd77187444df545c8fe6d389e0c954ad 100644 --- a/applications/gui/src/src_window.cpp +++ b/applications/gui/src/src_window.cpp @@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) { ftl::codecs::Channels<0> channels; if (fromstream) channels = cstream->available(fs.id); - if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels(); + //if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels(); cam.second.camera->update(fs.id, channels); } ++cycle_; diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp index ef11ba02bed6c21f915f6e7706e68952bb7335a5..59b50a1fef9b525089b2a7d8f4504fdc9e24cbcf 100644 --- a/components/operators/src/depth.cpp +++ b/components/operators/src/depth.cpp @@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); GpuMat &depth = in.get<GpuMat>(channel_); - ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_, - table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float), - radius_, edge_disc_, max_disc_, stream); + UNUSED(rgb); + UNUSED(depth); + + // FIXME: Not working right now + //ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_, + // table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float), + // radius_, edge_disc_, max_disc_, stream); //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); //filter_->apply(disp_in, rgb, disp_out, cvstream); @@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda rbuf_.resize(in.frames.size()); for (size_t i=0; i<in.frames.size(); ++i) { + if (!in.hasFrame(i)) continue; auto &f = in.frames[i]; if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) { _createPipeline(); diff --git a/components/operators/src/disparity/bilateral_filter.cpp b/components/operators/src/disparity/bilateral_filter.cpp index cc0285ecef06d3ea534aba5e6990c191cb04286d..694a23a9a0e9bab1b144ed9b279f4952dce022ff 100644 --- a/components/operators/src/disparity/bilateral_filter.cpp +++ b/components/operators/src/disparity/bilateral_filter.cpp @@ -3,6 +3,8 @@ #include "opencv/joint_bilateral.hpp" #include "cuda.hpp" +#include <opencv2/cudaimgproc.hpp> + using cv::cuda::GpuMat; using cv::Size; @@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) : scale_ = 16.0; n_disp_ = cfg->value("n_disp", 256); - radius_ = cfg->value("radius", 7); + radius_ = cfg->value("radius", 4); iter_ = cfg->value("iter", 13); filter_ = nullptr; } @@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_); + filter_->setNumIters(config()->value("iter", 13)); + auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity); GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity); - disp_out.create(disp_in.size(), disp_in.type()); + disp_int_.create(disp_in.size(), disp_in.type()); - disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); - filter_->apply(disp_int_, rgb, disp_int_result_, cvstream); - disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream); + //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); + //cv::cuda::cvtColor(rgb, bw_, cv::COLOR_BGRA2GRAY, 0, cvstream); + filter_->apply(disp_in, rgb, disp_int_, cvstream); + cv::cuda::swap(disp_out, disp_int_); + //disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream); return true; } \ No newline at end of file diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu index 1e655e2d6429a3c11279dfd4423c0bfdbdff92e1..e39afe6f60e236530985874dba9c0f598fe6a5e3 100644 --- a/components/operators/src/disparity/disp2depth.cu +++ b/components/operators/src/disparity/disp2depth.cu @@ -6,13 +6,13 @@ #define PINF __int_as_float(0x7f800000) #endif -__global__ void d2d_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, +__global__ void d2d_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth, ftl::rgbd::Camera cam) { for (STRIDE_Y(v,disp.rows)) { for (STRIDE_X(u,disp.cols)) { - float d = disp(v,u); - depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / (d - cam.doffs)); + short d = disp(v,u); + depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / ((float(d)/16.0f) - cam.doffs)); } } } @@ -34,14 +34,14 @@ namespace cuda { //============================================================================== -__global__ void d2drev_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, +__global__ void d2drev_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth, ftl::rgbd::Camera cam) { for (STRIDE_Y(v,disp.rows)) { for (STRIDE_X(u,disp.cols)) { float d = depth(v,u); float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs; - disp(v,u) = disparity; + disp(v,u) = short(disparity*16.0f); } } } diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp index ac81a56ce8501df6c856483e0ca3ebea87b68a83..d5bfe2d6af94234466c0a0a8f00e6b1667afd990 100644 --- a/components/operators/src/disparity/fixstars_sgm.cpp +++ b/components/operators/src/disparity/fixstars_sgm.cpp @@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { if (!init()) { return false; } } - auto &disp = out.create<GpuMat>(Channel::Disparity, Format<float>(l.size())); + auto &disp = out.create<GpuMat>(Channel::Disparity, Format<short>(l.size())); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream); @@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { // GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows)); // left_pixels.setTo(0); - cv::cuda::threshold(disp_int_, disp_int_, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream); + cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream); - disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream); + //disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream); return true; } diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp index 05b928e9c8c3e5fb5e544517bb07bbda3796a8d1..c93d81331c80f0b451330cda3a4f558cafc589aa 100644 --- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp +++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cpp @@ -167,7 +167,11 @@ namespace if (dst.data != disp.data) disp.copyTo(dst, stream); - disp_bilateral_filter<T>(dst, img, img.channels(), iters, table_color.ptr<float>(), (float *)table_space.data, table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream)); + if (img.channels() == 4) { + disp_bilateral_filter<T,uchar4>(disp, dst, img, iters, table_color.ptr<float>(), table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream)); + } else { + // TODO: If we need other channels... + } } void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream) @@ -184,7 +188,8 @@ namespace GpuMat img = _image.getGpuMat(); CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S ); - CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 ); + //CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 ); + CV_Assert( img.type() == CV_8UC4 ); // Nick: We only need/allow 4 channel CV_Assert( disp.size() == img.size() ); operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_, diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu index 7d2a90bacf865ef4d2010a8fdb6c97fa3f301c53..c1dd611c00e6830232a01d6f1eb86b643cd477cb 100644 --- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu +++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu @@ -47,15 +47,55 @@ #include "disparity_bilateral_filter.hpp" +#include <ftl/cuda_common.hpp> +#include <ftl/cuda/weighting.hpp> + using namespace cv::cuda::device; using namespace cv::cuda; using namespace cv; +#define WARP_SIZE 32 +#define FULL_MASK 0xFFFFFFFFu + +#define PIXELS_PER_LOOP 16 + namespace ftl { namespace cuda { namespace device { namespace disp_bilateral_filter { - template <int channels> + + template <typename C> + __device__ inline uchar distance(C a, C b); + + template <> + __device__ inline uchar distance(uchar4 a, uchar4 b) { + uchar x = ::abs(a.x - b.x); + uchar y = ::abs(a.y - b.y); + uchar z = ::abs(a.z - b.z); + return (::max(::max(x, y), z)); + /*union { + unsigned int v; + uchar d[4]; + }; + v = __vabsdiffs4(*(unsigned int*)&a, *(unsigned int*)&b); + return (::max(::max(d[0], d[1]), d[2]));*/ + } + + template <> + __device__ inline uchar distance(uchar3 a, uchar3 b) { + uchar x = ::abs(a.x - b.x); + uchar y = ::abs(a.y - b.y); + uchar z = ::abs(a.z - b.z); + return (::max(::max(x, y), z)); + } + + template <> + __device__ inline uchar distance(uchar a, uchar b) { + return abs(int(a)-int(b)); + } + + + /*template <int channels> struct DistRgbMax { static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) @@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device uchar z = ::abs(a[2] - b[2]); return (::max(::max(x, y), z)); } + }; + + template <> + struct DistRgbMax<4> + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + const uchar4 aa = *(uchar4*)a; + const uchar4 bb = *(uchar4*)b; + uchar x = ::abs(aa.x - bb.x); + uchar y = ::abs(aa.y - bb.y); + uchar z = ::abs(aa.z - bb.z); + return (::max(::max(x, y), z)); + } }; template <> @@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device { return ::abs(a[0] - b[0]); } - }; + };*/ + + __device__ inline float calc_colour_weight(int d) { + return exp(-float(d * d) / (2.0f * 10.0f * 10.0f)); + } template <typename T> __device__ inline T Abs(T v) { return ::abs(v); } @@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device template <> __device__ inline float Abs<float>(float v) { return fabsf(v); } - template <int channels, typename T> - __global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, - const uchar* img, size_t img_step, int h, int w, - const float* ctable_color, const float * ctable_space, size_t ctable_space_step, - int cradius, + template <typename C, int CRADIUS, typename T> + __global__ void disp_bilateral_filter(int t, const T* __restrict__ disp, T* __restrict__ dispout, size_t disp_step, + const C* __restrict__ img, size_t img_step, int h, int w, + const float* __restrict__ ctable_color, T cedge_disc, T cmax_disc) { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + __shared__ float s_space[(CRADIUS+1)*(CRADIUS+1)]; + __shared__ short2 s_queue[4096]; // Depends on pixels per block + __shared__ int s_counter; + + // Create gaussian lookup for spatial weighting + for (int i=threadIdx.x+threadIdx.y*blockDim.x; i<(CRADIUS+1)*(CRADIUS+1); ++i) { + const int y = i / (CRADIUS+1); + const int x = i % (CRADIUS+1); + s_space[i] = exp(-sqrt(float(y * y) + float(x * x)) / float(CRADIUS+1)); + } + if (threadIdx.x == 0 && threadIdx.y == 0) s_counter = 0; + __syncthreads(); - T dp[5]; + // Check all pixels to see if they need processing + for (STRIDE_Y(y, h)) { + for (STRIDE_X(x, w)) { + bool todo_pixel = false; + if (y >= CRADIUS && y < h - CRADIUS && x >= CRADIUS && x < w - CRADIUS) { + T dp[5]; + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + *(dispout + y * disp_step + x) = dp[0]; - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) - { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); - - if(Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc) - { - const int ymin = ::max(0, y - cradius); - const int xmin = ::max(0, x - cradius); - const int ymax = ::min(h - 1, y + cradius); - const int xmax = ::min(w - 1, x + cradius); - - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - - const uchar* ic = img + y * img_step + channels * x; - - for(int yi = ymin; yi <= ymax; yi++) - { - const T* disp_y = disp + yi * disp_step; - - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; - - uchar dist_rgb = DistRgbMax<channels>::calc(in, ic); - - const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; - - const T disp_reg = disp_y[xi]; - - cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight; - cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight; - cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight; - cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight; - cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight; - } - } - - float minimum = numeric_limits<float>::max(); - int id = 0; - - if (cost[0] < minimum) - { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = cost[4]; - id = 4; - } - - *(disp + y * disp_step + x) = dp[id]; + todo_pixel = (Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc); + } + + // Count valid pixels and warp and allocate space for them + const uint bal = __ballot_sync(0xFFFFFFFF, todo_pixel); + int index = 0; + if (threadIdx.x%32 == 0) { + index = atomicAdd(&s_counter, __popc(bal)); } + index = __shfl_sync(0xFFFFFFFF, index, 0, 32); + index += __popc(bal >> (threadIdx.x%32)) - 1; + if (todo_pixel) s_queue[index] = make_short2(x,y); + } } + + // Switch to processing mode + __syncthreads(); + + const int counter = s_counter; + + // Stride the queue to reduce bank conflicts + // Each thread takes a pixel that needs processing + for (int ix=(threadIdx.x + threadIdx.y*blockDim.x); ix<counter; ix+=(blockDim.x*blockDim.y)) { + const short2 pt = s_queue[ix]; + const int x = pt.x; + const int y = pt.y; + + T dp[5]; + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const C ic = *(img + y * img_step + x); + + //#pragma unroll + // Note: Don't unroll this one! + for(int yi = -CRADIUS; yi <= CRADIUS; ++yi) + { + const T* disp_y = disp + (y + yi) * disp_step; + + #pragma unroll + for(int xi = -CRADIUS; xi <= CRADIUS; ++xi) { + const C in = *(img + (y+yi) * img_step + (xi+x)); + + uchar dist_rgb = distance(ic,in); + + // The bilateral part of the filter + const float weight = ctable_color[dist_rgb] * s_space[::abs(yi)*(CRADIUS+1) + ::abs(xi)]; + + const T disp_reg = disp_y[x+xi]; + + // The "joint" part checking for depth similarity + cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight; + cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight; + cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight; + cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight; + cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = cost[0]; + int id = 0; + + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(dispout + y * disp_step + x) = dp[id]; + } } - template <typename T> - void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float* table_space, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream) + template <typename T, typename C> + void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *table_color, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream) { dim3 threads(32, 8, 1); dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); + grid.x = (disp.cols + 64 - 1) / 64; // 64*64 = 4096, max pixels in block + grid.y = (disp.rows + 64 - 1) / 64; - switch (channels) - { - case 1: - for (int i = 0; i < iters; ++i) - { - disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); - - disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); - } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); - - disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); - } - break; - case 4: // Nick: Support 4 channel - for (int i = 0; i < iters; ++i) - { - disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); - - disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); - cudaSafeCall( cudaGetLastError() ); + T *in_ptr = disp.data; + T *out_ptr = dispout.data; + + // Iters must be odd. + if (iters & 0x1 == 0) iters += 1; + + switch (radius) { + case 1 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,1><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 2 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,2><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 3 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,3><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 4 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,4><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 5 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,5><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 6 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,6><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + case 7 : + for (int i = 0; i < iters; ++i) { + disp_bilateral_filter<C,7><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc); + cudaSafeCall( cudaGetLastError() ); + std::swap(in_ptr, out_ptr); + } break; + default: + CV_Error(cv::Error::BadTileSize, "Unsupported kernel radius"); } - break; - default: - CV_Error(cv::Error::BadNumChannels, "Unsupported channels count"); - } + if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } - template void disp_bilateral_filter<uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); - template void disp_bilateral_filter<short>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream); - template void disp_bilateral_filter<float>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, float, float, cudaStream_t stream); + // These are commented out since we don't use them and it slows compile + //template void disp_bilateral_filter<uchar,uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); + //template void disp_bilateral_filter<short,uchar>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream); + //template void disp_bilateral_filter<float,uchar>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream); + + //template void disp_bilateral_filter<uchar,uchar3>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); + //template void disp_bilateral_filter<short,uchar3>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream); + //template void disp_bilateral_filter<float,uchar3>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream); + + template void disp_bilateral_filter<uchar,uchar4>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); + template void disp_bilateral_filter<short,uchar4>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream); + template void disp_bilateral_filter<float,uchar4>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream); + } // namespace bilateral_filter }}} // namespace ftl { namespace cuda { namespace cudev diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp b/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp index b6c0d79fa3342dda8a6674d1290e07b9737c055a..80d9799ad2c51977efa2fddfcb8eba6b7c97aac4 100644 --- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp +++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.hpp @@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device { namespace disp_bilateral_filter { - template<typename T> - void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream); + template<typename T, typename C> + void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream); } }}}