Skip to content
Snippets Groups Projects
Commit c931b499 authored by Nicolas Pope's avatar Nicolas Pope
Browse files

Improved bilateral filter performance

parent e76cc98b
No related branches found
No related tags found
No related merge requests found
...@@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) { ...@@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) {
void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) { void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) {
if (!isVirtual() && ((1 << fsid) & fsmask_)) { if (!isVirtual() && ((1 << fsid) & fsmask_)) {
channels_ = c; channels_ += c;
if (c.has(Channel::Depth)) { //if (c.has(Channel::Depth)) {
//channels_ += Channel::ColourNormals; //channels_ += Channel::ColourNormals;
} //}
} }
} }
...@@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) { ...@@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) {
if ((size_t)fid_ >= fs->frames.size()) return; if ((size_t)fid_ >= fs->frames.size()) return;
frame = &fs->frames[fid_]; frame = &fs->frames[fid_];
channels_ = frame->getChannels();
if (frame->hasChannel(Channel::Messages)) { if (frame->hasChannel(Channel::Messages)) {
msgs_.clear(); msgs_.clear();
......
...@@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) { ...@@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) {
ftl::codecs::Channels<0> channels; ftl::codecs::Channels<0> channels;
if (fromstream) channels = cstream->available(fs.id); if (fromstream) channels = cstream->available(fs.id);
if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels(); //if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels();
cam.second.camera->update(fs.id, channels); cam.second.camera->update(fs.id, channels);
} }
++cycle_; ++cycle_;
......
...@@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ...@@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out,
const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
GpuMat &depth = in.get<GpuMat>(channel_); GpuMat &depth = in.get<GpuMat>(channel_);
ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_, UNUSED(rgb);
table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float), UNUSED(depth);
radius_, edge_disc_, max_disc_, stream);
// FIXME: Not working right now
//ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_,
// table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float),
// radius_, edge_disc_, max_disc_, stream);
//disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
//filter_->apply(disp_in, rgb, disp_out, cvstream); //filter_->apply(disp_in, rgb, disp_out, cvstream);
...@@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda ...@@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
rbuf_.resize(in.frames.size()); rbuf_.resize(in.frames.size());
for (size_t i=0; i<in.frames.size(); ++i) { for (size_t i=0; i<in.frames.size(); ++i) {
if (!in.hasFrame(i)) continue;
auto &f = in.frames[i]; auto &f = in.frames[i];
if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) { if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) {
_createPipeline(); _createPipeline();
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#include "opencv/joint_bilateral.hpp" #include "opencv/joint_bilateral.hpp"
#include "cuda.hpp" #include "cuda.hpp"
#include <opencv2/cudaimgproc.hpp>
using cv::cuda::GpuMat; using cv::cuda::GpuMat;
using cv::Size; using cv::Size;
...@@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) : ...@@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) :
scale_ = 16.0; scale_ = 16.0;
n_disp_ = cfg->value("n_disp", 256); n_disp_ = cfg->value("n_disp", 256);
radius_ = cfg->value("radius", 7); radius_ = cfg->value("radius", 4);
iter_ = cfg->value("iter", 13); iter_ = cfg->value("iter", 13);
filter_ = nullptr; filter_ = nullptr;
} }
...@@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out ...@@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out
if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_); if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_);
filter_->setNumIters(config()->value("iter", 13));
auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity); GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity);
GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity); GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity);
disp_out.create(disp_in.size(), disp_in.type()); disp_int_.create(disp_in.size(), disp_in.type());
disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
filter_->apply(disp_int_, rgb, disp_int_result_, cvstream); //cv::cuda::cvtColor(rgb, bw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream); filter_->apply(disp_in, rgb, disp_int_, cvstream);
cv::cuda::swap(disp_out, disp_int_);
//disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream);
return true; return true;
} }
\ No newline at end of file
...@@ -6,13 +6,13 @@ ...@@ -6,13 +6,13 @@
#define PINF __int_as_float(0x7f800000) #define PINF __int_as_float(0x7f800000)
#endif #endif
__global__ void d2d_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, __global__ void d2d_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
ftl::rgbd::Camera cam) { ftl::rgbd::Camera cam) {
for (STRIDE_Y(v,disp.rows)) { for (STRIDE_Y(v,disp.rows)) {
for (STRIDE_X(u,disp.cols)) { for (STRIDE_X(u,disp.cols)) {
float d = disp(v,u); short d = disp(v,u);
depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / (d - cam.doffs)); depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / ((float(d)/16.0f) - cam.doffs));
} }
} }
} }
...@@ -34,14 +34,14 @@ namespace cuda { ...@@ -34,14 +34,14 @@ namespace cuda {
//============================================================================== //==============================================================================
__global__ void d2drev_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, __global__ void d2drev_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
ftl::rgbd::Camera cam) { ftl::rgbd::Camera cam) {
for (STRIDE_Y(v,disp.rows)) { for (STRIDE_Y(v,disp.rows)) {
for (STRIDE_X(u,disp.cols)) { for (STRIDE_X(u,disp.cols)) {
float d = depth(v,u); float d = depth(v,u);
float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs; float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs;
disp(v,u) = disparity; disp(v,u) = short(disparity*16.0f);
} }
} }
} }
......
...@@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { ...@@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
if (!init()) { return false; } if (!init()) { return false; }
} }
auto &disp = out.create<GpuMat>(Channel::Disparity, Format<float>(l.size())); auto &disp = out.create<GpuMat>(Channel::Disparity, Format<short>(l.size()));
auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream); cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
...@@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { ...@@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows)); // GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
// left_pixels.setTo(0); // left_pixels.setTo(0);
cv::cuda::threshold(disp_int_, disp_int_, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream); cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream); //disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream);
return true; return true;
} }
...@@ -167,7 +167,11 @@ namespace ...@@ -167,7 +167,11 @@ namespace
if (dst.data != disp.data) if (dst.data != disp.data)
disp.copyTo(dst, stream); disp.copyTo(dst, stream);
disp_bilateral_filter<T>(dst, img, img.channels(), iters, table_color.ptr<float>(), (float *)table_space.data, table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream)); if (img.channels() == 4) {
disp_bilateral_filter<T,uchar4>(disp, dst, img, iters, table_color.ptr<float>(), table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream));
} else {
// TODO: If we need other channels...
}
} }
void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream) void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
...@@ -184,7 +188,8 @@ namespace ...@@ -184,7 +188,8 @@ namespace
GpuMat img = _image.getGpuMat(); GpuMat img = _image.getGpuMat();
CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S ); CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 ); //CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 );
CV_Assert( img.type() == CV_8UC4 ); // Nick: We only need/allow 4 channel
CV_Assert( disp.size() == img.size() ); CV_Assert( disp.size() == img.size() );
operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_, operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
......
...@@ -47,15 +47,55 @@ ...@@ -47,15 +47,55 @@
#include "disparity_bilateral_filter.hpp" #include "disparity_bilateral_filter.hpp"
#include <ftl/cuda_common.hpp>
#include <ftl/cuda/weighting.hpp>
using namespace cv::cuda::device; using namespace cv::cuda::device;
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv; using namespace cv;
#define WARP_SIZE 32
#define FULL_MASK 0xFFFFFFFFu
#define PIXELS_PER_LOOP 16
namespace ftl { namespace cuda { namespace device namespace ftl { namespace cuda { namespace device
{ {
namespace disp_bilateral_filter namespace disp_bilateral_filter
{ {
template <int channels>
template <typename C>
__device__ inline uchar distance(C a, C b);
template <>
__device__ inline uchar distance(uchar4 a, uchar4 b) {
uchar x = ::abs(a.x - b.x);
uchar y = ::abs(a.y - b.y);
uchar z = ::abs(a.z - b.z);
return (::max(::max(x, y), z));
/*union {
unsigned int v;
uchar d[4];
};
v = __vabsdiffs4(*(unsigned int*)&a, *(unsigned int*)&b);
return (::max(::max(d[0], d[1]), d[2]));*/
}
template <>
__device__ inline uchar distance(uchar3 a, uchar3 b) {
uchar x = ::abs(a.x - b.x);
uchar y = ::abs(a.y - b.y);
uchar z = ::abs(a.z - b.z);
return (::max(::max(x, y), z));
}
template <>
__device__ inline uchar distance(uchar a, uchar b) {
return abs(int(a)-int(b));
}
/*template <int channels>
struct DistRgbMax struct DistRgbMax
{ {
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
...@@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device ...@@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device
uchar z = ::abs(a[2] - b[2]); uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z)); return (::max(::max(x, y), z));
} }
};
template <>
struct DistRgbMax<4>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
const uchar4 aa = *(uchar4*)a;
const uchar4 bb = *(uchar4*)b;
uchar x = ::abs(aa.x - bb.x);
uchar y = ::abs(aa.y - bb.y);
uchar z = ::abs(aa.z - bb.z);
return (::max(::max(x, y), z));
}
}; };
template <> template <>
...@@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device ...@@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device
{ {
return ::abs(a[0] - b[0]); return ::abs(a[0] - b[0]);
} }
}; };*/
__device__ inline float calc_colour_weight(int d) {
return exp(-float(d * d) / (2.0f * 10.0f * 10.0f));
}
template <typename T> template <typename T>
__device__ inline T Abs(T v) { return ::abs(v); } __device__ inline T Abs(T v) { return ::abs(v); }
...@@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device ...@@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device
template <> template <>
__device__ inline float Abs<float>(float v) { return fabsf(v); } __device__ inline float Abs<float>(float v) { return fabsf(v); }
template <int channels, typename T> template <typename C, int CRADIUS, typename T>
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, __global__ void disp_bilateral_filter(int t, const T* __restrict__ disp, T* __restrict__ dispout, size_t disp_step,
const uchar* img, size_t img_step, int h, int w, const C* __restrict__ img, size_t img_step, int h, int w,
const float* ctable_color, const float * ctable_space, size_t ctable_space_step, const float* __restrict__ ctable_color,
int cradius,
T cedge_disc, T cmax_disc) T cedge_disc, T cmax_disc)
{ {
const int y = blockIdx.y * blockDim.y + threadIdx.y; __shared__ float s_space[(CRADIUS+1)*(CRADIUS+1)];
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); __shared__ short2 s_queue[4096]; // Depends on pixels per block
__shared__ int s_counter;
// Create gaussian lookup for spatial weighting
for (int i=threadIdx.x+threadIdx.y*blockDim.x; i<(CRADIUS+1)*(CRADIUS+1); ++i) {
const int y = i / (CRADIUS+1);
const int x = i % (CRADIUS+1);
s_space[i] = exp(-sqrt(float(y * y) + float(x * x)) / float(CRADIUS+1));
}
if (threadIdx.x == 0 && threadIdx.y == 0) s_counter = 0;
__syncthreads();
T dp[5]; // Check all pixels to see if they need processing
for (STRIDE_Y(y, h)) {
for (STRIDE_X(x, w)) {
bool todo_pixel = false;
if (y >= CRADIUS && y < h - CRADIUS && x >= CRADIUS && x < w - CRADIUS) {
T dp[5];
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
*(dispout + y * disp_step + x) = dp[0];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1) todo_pixel = (Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc);
{ }
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0); // Count valid pixels and warp and allocate space for them
dp[2] = *(disp + (y ) * disp_step + x - 1); const uint bal = __ballot_sync(0xFFFFFFFF, todo_pixel);
dp[3] = *(disp + (y+1) * disp_step + x + 0); int index = 0;
dp[4] = *(disp + (y ) * disp_step + x + 1); if (threadIdx.x%32 == 0) {
index = atomicAdd(&s_counter, __popc(bal));
if(Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
} }
index = __shfl_sync(0xFFFFFFFF, index, 0, 32);
index += __popc(bal >> (threadIdx.x%32)) - 1;
if (todo_pixel) s_queue[index] = make_short2(x,y);
}
} }
// Switch to processing mode
__syncthreads();
const int counter = s_counter;
// Stride the queue to reduce bank conflicts
// Each thread takes a pixel that needs processing
for (int ix=(threadIdx.x + threadIdx.y*blockDim.x); ix<counter; ix+=(blockDim.x*blockDim.y)) {
const short2 pt = s_queue[ix];
const int x = pt.x;
const int y = pt.y;
T dp[5];
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const C ic = *(img + y * img_step + x);
//#pragma unroll
// Note: Don't unroll this one!
for(int yi = -CRADIUS; yi <= CRADIUS; ++yi)
{
const T* disp_y = disp + (y + yi) * disp_step;
#pragma unroll
for(int xi = -CRADIUS; xi <= CRADIUS; ++xi) {
const C in = *(img + (y+yi) * img_step + (xi+x));
uchar dist_rgb = distance(ic,in);
// The bilateral part of the filter
const float weight = ctable_color[dist_rgb] * s_space[::abs(yi)*(CRADIUS+1) + ::abs(xi)];
const T disp_reg = disp_y[x+xi];
// The "joint" part checking for depth similarity
cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
}
}
float minimum = cost[0];
int id = 0;
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(dispout + y * disp_step + x) = dp[id];
}
} }
template <typename T> template <typename T, typename C>
void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float* table_space, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream) void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *table_color, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1); grid.x = (disp.cols + 64 - 1) / 64; // 64*64 = 4096, max pixels in block
grid.y = divUp(disp.rows, threads.y); grid.y = (disp.rows + 64 - 1) / 64;
switch (channels) T *in_ptr = disp.data;
{ T *out_ptr = dispout.data;
case 1:
for (int i = 0; i < iters; ++i) // Iters must be odd.
{ if (iters & 0x1 == 0) iters += 1;
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() ); switch (radius) {
case 1 :
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); for (int i = 0; i < iters; ++i) {
cudaSafeCall( cudaGetLastError() ); disp_bilateral_filter<C,1><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
} cudaSafeCall( cudaGetLastError() );
break; std::swap(in_ptr, out_ptr);
case 3: } break;
for (int i = 0; i < iters; ++i) case 2 :
{ for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); disp_bilateral_filter<C,2><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); } break;
cudaSafeCall( cudaGetLastError() ); case 3 :
} for (int i = 0; i < iters; ++i) {
break; disp_bilateral_filter<C,3><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
case 4: // Nick: Support 4 channel cudaSafeCall( cudaGetLastError() );
for (int i = 0; i < iters; ++i) std::swap(in_ptr, out_ptr);
{ } break;
disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); case 4 :
cudaSafeCall( cudaGetLastError() ); for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,4><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() ); std::swap(in_ptr, out_ptr);
} break;
case 5 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,5><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
case 6 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,6><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
case 7 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,7><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
default:
CV_Error(cv::Error::BadTileSize, "Unsupported kernel radius");
} }
break;
default:
CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void disp_bilateral_filter<uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); // These are commented out since we don't use them and it slows compile
template void disp_bilateral_filter<short>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream); //template void disp_bilateral_filter<uchar,uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
template void disp_bilateral_filter<float>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, float, float, cudaStream_t stream); //template void disp_bilateral_filter<short,uchar>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
//template void disp_bilateral_filter<float,uchar>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
//template void disp_bilateral_filter<uchar,uchar3>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
//template void disp_bilateral_filter<short,uchar3>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
//template void disp_bilateral_filter<float,uchar3>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
template void disp_bilateral_filter<uchar,uchar4>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
template void disp_bilateral_filter<short,uchar4>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
template void disp_bilateral_filter<float,uchar4>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
} // namespace bilateral_filter } // namespace bilateral_filter
}}} // namespace ftl { namespace cuda { namespace cudev }}} // namespace ftl { namespace cuda { namespace cudev
......
...@@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device ...@@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device
{ {
namespace disp_bilateral_filter namespace disp_bilateral_filter
{ {
template<typename T> template<typename T, typename C>
void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream); void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream);
} }
}}} }}}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment