Skip to content
Snippets Groups Projects
Commit 8c315324 authored by Nicolas Pope's avatar Nicolas Pope
Browse files

Merge branch 'exp/bilatperf' into 'master'

Improved bilateral filter performance

See merge request nicolas.pope/ftl!287
parents e76cc98b c931b499
Branches
Tags
1 merge request!287Improved bilateral filter performance
Pipeline #22842 passed
...@@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) { ...@@ -433,10 +433,10 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) {
void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) { void ftl::gui::Camera::update(int fsid, const ftl::codecs::Channels<0> &c) {
if (!isVirtual() && ((1 << fsid) & fsmask_)) { if (!isVirtual() && ((1 << fsid) & fsmask_)) {
channels_ = c; channels_ += c;
if (c.has(Channel::Depth)) { //if (c.has(Channel::Depth)) {
//channels_ += Channel::ColourNormals; //channels_ += Channel::ColourNormals;
} //}
} }
} }
...@@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) { ...@@ -469,6 +469,7 @@ void ftl::gui::Camera::update(std::vector<ftl::rgbd::FrameSet*> &fss) {
if ((size_t)fid_ >= fs->frames.size()) return; if ((size_t)fid_ >= fs->frames.size()) return;
frame = &fs->frames[fid_]; frame = &fs->frames[fid_];
channels_ = frame->getChannels();
if (frame->hasChannel(Channel::Messages)) { if (frame->hasChannel(Channel::Messages)) {
msgs_.clear(); msgs_.clear();
......
...@@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) { ...@@ -248,7 +248,7 @@ bool SourceWindow::_processFrameset(ftl::rgbd::FrameSet &fs, bool fromstream) {
ftl::codecs::Channels<0> channels; ftl::codecs::Channels<0> channels;
if (fromstream) channels = cstream->available(fs.id); if (fromstream) channels = cstream->available(fs.id);
if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels(); //if ((*framesets_[fs.id]).frames.size() > 0) channels += (*framesets_[fs.id]).frames[0].getChannels();
cam.second.camera->update(fs.id, channels); cam.second.camera->update(fs.id, channels);
} }
++cycle_; ++cycle_;
......
...@@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, ...@@ -107,9 +107,13 @@ bool DepthBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out,
const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
GpuMat &depth = in.get<GpuMat>(channel_); GpuMat &depth = in.get<GpuMat>(channel_);
ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_, UNUSED(rgb);
table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float), UNUSED(depth);
radius_, edge_disc_, max_disc_, stream);
// FIXME: Not working right now
//ftl::cuda::device::disp_bilateral_filter::disp_bilateral_filter<float>(depth, rgb, rgb.channels(), iter_,
// table_color_.ptr<float>(), (float *)table_space_.data, table_space_.step / sizeof(float),
// radius_, edge_disc_, max_disc_, stream);
//disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
//filter_->apply(disp_in, rgb, disp_out, cvstream); //filter_->apply(disp_in, rgb, disp_out, cvstream);
...@@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda ...@@ -160,6 +164,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
rbuf_.resize(in.frames.size()); rbuf_.resize(in.frames.size());
for (size_t i=0; i<in.frames.size(); ++i) { for (size_t i=0; i<in.frames.size(); ++i) {
if (!in.hasFrame(i)) continue;
auto &f = in.frames[i]; auto &f = in.frames[i];
if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) { if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) {
_createPipeline(); _createPipeline();
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#include "opencv/joint_bilateral.hpp" #include "opencv/joint_bilateral.hpp"
#include "cuda.hpp" #include "cuda.hpp"
#include <opencv2/cudaimgproc.hpp>
using cv::cuda::GpuMat; using cv::cuda::GpuMat;
using cv::Size; using cv::Size;
...@@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) : ...@@ -14,7 +16,7 @@ DisparityBilateralFilter::DisparityBilateralFilter(ftl::Configurable* cfg) :
scale_ = 16.0; scale_ = 16.0;
n_disp_ = cfg->value("n_disp", 256); n_disp_ = cfg->value("n_disp", 256);
radius_ = cfg->value("radius", 7); radius_ = cfg->value("radius", 4);
iter_ = cfg->value("iter", 13); iter_ = cfg->value("iter", 13);
filter_ = nullptr; filter_ = nullptr;
} }
...@@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out ...@@ -46,14 +48,18 @@ bool DisparityBilateralFilter::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out
if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_); if (!filter_) filter_ = ftl::cuda::createDisparityBilateralFilter(n_disp_ * scale_, radius_, iter_);
filter_->setNumIters(config()->value("iter", 13));
auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
const GpuMat &rgb = in.get<GpuMat>(Channel::Colour); const GpuMat &rgb = in.get<GpuMat>(Channel::Colour);
GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity); GpuMat &disp_in = in.get<GpuMat>(Channel::Disparity);
GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity); GpuMat &disp_out = out.create<GpuMat>(Channel::Disparity);
disp_out.create(disp_in.size(), disp_in.type()); disp_int_.create(disp_in.size(), disp_in.type());
disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream); //disp_in.convertTo(disp_int_, CV_16SC1, scale_, cvstream);
filter_->apply(disp_int_, rgb, disp_int_result_, cvstream); //cv::cuda::cvtColor(rgb, bw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream); filter_->apply(disp_in, rgb, disp_int_, cvstream);
cv::cuda::swap(disp_out, disp_int_);
//disp_int_result_.convertTo(disp_out, disp_in.type(), 1.0/scale_, cvstream);
return true; return true;
} }
\ No newline at end of file
...@@ -6,13 +6,13 @@ ...@@ -6,13 +6,13 @@
#define PINF __int_as_float(0x7f800000) #define PINF __int_as_float(0x7f800000)
#endif #endif
__global__ void d2d_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, __global__ void d2d_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
ftl::rgbd::Camera cam) { ftl::rgbd::Camera cam) {
for (STRIDE_Y(v,disp.rows)) { for (STRIDE_Y(v,disp.rows)) {
for (STRIDE_X(u,disp.cols)) { for (STRIDE_X(u,disp.cols)) {
float d = disp(v,u); short d = disp(v,u);
depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / (d - cam.doffs)); depth(v,u) = (d == 0) ? 0.0f : ((cam.baseline*cam.fx) / ((float(d)/16.0f) - cam.doffs));
} }
} }
} }
...@@ -34,14 +34,14 @@ namespace cuda { ...@@ -34,14 +34,14 @@ namespace cuda {
//============================================================================== //==============================================================================
__global__ void d2drev_kernel(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> depth, __global__ void d2drev_kernel(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<float> depth,
ftl::rgbd::Camera cam) { ftl::rgbd::Camera cam) {
for (STRIDE_Y(v,disp.rows)) { for (STRIDE_Y(v,disp.rows)) {
for (STRIDE_X(u,disp.cols)) { for (STRIDE_X(u,disp.cols)) {
float d = depth(v,u); float d = depth(v,u);
float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs; float disparity = (d > cam.maxDepth || d < cam.minDepth) ? 0.0f : ((cam.baseline*cam.fx) / d) + cam.doffs;
disp(v,u) = disparity; disp(v,u) = short(disparity*16.0f);
} }
} }
} }
......
...@@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { ...@@ -123,7 +123,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
if (!init()) { return false; } if (!init()) { return false; }
} }
auto &disp = out.create<GpuMat>(Channel::Disparity, Format<float>(l.size())); auto &disp = out.create<GpuMat>(Channel::Disparity, Format<short>(l.size()));
auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream); cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
...@@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { ...@@ -135,8 +135,8 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows)); // GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
// left_pixels.setTo(0); // left_pixels.setTo(0);
cv::cuda::threshold(disp_int_, disp_int_, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream); cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream); //disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream);
return true; return true;
} }
...@@ -167,7 +167,11 @@ namespace ...@@ -167,7 +167,11 @@ namespace
if (dst.data != disp.data) if (dst.data != disp.data)
disp.copyTo(dst, stream); disp.copyTo(dst, stream);
disp_bilateral_filter<T>(dst, img, img.channels(), iters, table_color.ptr<float>(), (float *)table_space.data, table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream)); if (img.channels() == 4) {
disp_bilateral_filter<T,uchar4>(disp, dst, img, iters, table_color.ptr<float>(), table_space_step, radius, edge_disc, max_disc, StreamAccessor::getStream(stream));
} else {
// TODO: If we need other channels...
}
} }
void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream) void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
...@@ -184,7 +188,8 @@ namespace ...@@ -184,7 +188,8 @@ namespace
GpuMat img = _image.getGpuMat(); GpuMat img = _image.getGpuMat();
CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S ); CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 ); //CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 || img.type() == CV_8UC4 );
CV_Assert( img.type() == CV_8UC4 ); // Nick: We only need/allow 4 channel
CV_Assert( disp.size() == img.size() ); CV_Assert( disp.size() == img.size() );
operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_, operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
......
...@@ -47,15 +47,55 @@ ...@@ -47,15 +47,55 @@
#include "disparity_bilateral_filter.hpp" #include "disparity_bilateral_filter.hpp"
#include <ftl/cuda_common.hpp>
#include <ftl/cuda/weighting.hpp>
using namespace cv::cuda::device; using namespace cv::cuda::device;
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv; using namespace cv;
#define WARP_SIZE 32
#define FULL_MASK 0xFFFFFFFFu
#define PIXELS_PER_LOOP 16
namespace ftl { namespace cuda { namespace device namespace ftl { namespace cuda { namespace device
{ {
namespace disp_bilateral_filter namespace disp_bilateral_filter
{ {
template <int channels>
template <typename C>
__device__ inline uchar distance(C a, C b);
template <>
__device__ inline uchar distance(uchar4 a, uchar4 b) {
uchar x = ::abs(a.x - b.x);
uchar y = ::abs(a.y - b.y);
uchar z = ::abs(a.z - b.z);
return (::max(::max(x, y), z));
/*union {
unsigned int v;
uchar d[4];
};
v = __vabsdiffs4(*(unsigned int*)&a, *(unsigned int*)&b);
return (::max(::max(d[0], d[1]), d[2]));*/
}
template <>
__device__ inline uchar distance(uchar3 a, uchar3 b) {
uchar x = ::abs(a.x - b.x);
uchar y = ::abs(a.y - b.y);
uchar z = ::abs(a.z - b.z);
return (::max(::max(x, y), z));
}
template <>
__device__ inline uchar distance(uchar a, uchar b) {
return abs(int(a)-int(b));
}
/*template <int channels>
struct DistRgbMax struct DistRgbMax
{ {
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
...@@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device ...@@ -66,6 +106,20 @@ namespace ftl { namespace cuda { namespace device
uchar z = ::abs(a[2] - b[2]); uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z)); return (::max(::max(x, y), z));
} }
};
template <>
struct DistRgbMax<4>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
const uchar4 aa = *(uchar4*)a;
const uchar4 bb = *(uchar4*)b;
uchar x = ::abs(aa.x - bb.x);
uchar y = ::abs(aa.y - bb.y);
uchar z = ::abs(aa.z - bb.z);
return (::max(::max(x, y), z));
}
}; };
template <> template <>
...@@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device ...@@ -75,7 +129,11 @@ namespace ftl { namespace cuda { namespace device
{ {
return ::abs(a[0] - b[0]); return ::abs(a[0] - b[0]);
} }
}; };*/
__device__ inline float calc_colour_weight(int d) {
return exp(-float(d * d) / (2.0f * 10.0f * 10.0f));
}
template <typename T> template <typename T>
__device__ inline T Abs(T v) { return ::abs(v); } __device__ inline T Abs(T v) { return ::abs(v); }
...@@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device ...@@ -83,144 +141,210 @@ namespace ftl { namespace cuda { namespace device
template <> template <>
__device__ inline float Abs<float>(float v) { return fabsf(v); } __device__ inline float Abs<float>(float v) { return fabsf(v); }
template <int channels, typename T> template <typename C, int CRADIUS, typename T>
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, __global__ void disp_bilateral_filter(int t, const T* __restrict__ disp, T* __restrict__ dispout, size_t disp_step,
const uchar* img, size_t img_step, int h, int w, const C* __restrict__ img, size_t img_step, int h, int w,
const float* ctable_color, const float * ctable_space, size_t ctable_space_step, const float* __restrict__ ctable_color,
int cradius,
T cedge_disc, T cmax_disc) T cedge_disc, T cmax_disc)
{ {
const int y = blockIdx.y * blockDim.y + threadIdx.y; __shared__ float s_space[(CRADIUS+1)*(CRADIUS+1)];
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); __shared__ short2 s_queue[4096]; // Depends on pixels per block
__shared__ int s_counter;
// Create gaussian lookup for spatial weighting
for (int i=threadIdx.x+threadIdx.y*blockDim.x; i<(CRADIUS+1)*(CRADIUS+1); ++i) {
const int y = i / (CRADIUS+1);
const int x = i % (CRADIUS+1);
s_space[i] = exp(-sqrt(float(y * y) + float(x * x)) / float(CRADIUS+1));
}
if (threadIdx.x == 0 && threadIdx.y == 0) s_counter = 0;
__syncthreads();
T dp[5]; // Check all pixels to see if they need processing
for (STRIDE_Y(y, h)) {
for (STRIDE_X(x, w)) {
bool todo_pixel = false;
if (y >= CRADIUS && y < h - CRADIUS && x >= CRADIUS && x < w - CRADIUS) {
T dp[5];
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
*(dispout + y * disp_step + x) = dp[0];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1) todo_pixel = (Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc);
{ }
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0); // Count valid pixels and warp and allocate space for them
dp[2] = *(disp + (y ) * disp_step + x - 1); const uint bal = __ballot_sync(0xFFFFFFFF, todo_pixel);
dp[3] = *(disp + (y+1) * disp_step + x + 0); int index = 0;
dp[4] = *(disp + (y ) * disp_step + x + 1); if (threadIdx.x%32 == 0) {
index = atomicAdd(&s_counter, __popc(bal));
if(Abs(dp[1] - dp[0]) >= cedge_disc || Abs(dp[2] - dp[0]) >= cedge_disc || Abs(dp[3] - dp[0]) >= cedge_disc || Abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
} }
index = __shfl_sync(0xFFFFFFFF, index, 0, 32);
index += __popc(bal >> (threadIdx.x%32)) - 1;
if (todo_pixel) s_queue[index] = make_short2(x,y);
}
} }
// Switch to processing mode
__syncthreads();
const int counter = s_counter;
// Stride the queue to reduce bank conflicts
// Each thread takes a pixel that needs processing
for (int ix=(threadIdx.x + threadIdx.y*blockDim.x); ix<counter; ix+=(blockDim.x*blockDim.y)) {
const short2 pt = s_queue[ix];
const int x = pt.x;
const int y = pt.y;
T dp[5];
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const C ic = *(img + y * img_step + x);
//#pragma unroll
// Note: Don't unroll this one!
for(int yi = -CRADIUS; yi <= CRADIUS; ++yi)
{
const T* disp_y = disp + (y + yi) * disp_step;
#pragma unroll
for(int xi = -CRADIUS; xi <= CRADIUS; ++xi) {
const C in = *(img + (y+yi) * img_step + (xi+x));
uchar dist_rgb = distance(ic,in);
// The bilateral part of the filter
const float weight = ctable_color[dist_rgb] * s_space[::abs(yi)*(CRADIUS+1) + ::abs(xi)];
const T disp_reg = disp_y[x+xi];
// The "joint" part checking for depth similarity
cost[0] += ::min(cmax_disc, Abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, Abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, Abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, Abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, Abs(disp_reg - dp[4])) * weight;
}
}
float minimum = cost[0];
int id = 0;
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(dispout + y * disp_step + x) = dp[id];
}
} }
template <typename T> template <typename T, typename C>
void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float* table_space, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream) void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *table_color, size_t table_step, int radius, T edge_disc, T max_disc, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1); grid.x = (disp.cols + 64 - 1) / 64; // 64*64 = 4096, max pixels in block
grid.y = divUp(disp.rows, threads.y); grid.y = (disp.rows + 64 - 1) / 64;
switch (channels) T *in_ptr = disp.data;
{ T *out_ptr = dispout.data;
case 1:
for (int i = 0; i < iters; ++i) // Iters must be odd.
{ if (iters & 0x1 == 0) iters += 1;
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() ); switch (radius) {
case 1 :
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); for (int i = 0; i < iters; ++i) {
cudaSafeCall( cudaGetLastError() ); disp_bilateral_filter<C,1><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
} cudaSafeCall( cudaGetLastError() );
break; std::swap(in_ptr, out_ptr);
case 3: } break;
for (int i = 0; i < iters; ++i) case 2 :
{ for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); disp_bilateral_filter<C,2><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); } break;
cudaSafeCall( cudaGetLastError() ); case 3 :
} for (int i = 0; i < iters; ++i) {
break; disp_bilateral_filter<C,3><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
case 4: // Nick: Support 4 channel cudaSafeCall( cudaGetLastError() );
for (int i = 0; i < iters; ++i) std::swap(in_ptr, out_ptr);
{ } break;
disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); case 4 :
cudaSafeCall( cudaGetLastError() ); for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,4><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
disp_bilateral_filter<4><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols, table_color, table_space, table_step, radius, edge_disc, max_disc); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaGetLastError() ); std::swap(in_ptr, out_ptr);
} break;
case 5 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,5><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
case 6 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,6><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
case 7 :
for (int i = 0; i < iters; ++i) {
disp_bilateral_filter<C,7><<<grid, threads, 0, stream>>>(0, in_ptr, out_ptr, disp.step/sizeof(T), (C*)img.data, img.step/sizeof(C), disp.rows, disp.cols, table_color, edge_disc, max_disc);
cudaSafeCall( cudaGetLastError() );
std::swap(in_ptr, out_ptr);
} break;
default:
CV_Error(cv::Error::BadTileSize, "Unsupported kernel radius");
} }
break;
default:
CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void disp_bilateral_filter<uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, uchar, uchar, cudaStream_t stream); // These are commented out since we don't use them and it slows compile
template void disp_bilateral_filter<short>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, short, short, cudaStream_t stream); //template void disp_bilateral_filter<uchar,uchar>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
template void disp_bilateral_filter<float>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *table_color, const float *table_space, size_t table_step, int radius, float, float, cudaStream_t stream); //template void disp_bilateral_filter<short,uchar>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
//template void disp_bilateral_filter<float,uchar>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
//template void disp_bilateral_filter<uchar,uchar3>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
//template void disp_bilateral_filter<short,uchar3>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
//template void disp_bilateral_filter<float,uchar3>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar3> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
template void disp_bilateral_filter<uchar,uchar4>(cv::cuda::PtrStepSz<uchar> disp, cv::cuda::PtrStepSz<uchar> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, uchar, uchar, cudaStream_t stream);
template void disp_bilateral_filter<short,uchar4>(cv::cuda::PtrStepSz<short> disp, cv::cuda::PtrStepSz<short> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, short, short, cudaStream_t stream);
template void disp_bilateral_filter<float,uchar4>(cv::cuda::PtrStepSz<float> disp, cv::cuda::PtrStepSz<float> dispout, cv::cuda::PtrStepSz<uchar4> img, int iters, const float *table_color, size_t table_step, int radius, float, float, cudaStream_t stream);
} // namespace bilateral_filter } // namespace bilateral_filter
}}} // namespace ftl { namespace cuda { namespace cudev }}} // namespace ftl { namespace cuda { namespace cudev
......
...@@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device ...@@ -2,7 +2,7 @@ namespace ftl { namespace cuda { namespace device
{ {
namespace disp_bilateral_filter namespace disp_bilateral_filter
{ {
template<typename T> template<typename T, typename C>
void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSzb img, int channels, int iters, const float *, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream); void disp_bilateral_filter(cv::cuda::PtrStepSz<T> disp, cv::cuda::PtrStepSz<T> dispout, cv::cuda::PtrStepSz<C> img, int iters, const float *, size_t, int radius, T edge_disc, T max_disc, cudaStream_t stream);
} }
}}} }}}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment