From f5bb48a5f661466612c3c3207f416b0de5d9f611 Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nicolas.pope@utu.fi> Date: Wed, 25 Mar 2020 09:12:43 +0200 Subject: [PATCH] Implements #331 adaptive SGM penalties --- .../include/ftl/operators/cuda/disparity.hpp | 10 ++ .../include/ftl/operators/disparity.hpp | 16 ++- components/operators/src/depth.cpp | 8 +- .../operators/src/disparity/disp2depth.cu | 104 ++++++++++++++++++ .../operators/src/disparity/fixstars_sgm.cpp | 60 +++++++++- lib/libsgm/include/libsgm.h | 5 +- lib/libsgm/src/check_consistency.cu | 12 +- lib/libsgm/src/horizontal_path_aggregation.cu | 38 ++++--- .../src/horizontal_path_aggregation.hpp | 6 +- lib/libsgm/src/internal.h | 8 +- lib/libsgm/src/median_filter.cu | 12 +- lib/libsgm/src/oblique_path_aggregation.cu | 65 +++++++---- lib/libsgm/src/oblique_path_aggregation.hpp | 12 +- lib/libsgm/src/path_aggregation.cu | 19 ++-- lib/libsgm/src/path_aggregation.hpp | 3 +- lib/libsgm/src/sgm.cu | 15 ++- lib/libsgm/src/sgm.hpp | 7 +- lib/libsgm/src/stereo_sgm.cpp | 20 ++-- lib/libsgm/src/vertical_path_aggregation.cu | 37 ++++--- lib/libsgm/src/vertical_path_aggregation.hpp | 6 +- 20 files changed, 343 insertions(+), 120 deletions(-) diff --git a/components/operators/include/ftl/operators/cuda/disparity.hpp b/components/operators/include/ftl/operators/cuda/disparity.hpp index de4648f6f..75034d9e1 100644 --- a/components/operators/include/ftl/operators/cuda/disparity.hpp +++ b/components/operators/include/ftl/operators/cuda/disparity.hpp @@ -19,6 +19,16 @@ void mask_occlusions(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &mask, const ftl::rgbd::Camera &c, cudaStream_t stream); +void check_reprojection(const cv::cuda::GpuMat &disp, + const ftl::cuda::TextureObject<uchar4> &left, const ftl::cuda::TextureObject<uchar4> &right, + cudaStream_t stream); + +void show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, const cv::cuda::GpuMat &right, + float scale, cudaStream_t stream); + +void show_disp_density(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, + float scale, cudaStream_t stream); + void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow, cv::cuda::GpuMat &history, cv::cuda::GpuMat &support, int n_max, float threshold, bool fill, diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp index 2f9a807cd..eea830cc4 100644 --- a/components/operators/include/ftl/operators/disparity.hpp +++ b/components/operators/include/ftl/operators/disparity.hpp @@ -11,6 +11,7 @@ #ifdef HAVE_LIBSGM #include <libsgm.h> +#include <opencv2/cudaimgproc.hpp> #endif namespace ftl { @@ -19,7 +20,7 @@ namespace operators { #ifdef HAVE_LIBSGM /* * FixstarsSGM https://github.com/fixstars/libSGM - * + * * Requires modified version https://gitlab.utu.fi/joseha/libsgm */ class FixstarsSGM : public ftl::operators::Operator { @@ -31,21 +32,28 @@ class FixstarsSGM : public ftl::operators::Operator { bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override; bool isMemoryHeavy() const override { return true; } - + private: bool init(); bool updateParameters(); - + bool updateP2Parameters(); + void computeP2(cudaStream_t &stream); + sgm::StereoSGM *ssgm_; cv::Size size_; cv::cuda::GpuMat lbw_; cv::cuda::GpuMat rbw_; cv::cuda::GpuMat disp_int_; + cv::cuda::GpuMat P2_map_; + cv::cuda::GpuMat edges_; + cv::Ptr<cv::cuda::CannyEdgeDetector> canny_; + int P1_; int P2_; int max_disp_; float uniqueness_; + bool use_P2_map_; }; #endif @@ -101,7 
+109,7 @@ class DepthChannel : public ftl::operators::Operator { std::vector<cv::cuda::GpuMat> rbuf_; cv::Size depth_size_; - void _createPipeline(); + void _createPipeline(size_t); }; /* diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp index 59b50a1fe..5ffb6bbaf 100644 --- a/components/operators/src/depth.cpp +++ b/components/operators/src/depth.cpp @@ -131,7 +131,7 @@ DepthChannel::~DepthChannel() { } -void DepthChannel::_createPipeline() { +void DepthChannel::_createPipeline(size_t size) { if (pipe_ != nullptr) return; pipe_ = ftl::config::create<ftl::operators::Graph>(config(), "depth"); @@ -145,7 +145,7 @@ void DepthChannel::_createPipeline() { #endif #ifdef HAVE_OPTFLOW pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow); - pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity); + if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity); #endif pipe_->append<ftl::operators::DisparityBilateralFilter>("bilateral_filter"); //pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity); @@ -167,7 +167,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda if (!in.hasFrame(i)) continue; auto &f = in.frames[i]; if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) { - _createPipeline(); + _createPipeline(in.frames.size()); cv::cuda::GpuMat& left = f.get<cv::cuda::GpuMat>(Channel::Left); cv::cuda::GpuMat& right = f.get<cv::cuda::GpuMat>(Channel::Right); @@ -202,7 +202,7 @@ bool DepthChannel::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream auto &f = in; if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) { - _createPipeline(); + _createPipeline(1); cv::cuda::GpuMat& left = f.get<cv::cuda::GpuMat>(Channel::Left); cv::cuda::GpuMat& right = f.get<cv::cuda::GpuMat>(Channel::Right); diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu index c0011adf4..bed3bc3e8 100644 --- a/components/operators/src/disparity/disp2depth.cu +++ b/components/operators/src/disparity/disp2depth.cu @@ -137,3 +137,107 @@ void ftl::cuda::mask_occlusions(const cv::cuda::GpuMat &depth, const cv::cuda::G depth, depthR, mask, c); cudaSafeCall( cudaGetLastError() ); } + + +// ============================================================================= + +__global__ void check_reprojection_kernel(cv::cuda::PtrStepSz<short> disp, + ftl::cuda::TextureObject<uchar4> left, + ftl::cuda::TextureObject<uchar4> right) +{ + for (STRIDE_Y(v,disp.rows)) { + for (STRIDE_X(u,disp.cols)) { + const float d = float(disp(v,u)) / 16.0f; + const float4 l = left.tex2D(float(u)+0.5f, float(v)+0.5f); + + if (d > 0) { + const float4 r = right.tex2D(float(u-d)+0.5f, float(v)+0.5f); + const float diff = max(fabsf(l.x-r.x),max(fabsf(l.y-r.y), fabsf(l.z-r.z))); + if (diff > 10.0f) disp(v,u) = 0; + } + } + } +} + +void ftl::cuda::check_reprojection(const cv::cuda::GpuMat &disp, const ftl::cuda::TextureObject<uchar4> &left, const ftl::cuda::TextureObject<uchar4> &right, cudaStream_t stream) { + dim3 grid(1,1,1); + dim3 threads(128, 4, 1); + grid.x = cv::cuda::device::divUp(disp.cols, 128); + grid.y = cv::cuda::device::divUp(disp.rows, 4); + + check_reprojection_kernel<<<grid, threads, 0, stream>>>(disp, left, right); + + cudaSafeCall( cudaGetLastError() ); +} + + +// 
============================================================================= + + +__global__ void show_rpe_kernel(cv::cuda::PtrStepSz<short> disp, + cv::cuda::PtrStepSz<uchar4> left, + cv::cuda::PtrStepSz<uchar4> right, + float scale) +{ + for (STRIDE_Y(v,left.rows)) { + for (STRIDE_X(u,left.cols)) { + short d = disp(v,u) / 16; + + if (d > 0 && u-d >= 0) { + uchar4 l = left(v,u); + uchar4 r = right(v,u-d); + float d = max(abs(int(l.x)-int(r.x)),max(abs(int(l.y)-int(r.y)), abs(int(l.z)-int(r.z)))); + + left(v,u) = make_uchar4(0,0,min(255.0f, (d/scale) * 255.0f),255); + } + } + } +} + +void ftl::cuda::show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, const cv::cuda::GpuMat &right, + float scale, cudaStream_t stream) { + dim3 grid(1,1,1); + dim3 threads(128, 4, 1); + grid.x = cv::cuda::device::divUp(disp.cols, 128); + grid.y = cv::cuda::device::divUp(disp.rows, 4); + show_rpe_kernel<<<grid, threads, 0, stream>>>( + disp, left, right, scale); + cudaSafeCall( cudaGetLastError() ); +} + +// ============================================================================= + + +template <int MAX_DISP> +__global__ void show_disp_density_kernel(cv::cuda::PtrStepSz<short> disp, + cv::cuda::PtrStepSz<uchar4> left, + float scale) +{ + for (STRIDE_Y(v,disp.rows)) { + for (STRIDE_X(u,disp.cols)) { + short d = disp(v,u) / 16; + int count = 0; + + for (int i=1; i<MAX_DISP; ++i) { + if (u+i-d < disp.cols && u+i-d >= 0) { + short dd = disp(v,u+i-d) / 16; + if (d > 0 && dd == i) ++count; + } + } + + count = max(0,count-1); + left(v,u) = make_uchar4(0,0,min(255.0f, (float(count)/4.0f) * 255.0f),255); + } + } +} + +void ftl::cuda::show_disp_density(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, + float scale, cudaStream_t stream) { + dim3 grid(1,1,1); + dim3 threads(128, 4, 1); + grid.x = cv::cuda::device::divUp(disp.cols, 128); + grid.y = cv::cuda::device::divUp(disp.rows, 4); + show_disp_density_kernel<256><<<grid, threads, 0, stream>>>( + disp, left, scale); + cudaSafeCall( cudaGetLastError() ); +} diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp index d5bfe2d6a..b55a7af6d 100644 --- a/components/operators/src/disparity/fixstars_sgm.cpp +++ b/components/operators/src/disparity/fixstars_sgm.cpp @@ -1,6 +1,7 @@ #include <loguru.hpp> #include "ftl/operators/disparity.hpp" +#include <ftl/operators/cuda/disparity.hpp> #include <opencv2/cudaimgproc.hpp> #include <opencv2/cudaarithm.hpp> @@ -14,9 +15,22 @@ using ftl::rgbd::Frame; using ftl::rgbd::Source; using ftl::operators::FixstarsSGM; +void FixstarsSGM::computeP2(cudaStream_t &stream) { + const int P3 = config()->value("P3", P2_); + auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); + P2_map_.setTo(P3, cvstream); + + if (config()->value("use_P2_map", false)) { + edges_.create(size_, CV_8UC1); + auto ptr = canny_; + ptr->detect(lbw_, edges_, cvstream); + P2_map_.setTo(P2_, edges_, cvstream); + } +} + FixstarsSGM::FixstarsSGM(ftl::Configurable* cfg) : ftl::operators::Operator(cfg) { - + ssgm_ = nullptr; size_ = Size(0, 0); @@ -77,6 +91,16 @@ FixstarsSGM::FixstarsSGM(ftl::Configurable* cfg) : updateParameters(); } }); + + updateP2Parameters(); + + cfg->on("canny_low", [this, cfg](const ftl::config::Event&) { + updateP2Parameters(); + }); + + cfg->on("canny_high", [this, cfg](const ftl::config::Event&) { + updateP2Parameters(); + }); } FixstarsSGM::~FixstarsSGM() { @@ -100,6 +124,7 @@ bool FixstarsSGM::init() { sgm::StereoSGM::Parameters(P1_, P2_, uniqueness_, true) ); 
+ P2_map_.create(size_, CV_8UC1); return true; } @@ -109,13 +134,20 @@ bool FixstarsSGM::updateParameters() { sgm::StereoSGM::Parameters(P1_, P2_, uniqueness_, true)); } +bool FixstarsSGM::updateP2Parameters() { + canny_ = cv::cuda::createCannyEdgeDetector( + config()->value("canny_low", 30.0), + config()->value("canny_high", 120.0)); + return true; +} + bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { if (!in.hasChannel(Channel::Left) || !in.hasChannel(Channel::Right)) { LOG(ERROR) << "Fixstars is missing Left or Right channel"; return false; } - const auto &l = in.get<GpuMat>(Channel::Left); + auto &l = in.get<GpuMat>(Channel::Left); const auto &r = in.get<GpuMat>(Channel::Right); if (l.size() != size_) { @@ -129,14 +161,32 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream); cv::cuda::cvtColor(r, rbw_, cv::COLOR_BGRA2GRAY, 0, cvstream); - cvstream.waitForCompletion(); - ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data); + //cvstream.waitForCompletion(); + computeP2(stream); + //if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step; + ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, stream); // GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows)); // left_pixels.setTo(0); cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream); - + + if (config()->value("check_reprojection", false)) { + ftl::cuda::check_reprojection(disp, in.getTexture<uchar4>(Channel::Colour), + in.createTexture<uchar4>(Channel::Colour2, true), + stream); + } + + if (config()->value("show_P2_map", false)) { + cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA); + } + if (config()->value("show_rpe", false)) { + ftl::cuda::show_rpe(disp, l, r, 100.0f, stream); + } + if (config()->value("show_disp_density", false)) { + ftl::cuda::show_disp_density(disp, l, 100.0f, stream); + } + //disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream); return true; } diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h index 7e69ed8ae..ae6365675 100644 --- a/lib/libsgm/include/libsgm.h +++ b/lib/libsgm/include/libsgm.h @@ -28,6 +28,7 @@ limitations under the License. #include <stdint.h> #include "libsgm_config.h" +#include <cuda_runtime.h> #if defined(LIBSGM_SHARED) #if defined(WIN32) || defined(_WIN32) @@ -113,13 +114,13 @@ namespace sgm { * The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16. * Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled. */ - LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst); + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream); /** * Same as execute(left_pixels, right_pixels, dst) with image size parameters. * Dimensions must be smaller or equal to dimensions provided in constructor. */ - LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch); + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream); /** * Mask for invalid pixels. Must have same shape and pitch as src. 
Pixels which have non-zero values diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu index f0a58247b..fb7282250 100644 --- a/lib/libsgm/src/check_consistency.cu +++ b/lib/libsgm/src/check_consistency.cu @@ -45,29 +45,29 @@ namespace { namespace sgm { namespace details { - void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) { + void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) { const dim3 blocks(width / 16, height / 16); const dim3 threads(16, 16); if (depth_bits == 16) { - check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); } else if (depth_bits == 8) { - check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); } CudaKernelCheck(); } - void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) { + void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) { const dim3 blocks(width / 16, height / 16); const dim3 threads(16, 16); if (depth_bits == 16) { - check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); } else if (depth_bits == 8) { - check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); } CudaKernelCheck(); diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu index 2a9f1a87f..b3772b6f6 100644 --- a/lib/libsgm/src/horizontal_path_aggregation.cu +++ b/lib/libsgm/src/horizontal_path_aggregation.cu @@ -31,12 +31,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK; template <int DIRECTION, unsigned int MAX_DISPARITY> __global__ void aggregate_horizontal_path_kernel( uint8_t *dest, - const feature_type *left, - const feature_type *right, + const feature_type* __restrict__ left, + const feature_type* __restrict__ right, int width, int height, unsigned int p1, - unsigned int p2) + const uint8_t* __restrict__ p2, + int p2_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -69,6 +70,7 @@ __global__ void aggregate_horizontal_path_kernel( left += y0 * width; right += 
y0 * width; dest += y0 * MAX_DISPARITY * width; + p2 += y0 * p2_pitch; if(y0 >= height){ return; @@ -144,7 +146,7 @@ __global__ void aggregate_horizontal_path_kernel( for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){ local_costs[k] = __popc(left_value ^ right_buffer[j][k]); } - dp[j].update(local_costs, p1, p2, shfl_mask); + dp[j].update(local_costs, p1, p2[x], shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[j * dest_step + x * MAX_DISPARITY + dp_offset], dp[j].dp); @@ -163,7 +165,8 @@ void enqueue_aggregate_left2right_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -173,7 +176,7 @@ void enqueue_aggregate_left2right_path( const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } template <unsigned int MAX_DISPARITY> @@ -184,7 +187,8 @@ void enqueue_aggregate_right2left_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -194,7 +198,7 @@ void enqueue_aggregate_right2left_path( const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } @@ -205,7 +209,8 @@ template void enqueue_aggregate_left2right_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_left2right_path<128u>( @@ -215,7 +220,8 @@ template void enqueue_aggregate_left2right_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_left2right_path<256u>( @@ -225,7 +231,8 @@ template void enqueue_aggregate_left2right_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_right2left_path<64u>( @@ -235,7 +242,8 @@ template void enqueue_aggregate_right2left_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_right2left_path<128u>( @@ -245,7 +253,8 @@ template void enqueue_aggregate_right2left_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_right2left_path<256u>( @@ -255,7 +264,8 @@ template void enqueue_aggregate_right2left_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp index ce825e5fa..950c52bca 100644 --- a/lib/libsgm/src/horizontal_path_aggregation.hpp +++ b/lib/libsgm/src/horizontal_path_aggregation.hpp @@ -30,7 +30,8 @@ void enqueue_aggregate_left2right_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); 
template <unsigned int MAX_DISPARITY> @@ -41,7 +42,8 @@ void enqueue_aggregate_right2left_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h index 577101728..7d836f5d8 100644 --- a/lib/libsgm/src/internal.h +++ b/lib/libsgm/src/internal.h @@ -33,11 +33,11 @@ limitations under the License. namespace sgm { namespace details { - void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch); - void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch); + void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream); + void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream); - void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel); - void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel); + void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream); + void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream); void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements); void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements); diff --git a/lib/libsgm/src/median_filter.cu b/lib/libsgm/src/median_filter.cu index fffe24f04..7e9311bb6 100644 --- a/lib/libsgm/src/median_filter.cu +++ b/lib/libsgm/src/median_filter.cu @@ -256,33 +256,33 @@ namespace { namespace sgm { namespace details { - void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch) { + void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream) { if (pitch % 4 == 0) { const dim3 block(BLOCK_X, BLOCK_Y); const dim3 grid(divup(width / 4, block.x), divup(height, block.y)); - median_kernel_3x3_8u_v4<<<grid, block>>>(d_src, d_dst, width, height, pitch); + median_kernel_3x3_8u_v4<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); } else { const dim3 block(BLOCK_X, BLOCK_Y); const dim3 grid(divup(width, block.x), divup(height, block.y)); - median_kernel_3x3_8u<<<grid, block>>>(d_src, d_dst, width, height, pitch); + median_kernel_3x3_8u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); } CudaSafeCall(cudaGetLastError()); } - void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch) { + void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream) { if (pitch % 2 == 0) { const dim3 block(BLOCK_X, BLOCK_Y); const dim3 grid(divup(width / 2, block.x), divup(height, block.y)); - median_kernel_3x3_16u_v2<<<grid, block>>>(d_src, d_dst, width, height, pitch); + median_kernel_3x3_16u_v2<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); } else { const dim3 block(BLOCK_X, BLOCK_Y); const dim3 grid(divup(width, block.x), divup(height, 
block.y)); - median_kernel_3x3_16u<<<grid, block>>>(d_src, d_dst, width, height, pitch); + median_kernel_3x3_16u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); } CudaSafeCall(cudaGetLastError()); diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu index 21785c947..65934fb1b 100644 --- a/lib/libsgm/src/oblique_path_aggregation.cu +++ b/lib/libsgm/src/oblique_path_aggregation.cu @@ -27,12 +27,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u; template <int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY> __global__ void aggregate_oblique_path_kernel( uint8_t *dest, - const feature_type *left, - const feature_type *right, + const feature_type* __restrict__ left, + const feature_type* __restrict__ right, int width, int height, unsigned int p1, - unsigned int p2) + const uint8_t* __restrict__ p2, + int p2_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -104,7 +105,7 @@ __global__ void aggregate_oblique_path_kernel( for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ local_costs[j] = __popc(left_value ^ right_values[j]); } - dp.update(local_costs, p1, p2, shfl_mask); + dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], dp.dp); @@ -122,7 +123,8 @@ void enqueue_aggregate_upleft2downright_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -131,7 +133,7 @@ void enqueue_aggregate_upleft2downright_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } template <unsigned int MAX_DISPARITY> @@ -142,7 +144,8 @@ void enqueue_aggregate_upright2downleft_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -151,7 +154,7 @@ void enqueue_aggregate_upright2downleft_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } template <unsigned int MAX_DISPARITY> @@ -162,7 +165,8 @@ void enqueue_aggregate_downright2upleft_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -171,7 +175,7 @@ void enqueue_aggregate_downright2upleft_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } template <unsigned int MAX_DISPARITY> @@ -182,7 +186,8 @@ void enqueue_aggregate_downleft2upright_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, 
cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -191,7 +196,7 @@ void enqueue_aggregate_downleft2upright_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } @@ -202,7 +207,8 @@ template void enqueue_aggregate_upleft2downright_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_upleft2downright_path<128u>( @@ -212,7 +218,8 @@ template void enqueue_aggregate_upleft2downright_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_upleft2downright_path<256u>( @@ -222,7 +229,8 @@ template void enqueue_aggregate_upleft2downright_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_upright2downleft_path<64u>( @@ -232,7 +240,8 @@ template void enqueue_aggregate_upright2downleft_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_upright2downleft_path<128u>( @@ -242,7 +251,8 @@ template void enqueue_aggregate_upright2downleft_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_upright2downleft_path<256u>( @@ -252,7 +262,8 @@ template void enqueue_aggregate_upright2downleft_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downright2upleft_path<64u>( @@ -262,7 +273,8 @@ template void enqueue_aggregate_downright2upleft_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downright2upleft_path<128u>( @@ -272,7 +284,8 @@ template void enqueue_aggregate_downright2upleft_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downright2upleft_path<256u>( @@ -282,7 +295,8 @@ template void enqueue_aggregate_downright2upleft_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downleft2upright_path<64u>( @@ -292,7 +306,8 @@ template void enqueue_aggregate_downleft2upright_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downleft2upright_path<128u>( @@ -302,7 +317,8 @@ template void enqueue_aggregate_downleft2upright_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_downleft2upright_path<256u>( @@ -312,7 +328,8 @@ template void enqueue_aggregate_downleft2upright_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } diff --git 
a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp index a4f949b1e..36d61513c 100644 --- a/lib/libsgm/src/oblique_path_aggregation.hpp +++ b/lib/libsgm/src/oblique_path_aggregation.hpp @@ -30,7 +30,8 @@ void enqueue_aggregate_upleft2downright_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -41,7 +42,8 @@ void enqueue_aggregate_upright2downleft_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -52,7 +54,8 @@ void enqueue_aggregate_downright2upleft_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -63,7 +66,8 @@ void enqueue_aggregate_downleft2upright_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu index 117cac507..d0eac331b 100644 --- a/lib/libsgm/src/path_aggregation.cu +++ b/lib/libsgm/src/path_aggregation.cu @@ -47,7 +47,8 @@ void PathAggregation<MAX_DISPARITY>::enqueue( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS; @@ -58,28 +59,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue( cudaStreamSynchronize(stream); path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>( m_cost_buffer.data() + 0 * buffer_step, - left, right, width, height, p1, p2, m_streams[0]); + left, right, width, height, p1, p2, p2_pitch, m_streams[0]); path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>( m_cost_buffer.data() + 1 * buffer_step, - left, right, width, height, p1, p2, m_streams[1]); + left, right, width, height, p1, p2, p2_pitch, m_streams[1]); path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>( m_cost_buffer.data() + 2 * buffer_step, - left, right, width, height, p1, p2, m_streams[2]); + left, right, width, height, p1, p2, p2_pitch, m_streams[2]); path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>( m_cost_buffer.data() + 3 * buffer_step, - left, right, width, height, p1, p2, m_streams[3]); + left, right, width, height, p1, p2, p2_pitch, m_streams[3]); path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>( m_cost_buffer.data() + 4 * buffer_step, - left, right, width, height, p1, p2, m_streams[4]); + left, right, width, height, p1, p2, p2_pitch, m_streams[4]); path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>( m_cost_buffer.data() + 5 * buffer_step, - left, right, width, height, p1, p2, m_streams[5]); + left, right, width, height, p1, p2, p2_pitch, m_streams[5]); path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>( m_cost_buffer.data() + 6 * buffer_step, - left, right, width, height, p1, p2, m_streams[6]); + left, right, width, height, p1, p2, p2_pitch, m_streams[6]); path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>( m_cost_buffer.data() + 7 * buffer_step, - left, right, width, height, p1, p2, m_streams[7]); + left, right, width, height, p1, p2, p2_pitch, m_streams[7]); for(unsigned int i = 0; i < NUM_PATHS; ++i){ cudaEventRecord(m_events[i], m_streams[i]); cudaStreamWaitEvent(stream, 
m_events[i], 0); diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp index 3d2b35d8e..221411d26 100644 --- a/lib/libsgm/src/path_aggregation.hpp +++ b/lib/libsgm/src/path_aggregation.hpp @@ -46,7 +46,8 @@ public: int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); }; diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu index d276c3fa9..fc62a0d47 100644 --- a/lib/libsgm/src/sgm.cu +++ b/lib/libsgm/src/sgm.cu @@ -52,7 +52,7 @@ public: int src_pitch, int dst_pitch, unsigned int penalty1, - unsigned int penalty2, + const uint8_t *penalty2, float uniqueness, bool subpixel, cudaStream_t stream) @@ -66,6 +66,7 @@ public: m_census_right.get_output(), width, height, penalty1, penalty2, + src_pitch, stream); m_winner_takes_all.enqueue( dest_left, dest_right, @@ -97,9 +98,10 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute( int src_pitch, int dst_pitch, unsigned int penalty1, - unsigned int penalty2, + const uint8_t *penalty2, float uniqueness, - bool subpixel) + bool subpixel, + cudaStream_t stream) { m_impl->enqueue( dest_left, dest_right, @@ -108,8 +110,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute( src_pitch, dst_pitch, penalty1, penalty2, uniqueness, subpixel, - 0); - cudaStreamSynchronize(0); + stream); + //cudaStreamSynchronize(0); } template <typename T, size_t MAX_DISPARITY> @@ -123,7 +125,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue( int src_pitch, int dst_pitch, unsigned int penalty1, - unsigned int penalty2, + const uint8_t *penalty2, float uniqueness, bool subpixel, cudaStream_t stream) @@ -144,5 +146,6 @@ template class SemiGlobalMatching<uint8_t, 128>; template class SemiGlobalMatching<uint8_t, 256>; template class SemiGlobalMatching<uint16_t, 64>; template class SemiGlobalMatching<uint16_t, 128>; +template class SemiGlobalMatching<uint16_t, 256>; } diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp index afb308a25..effbe2f9e 100644 --- a/lib/libsgm/src/sgm.hpp +++ b/lib/libsgm/src/sgm.hpp @@ -48,9 +48,10 @@ public: int src_pitch, int dst_pitch, unsigned int penalty1, - unsigned int penalty2, + const uint8_t *penalty2, float uniqueness, - bool subpixel); + bool subpixel, + cudaStream_t stream); void enqueue( output_type *dest_left, @@ -62,7 +63,7 @@ public: int src_pitch, int dst_pitch, unsigned int penalty1, - unsigned int penalty2, + const uint8_t *penalty2, float uniqueness, bool subpixel, cudaStream_t stream); diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp index b0219091f..252f16d9e 100644 --- a/lib/libsgm/src/stereo_sgm.cpp +++ b/lib/libsgm/src/stereo_sgm.cpp @@ -29,7 +29,7 @@ namespace sgm { public: using output_type = sgm::output_type; virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, - int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) = 0; + int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) = 0; virtual ~SemiGlobalMatchingBase() {} }; @@ -38,9 +38,9 @@ namespace sgm { class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase { public: void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, - int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) override + int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t 
stream) override { - sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel); + sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel, stream); } private: SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_; @@ -149,7 +149,7 @@ namespace sgm { if (cu_res_) { delete cu_res_; } } - void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch) { + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream) { const void *d_input_left, *d_input_right; @@ -173,11 +173,11 @@ namespace sgm { d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp, - d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, param_.P2, param_.uniqueness, param_.subpixel); + d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, param_.uniqueness, param_.subpixel, stream); - sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch); - sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch); - sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel); + sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream); + sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream); + sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, stream); if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) { sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height); @@ -197,8 +197,8 @@ namespace sgm { } } - void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) { - execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_); + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream) { + execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, stream); } bool StereoSGM::updateParameters(const Parameters ¶ms) { diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu index d318f46b4..54ebafc9c 100644 --- a/lib/libsgm/src/vertical_path_aggregation.cu +++ b/lib/libsgm/src/vertical_path_aggregation.cu @@ -27,12 +27,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u; template <int DIRECTION, unsigned int MAX_DISPARITY> __global__ void aggregate_vertical_path_kernel( uint8_t *dest, - const feature_type *left, - const feature_type *right, + const feature_type* __restrict__ left, + const feature_type* __restrict__ right, int width, int height, unsigned int p1, - unsigned int p2) + const uint8_t* __restrict__ p2, + int p2_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / 
DP_BLOCK_SIZE; static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -102,7 +103,7 @@ __global__ void aggregate_vertical_path_kernel( for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ local_costs[j] = __popc(left_value ^ right_values[j]); } - dp.update(local_costs, p1, p2, shfl_mask); + dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], dp.dp); @@ -119,7 +120,8 @@ void enqueue_aggregate_up2down_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -128,7 +130,7 @@ void enqueue_aggregate_up2down_path( const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } template <unsigned int MAX_DISPARITY> @@ -139,7 +141,8 @@ void enqueue_aggregate_down2up_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -148,7 +151,7 @@ void enqueue_aggregate_down2up_path( const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2); + dest, left, right, width, height, p1, p2, p2_pitch); } @@ -159,7 +162,8 @@ template void enqueue_aggregate_up2down_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_up2down_path<128u>( @@ -169,7 +173,8 @@ template void enqueue_aggregate_up2down_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_up2down_path<256u>( @@ -179,7 +184,8 @@ template void enqueue_aggregate_up2down_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_down2up_path<64u>( @@ -189,7 +195,8 @@ template void enqueue_aggregate_down2up_path<64u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_down2up_path<128u>( @@ -199,7 +206,8 @@ template void enqueue_aggregate_down2up_path<128u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template void enqueue_aggregate_down2up_path<256u>( @@ -209,7 +217,8 @@ template void enqueue_aggregate_down2up_path<256u>( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp index 0ae04016a..fb7334adc 100644 --- a/lib/libsgm/src/vertical_path_aggregation.hpp +++ b/lib/libsgm/src/vertical_path_aggregation.hpp @@ -30,7 +30,8 @@ void enqueue_aggregate_up2down_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -41,7 +42,8 @@ void 
enqueue_aggregate_down2up_path( int width, int height, unsigned int p1, - unsigned int p2, + const uint8_t *p2, + int p2_pitch, cudaStream_t stream); } -- GitLab
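
Usage note (not part of the patch above): the sketch below shows one way the new adaptive-penalty API could be driven end to end, combining the Canny-based P2 map built in FixstarsSGM::computeP2() with the extended StereoSGM::execute() signature introduced here. The function name, the P1/P2/P3 values and the 0.95 uniqueness ratio are illustrative assumptions; the 30/120 thresholds mirror the canny_low/canny_high defaults added in fixstars_sgm.cpp. It also assumes the GpuMats carry no row padding (step == width), which the short execute() overload and the raw P2 buffer handling implicitly rely on; padded buffers would need the longer overload with explicit pitches.

#include <cuda_runtime.h>
#include <libsgm.h>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

// Sketch only: build a per-pixel P2 map from Canny edges (as FixstarsSGM::computeP2
// does) and hand it to the patched execute() overload. left_gray / right_gray must
// be CV_8UC1 GpuMats of equal size, allocated without row padding.
void run_adaptive_sgm(const cv::cuda::GpuMat &left_gray,
                      const cv::cuda::GpuMat &right_gray,
                      cv::cuda::GpuMat &disp,
                      cudaStream_t stream)
{
	const int width    = left_gray.cols;
	const int height   = left_gray.rows;
	const int max_disp = 256;              // 256-disparity uint16_t path is instantiated by this patch
	const int P1 = 10, P2 = 120, P3 = 30;  // illustrative penalty values only

	// 8-bit input, 16-bit output, device-to-device buffers.
	sgm::StereoSGM ssgm(width, height, max_disp, 8, 16,
	                    sgm::EXECUTE_INOUT_CUDA2CUDA,
	                    sgm::StereoSGM::Parameters(P1, P2, 0.95f, true));

	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);

	// Per-pixel penalty map: fill with P3 everywhere, then write P2 on Canny edge
	// pixels, mirroring computeP2(). The map is indexed by the aggregation kernels
	// as p2[x + y * p2_pitch], so its pitch must match the input pitch.
	cv::cuda::GpuMat edges(height, width, CV_8UC1);
	cv::cuda::GpuMat P2_map(height, width, CV_8UC1);
	auto canny = cv::cuda::createCannyEdgeDetector(30.0, 120.0);  // canny_low / canny_high defaults
	P2_map.setTo(P3, cvstream);
	canny->detect(left_gray, edges, cvstream);
	P2_map.setTo(P2, edges, cvstream);

	disp.create(height, width, CV_16SC1);

	// New signature from this patch: per-pixel P2 buffer plus an explicit stream.
	ssgm.execute(left_gray.data, right_gray.data, disp.data, P2_map.data, stream);
}

The underlying design change is that P2 stops being a single scalar and becomes a per-pixel uint8_t buffer, threaded through every path-aggregation kernel together with the new p2_pitch argument, so each dynamic-programming update reads its penalty at the current (x, y) instead of a global constant; that is what allows the smoothness penalty to differ between edge and non-edge pixels.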