diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp index 17cdced1d424566164dbc85eb22314bf3e24ad41..62b1f00857a1d8971f61248cda857814d49fc24e 100644 --- a/components/operators/include/ftl/operators/disparity.hpp +++ b/components/operators/include/ftl/operators/disparity.hpp @@ -65,6 +65,8 @@ class FixstarsSGM : public ftl::operators::Operator { cv::cuda::GpuMat disp_int_; cv::cuda::GpuMat P2_map_; + cv::cuda::GpuMat weights_; + cv::cuda::GpuMat weightsF_; cv::cuda::GpuMat edges_; cv::Ptr<cv::cuda::CannyEdgeDetector> canny_; diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp index 0a8bdf0dd016c2137286b06bff78dd6e72a19560..8ad3a41fb3e7bf034146336ebf8d5464e0d3f9e7 100644 --- a/components/operators/src/disparity/fixstars_sgm.cpp +++ b/components/operators/src/disparity/fixstars_sgm.cpp @@ -5,6 +5,7 @@ #include <opencv2/cudaimgproc.hpp> #include <opencv2/cudaarithm.hpp> +#include <opencv2/cudafilters.hpp> using cv::Size; using cv::cuda::GpuMat; @@ -15,6 +16,33 @@ using ftl::rgbd::Frame; using ftl::rgbd::Source; using ftl::operators::FixstarsSGM; + +static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) { + if (in.isGpuMat() && out.isGpuMat()) { + cv::cuda::GpuMat im; + cv::cuda::GpuMat im2; + cv::cuda::GpuMat mean; + cv::cuda::GpuMat mean2; + + mean.create(in.size(), CV_32FC1); + mean2.create(in.size(), CV_32FC1); + im2.create(in.size(), CV_32FC1); + in.getGpuMat().convertTo(im, CV_32FC1, cvstream); + + cv::cuda::multiply(im, im, im2, 1.0, CV_32FC1, cvstream); + auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize)); + filter->apply(im, mean, cvstream); // E[X] + filter->apply(im2, mean2, cvstream); // E[X^2] + cv::cuda::multiply(mean, mean, mean, 1.0, -1, cvstream); // (E[X])^2 + + // NOTE: floating point accuracy in subtraction + // (cv::cuda::createBoxFilter only supports float and 8 bit integer types) + cv::cuda::subtract(mean2, mean, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2 + } + else { throw std::exception(); /* todo CPU version */ } +} + + void FixstarsSGM::computeP2(cudaStream_t &stream) { const int P3 = config()->value("P3", P2_); auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream); @@ -115,6 +143,8 @@ bool FixstarsSGM::init() { lbw_.create(size_, CV_8UC1); rbw_.create(size_, CV_8UC1); disp_int_.create(size_, CV_16SC1); + weights_.create(size_, CV_32FC1); + weights_.setTo(1.0); LOG(INFO) << "INIT FIXSTARS"; @@ -164,8 +194,19 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { //cvstream.waitForCompletion(); computeP2(stream); - //if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step; - ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, stream); + + bool use_variance = config()->value("use_variance", true); + if (use_variance) { + variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream); + float minweight = std::min(1.0f, std::max(0.0f, config()->value("var_minweight", 0.5f))); + cv::cuda::normalize(weightsF_, weightsF_, minweight, 1.0, cv::NORM_MINMAX, -1, cv::noArray(), cvstream); + weightsF_.convertTo(weights_, CV_8UC1, 255.0f); + + //if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step; + ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, 
(uint8_t*) weights_.data, weights_.step1(), stream); + } else { + ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream); + } // GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows)); // left_pixels.setTo(0); diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h index ae63656752f730f28d41e18141be125ecd86863b..279b3e5af2557fc2cbb93e660c96edfca6c1a006 100644 --- a/lib/libsgm/include/libsgm.h +++ b/lib/libsgm/include/libsgm.h @@ -84,7 +84,7 @@ namespace sgm { * @attention * output_depth_bits must be set to 16 when subpixel is enabled. */ - LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, + LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, EXECUTE_INOUT inout_type, const Parameters& param = Parameters()); /** @@ -114,13 +114,13 @@ namespace sgm { * The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16. * Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled. */ - LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream); + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream); /** * Same as execute(left_pixels, right_pixels, dst) with image size parameters. * Dimensions must be smaller or equal to dimensions provided in constructor. */ - LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream); + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream); /** * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values @@ -129,7 +129,7 @@ namespace sgm { LIBSGM_API void setMask(uint8_t* mask, int pitch); /** - * Update parameters. Returns true if successful. + * Update parameters. Returns true if successful. */ LIBSGM_API bool updateParameters(const Parameters ¶ms); diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu index 860bb3d69a77924bf6953b42a7be8f297a6e71a9..5eba5372c3d67e9396aaf9569c8e7240c0c98759 100644 --- a/lib/libsgm/src/horizontal_path_aggregation.cu +++ b/lib/libsgm/src/horizontal_path_aggregation.cu @@ -37,7 +37,9 @@ __global__ void aggregate_horizontal_path_kernel( int height, unsigned int p1, const uint8_t* __restrict__ p2, - int p2_pitch) + int p2_pitch, + const uint8_t* __restrict__ w, + int w_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -146,7 +148,7 @@ __global__ void aggregate_horizontal_path_kernel( for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){ local_costs[k] = __popc(left_value ^ right_buffer[j][k]); } - dp[j].update(local_costs, p1, p2[x], shfl_mask); + dp[j].update(local_costs, p1, p2[x], w ? 
float(w[x])/255.0f : 1.0f, shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[j * dest_step + x * MAX_DISPARITY + dp_offset], dp[j].dp); @@ -167,6 +169,8 @@ void enqueue_aggregate_left2right_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -176,7 +180,7 @@ void enqueue_aggregate_left2right_path( const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } template <unsigned int MAX_DISPARITY> @@ -189,6 +193,8 @@ void enqueue_aggregate_right2left_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -198,7 +204,7 @@ void enqueue_aggregate_right2left_path( const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } @@ -211,6 +217,8 @@ template void enqueue_aggregate_left2right_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_left2right_path<128u>( @@ -222,8 +230,10 @@ template void enqueue_aggregate_left2right_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_left2right_path<256u>( cost_type *dest, const feature_type *left, @@ -233,6 +243,8 @@ template void enqueue_aggregate_left2right_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_right2left_path<64u>( @@ -244,6 +256,8 @@ template void enqueue_aggregate_right2left_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_right2left_path<128u>( @@ -255,8 +269,10 @@ template void enqueue_aggregate_right2left_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_right2left_path<256u>( cost_type *dest, const feature_type *left, @@ -266,6 +282,8 @@ template void enqueue_aggregate_right2left_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp index 950c52bca117252a02a18c78e40918b8eb5b8265..1b7a7e088f5d4dac793694ba7f4a0a876de2538b 100644 --- a/lib/libsgm/src/horizontal_path_aggregation.hpp +++ b/lib/libsgm/src/horizontal_path_aggregation.hpp @@ -32,6 +32,8 @@ void enqueue_aggregate_left2right_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -44,6 +46,8 @@ void enqueue_aggregate_right2left_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); } diff --git 
a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu index 65934fb1bf53ba00e1f35fa577b83a028203ec07..97d9b1493c32e54a7bb1195f9a9261243b801444 100644 --- a/lib/libsgm/src/oblique_path_aggregation.cu +++ b/lib/libsgm/src/oblique_path_aggregation.cu @@ -33,7 +33,9 @@ __global__ void aggregate_oblique_path_kernel( int height, unsigned int p1, const uint8_t* __restrict__ p2, - int p2_pitch) + int p2_pitch, + const uint8_t* __restrict__ w, + int w_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -105,7 +107,7 @@ __global__ void aggregate_oblique_path_kernel( for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ local_costs[j] = __popc(left_value ^ right_values[j]); } - dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask); + dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], dp.dp); @@ -125,6 +127,8 @@ void enqueue_aggregate_upleft2downright_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -133,7 +137,7 @@ void enqueue_aggregate_upleft2downright_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } template <unsigned int MAX_DISPARITY> @@ -146,6 +150,8 @@ void enqueue_aggregate_upright2downleft_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -154,7 +160,7 @@ void enqueue_aggregate_upright2downleft_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } template <unsigned int MAX_DISPARITY> @@ -167,6 +173,8 @@ void enqueue_aggregate_downright2upleft_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -175,7 +183,7 @@ void enqueue_aggregate_downright2upleft_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } template <unsigned int MAX_DISPARITY> @@ -188,6 +196,8 @@ void enqueue_aggregate_downleft2upright_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -196,7 +206,7 @@ void enqueue_aggregate_downleft2upright_path( const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - 
dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } @@ -209,6 +219,8 @@ template void enqueue_aggregate_upleft2downright_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_upleft2downright_path<128u>( @@ -220,8 +232,10 @@ template void enqueue_aggregate_upleft2downright_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_upleft2downright_path<256u>( cost_type *dest, const feature_type *left, @@ -231,6 +245,8 @@ template void enqueue_aggregate_upleft2downright_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_upright2downleft_path<64u>( @@ -242,6 +258,8 @@ template void enqueue_aggregate_upright2downleft_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_upright2downleft_path<128u>( @@ -253,8 +271,10 @@ template void enqueue_aggregate_upright2downleft_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_upright2downleft_path<256u>( cost_type *dest, const feature_type *left, @@ -264,6 +284,8 @@ template void enqueue_aggregate_upright2downleft_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_downright2upleft_path<64u>( @@ -275,6 +297,8 @@ template void enqueue_aggregate_downright2upleft_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_downright2upleft_path<128u>( @@ -286,8 +310,10 @@ template void enqueue_aggregate_downright2upleft_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_downright2upleft_path<256u>( cost_type *dest, const feature_type *left, @@ -297,6 +323,8 @@ template void enqueue_aggregate_downright2upleft_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_downleft2upright_path<64u>( @@ -308,6 +336,8 @@ template void enqueue_aggregate_downleft2upright_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_downleft2upright_path<128u>( @@ -319,8 +349,10 @@ template void enqueue_aggregate_downleft2upright_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_downleft2upright_path<256u>( cost_type *dest, const feature_type *left, @@ -330,6 +362,8 @@ template void enqueue_aggregate_downleft2upright_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp index 36d61513c7693594fb65a4f38679083714694788..6504f37a2715ef64d7698b00afb5f6b0614b26af 100644 --- a/lib/libsgm/src/oblique_path_aggregation.hpp +++ b/lib/libsgm/src/oblique_path_aggregation.hpp @@ -32,6 +32,8 @@ 
void enqueue_aggregate_upleft2downright_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -44,6 +46,8 @@ void enqueue_aggregate_upright2downleft_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -56,6 +60,8 @@ void enqueue_aggregate_downright2upleft_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -68,6 +74,8 @@ void enqueue_aggregate_downleft2upright_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu index d0eac331b580b1df493f41a7a280779fe5611908..16567de556520b10a8fcbb3560bcd667ab1fb7b6 100644 --- a/lib/libsgm/src/path_aggregation.cu +++ b/lib/libsgm/src/path_aggregation.cu @@ -49,6 +49,8 @@ void PathAggregation<MAX_DISPARITY>::enqueue( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS; @@ -59,28 +61,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue( cudaStreamSynchronize(stream); path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>( m_cost_buffer.data() + 0 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[0]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]); path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>( m_cost_buffer.data() + 1 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[1]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[1]); path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>( m_cost_buffer.data() + 2 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[2]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[2]); path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>( m_cost_buffer.data() + 3 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[3]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[3]); path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>( m_cost_buffer.data() + 4 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[4]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[4]); path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>( m_cost_buffer.data() + 5 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[5]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[5]); path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>( m_cost_buffer.data() + 6 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[6]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[6]); path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>( m_cost_buffer.data() + 7 * buffer_step, - left, right, width, height, p1, p2, p2_pitch, m_streams[7]); + left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[7]); for(unsigned int i = 0; i < NUM_PATHS; ++i){ cudaEventRecord(m_events[i], m_streams[i]); cudaStreamWaitEvent(stream, m_events[i], 0); diff --git a/lib/libsgm/src/path_aggregation.hpp 
b/lib/libsgm/src/path_aggregation.hpp index 221411d2687c084c824d47f2eaba40fee23306b2..0b019a3b556fb92969acff843ad7431bc3d57b0e 100644 --- a/lib/libsgm/src/path_aggregation.hpp +++ b/lib/libsgm/src/path_aggregation.hpp @@ -31,7 +31,7 @@ private: DeviceBuffer<cost_type> m_cost_buffer; cudaStream_t m_streams[NUM_PATHS]; cudaEvent_t m_events[NUM_PATHS]; - + public: PathAggregation(); ~PathAggregation(); @@ -48,6 +48,8 @@ public: unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); }; diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp index b560e732ff20e5983186e9137b70ab91e76a5f9f..c897684e7003512ccd0c9e17d7a084a39319766b 100644 --- a/lib/libsgm/src/path_aggregation_common.hpp +++ b/lib/libsgm/src/path_aggregation_common.hpp @@ -32,7 +32,7 @@ struct DynamicProgramming { DP_BLOCK_SIZE >= 2, "DP_BLOCK_SIZE must be greater than or equal to 2"); static_assert( - (SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0, + (SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0, "SUBGROUP_SIZE must be a power of 2"); uint32_t last_min; @@ -45,7 +45,7 @@ struct DynamicProgramming { } __device__ void update( - uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask) + uint32_t *local_costs, uint32_t p1, uint32_t p2, float w, uint32_t mask) { const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE; @@ -62,14 +62,14 @@ struct DynamicProgramming { uint32_t out = min(dp[k] - last_min, p2); if(lane_id != 0){ out = min(out, prev - last_min + p1); } out = min(out, dp[k + 1] - last_min + p1); - lazy_out = local_min = out + local_costs[k]; + lazy_out = local_min = out + round(local_costs[k]*w); } for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){ uint32_t out = min(dp[k] - last_min, p2); out = min(out, dp[k - 1] - last_min + p1); out = min(out, dp[k + 1] - last_min + p1); dp[k - 1] = lazy_out; - lazy_out = out + local_costs[k]; + lazy_out = out + round(local_costs[k]*w); local_min = min(local_min, lazy_out); } { @@ -85,7 +85,7 @@ struct DynamicProgramming { out = min(out, next - last_min + p1); } dp[k - 1] = lazy_out; - dp[k] = out + local_costs[k]; + dp[k] = out + round(local_costs[k]*w); local_min = min(local_min, dp[k]); } last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask); diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu index fc62a0d47ba30a9af24e58eb0a513ab2eab64501..2b350da9562962aa9e10b4554bb6b6c4517e1023 100644 --- a/lib/libsgm/src/sgm.cu +++ b/lib/libsgm/src/sgm.cu @@ -53,6 +53,8 @@ public: int dst_pitch, unsigned int penalty1, const uint8_t *penalty2, + const uint8_t *weights, + int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) @@ -66,7 +68,8 @@ public: m_census_right.get_output(), width, height, penalty1, penalty2, - src_pitch, + src_pitch, // bug? 
+ weights, weights_pitch, stream); m_winner_takes_all.enqueue( dest_left, dest_right, @@ -99,6 +102,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute( int dst_pitch, unsigned int penalty1, const uint8_t *penalty2, + const uint8_t *weights, + int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) @@ -109,6 +114,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute( width, height, src_pitch, dst_pitch, penalty1, penalty2, + weights, weights_pitch, uniqueness, subpixel, stream); //cudaStreamSynchronize(0); @@ -126,6 +132,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue( int dst_pitch, unsigned int penalty1, const uint8_t *penalty2, + const uint8_t *weights, + int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) @@ -136,6 +144,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue( width, height, src_pitch, dst_pitch, penalty1, penalty2, + weights, weights_pitch, uniqueness, subpixel, stream); } diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp index effbe2f9e1a5a4ad181115547f3f0fc5242e88f8..f0c3c80ee107b95890d6dc8578c72768a70abdd4 100644 --- a/lib/libsgm/src/sgm.hpp +++ b/lib/libsgm/src/sgm.hpp @@ -49,6 +49,8 @@ public: int dst_pitch, unsigned int penalty1, const uint8_t *penalty2, + const uint8_t *weights, + int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream); @@ -64,6 +66,8 @@ public: int dst_pitch, unsigned int penalty1, const uint8_t *penalty2, + const uint8_t *weights, + int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream); diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp index 252f16d9ed7373d3e1a4c550a502d1ab57c199bc..a07f2daea087e38f4102c0cb92bf7d1664655c3a 100644 --- a/lib/libsgm/src/stereo_sgm.cpp +++ b/lib/libsgm/src/stereo_sgm.cpp @@ -28,8 +28,8 @@ namespace sgm { class SemiGlobalMatchingBase { public: using output_type = sgm::output_type; - virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, - int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) = 0; + virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, + int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) = 0; virtual ~SemiGlobalMatchingBase() {} }; @@ -38,9 +38,9 @@ namespace sgm { class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase { public: void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, - int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) override + int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) override { - sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel, stream); + sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, stream); } private: SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_; @@ -54,7 +54,7 @@ namespace sgm { void* d_tmp_left_disp; void* d_tmp_right_disp; uint8_t* d_mask; - + SemiGlobalMatchingBase* sgm_engine; CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, 
int output_depth_bits_, int src_pitch_, int dst_pitch_, EXECUTE_INOUT inout_type_) { @@ -80,7 +80,7 @@ namespace sgm { CudaSafeCall(cudaMalloc(&this->d_src_left, input_depth_bits_ / 8 * src_pitch_ * height_)); CudaSafeCall(cudaMalloc(&this->d_src_right, input_depth_bits_ / 8 * src_pitch_ * height_)); } - + CudaSafeCall(cudaMalloc(&this->d_left_disp, sizeof(uint16_t) * dst_pitch_ * height_)); CudaSafeCall(cudaMalloc(&this->d_right_disp, sizeof(uint16_t) * dst_pitch_ * height_)); @@ -149,7 +149,8 @@ namespace sgm { if (cu_res_) { delete cu_res_; } } - void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream) { + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, + const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) { const void *d_input_left, *d_input_right; @@ -171,9 +172,9 @@ namespace sgm { if (is_cuda_output(inout_type_) && output_depth_bits_ == 16) d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer - + cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp, - d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, param_.uniqueness, param_.subpixel, stream); + d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, stream); sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream); sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream); @@ -197,8 +198,8 @@ namespace sgm { } } - void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream) { - execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, stream); + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) { + execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, stream); } bool StereoSGM::updateParameters(const Parameters ¶ms) { @@ -208,7 +209,7 @@ namespace sgm { if ((params.uniqueness < 0.0) || (params.uniqueness > 1.0)) { return false; } - + Parameters params_ = params; std::swap(params_, this->param_); return true; diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu index 54ebafc9c6c19af41db4052ec5af8edd04b4f570..6fee96892cf038a0583c576688e8456e0c3ec7d7 100644 --- a/lib/libsgm/src/vertical_path_aggregation.cu +++ b/lib/libsgm/src/vertical_path_aggregation.cu @@ -33,7 +33,9 @@ __global__ void aggregate_vertical_path_kernel( int height, unsigned int p1, const uint8_t* __restrict__ p2, - int p2_pitch) + int p2_pitch, + const uint8_t* __restrict__ w, + int w_pitch) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; @@ -103,7 +105,7 @@ __global__ void aggregate_vertical_path_kernel( for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ local_costs[j] = __popc(left_value ^ right_values[j]); } - dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask); + 
dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask); store_uint8_vector<DP_BLOCK_SIZE>( &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], dp.dp); @@ -122,6 +124,8 @@ void enqueue_aggregate_up2down_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -130,7 +134,7 @@ void enqueue_aggregate_up2down_path( const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } template <unsigned int MAX_DISPARITY> @@ -143,6 +147,8 @@ void enqueue_aggregate_down2up_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream) { static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; @@ -151,7 +157,7 @@ void enqueue_aggregate_down2up_path( const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; const int bdim = BLOCK_SIZE; aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( - dest, left, right, width, height, p1, p2, p2_pitch); + dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch); } @@ -164,6 +170,8 @@ template void enqueue_aggregate_up2down_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_up2down_path<128u>( @@ -175,8 +183,10 @@ template void enqueue_aggregate_up2down_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_up2down_path<256u>( cost_type *dest, const feature_type *left, @@ -186,6 +196,8 @@ template void enqueue_aggregate_up2down_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_down2up_path<64u>( @@ -197,6 +209,8 @@ template void enqueue_aggregate_down2up_path<64u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template void enqueue_aggregate_down2up_path<128u>( @@ -208,8 +222,10 @@ template void enqueue_aggregate_down2up_path<128u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); - + template void enqueue_aggregate_down2up_path<256u>( cost_type *dest, const feature_type *left, @@ -219,6 +235,8 @@ template void enqueue_aggregate_down2up_path<256u>( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); } diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp index fb7334adc8f9f08439747df623a953f67cac57f9..434dd9231614b821ae1dfcbb439e648a9dd10b9a 100644 --- a/lib/libsgm/src/vertical_path_aggregation.hpp +++ b/lib/libsgm/src/vertical_path_aggregation.hpp @@ -32,6 +32,8 @@ void enqueue_aggregate_up2down_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); template <unsigned int MAX_DISPARITY> @@ -44,6 +46,8 @@ void enqueue_aggregate_down2up_path( unsigned int p1, const uint8_t *p2, int p2_pitch, + const uint8_t* w, + int w_pitch, cudaStream_t stream); }
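This patch threads an optional per-pixel weight map from the FixstarsSGM operator down through every libsgm path-aggregation kernel. The weights are derived from the local intensity variance of the left grey image (variance_mask() in fixstars_sgm.cpp): the variance is min-max normalised into [var_minweight, 1.0] (defaults: var_minweight 0.5, window var_wsize 11), scaled to 8 bit with convertTo(..., CV_8UC1, 255.0f), and handed to StereoSGM::execute() together with its row pitch (weights_.step1(), which equals the step in bytes for CV_8UC1). Inside the kernels the value is read back as float(w[x]) / 255.0f, so low-texture regions end up with weights near var_minweight and their census matching costs count for less relative to the smoothness penalties P1/P2. Only the GPU path is implemented; the CPU branch of variance_mask() is left as a TODO, so the following is merely a hedged CPU sketch of the same E[X^2] - (E[X])^2 box-filter computation, with an illustrative function name that is not part of the patch:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

// CPU sketch of the variance mask used to build the SGM cost weights.
// Mirrors the GPU variance_mask(): out = E[X^2] - (E[X])^2 over a wsize x wsize box.
static void variance_mask_cpu(const cv::Mat &in, cv::Mat &out, int wsize) {
	cv::Mat im, im2, mean, mean2;
	in.convertTo(im, CV_32FC1);
	cv::multiply(im, im, im2);                                   // X^2
	cv::boxFilter(im, mean, CV_32F, cv::Size(wsize, wsize));     // E[X]
	cv::boxFilter(im2, mean2, CV_32F, cv::Size(wsize, wsize));   // E[X^2]
	cv::multiply(mean, mean, mean);                              // (E[X])^2
	cv::subtract(mean2, mean, out);                              // variance (up to float accuracy)
}
```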
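Inside libsgm the weight enters in exactly one place: DynamicProgramming::update() (path_aggregation_common.hpp) now takes a float w and scales only the local census (Hamming) cost, dp[k] = out + round(local_costs[k] * w); P1 and P2 are unchanged, and a null weights pointer makes every kernel fall back to w = 1.0, which reproduces the previous behaviour. Below is a scalar sketch of the weighted recurrence along one path direction; the CUDA kernels evaluate the same update per disparity block with warp shuffles, and the function itself is illustrative, not part of libsgm:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar illustration of the weighted SGM cost update along one path.
// prev[d]  : aggregated costs L(p - r, d) from the previous pixel on the path
// local[d] : census (Hamming) matching costs C(p, d) at the current pixel
// w        : per-pixel weight in [0, 1] (weights[x] / 255 in the kernels)
static std::vector<uint32_t> sgm_update(const std::vector<uint32_t> &prev,
                                        const std::vector<uint32_t> &local,
                                        uint32_t p1, uint32_t p2, float w) {
	const size_t D = prev.size();
	const uint32_t last_min = *std::min_element(prev.begin(), prev.end());
	std::vector<uint32_t> cur(D);
	for (size_t d = 0; d < D; ++d) {
		uint32_t out = std::min(prev[d] - last_min, p2);
		if (d > 0)     out = std::min(out, prev[d - 1] - last_min + p1);
		if (d + 1 < D) out = std::min(out, prev[d + 1] - last_min + p1);
		// Only the data term is weighted; the smoothness penalties P1/P2 are not.
		cur[d] = out + static_cast<uint32_t>(std::round(local[d] * w));
	}
	return cur;
}
```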
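At the API level, both StereoSGM::execute() overloads gain the pair const uint8_t *weights, int weights_pitch immediately after the P2 pointer. A hedged usage sketch, assuming buffers prepared as in FixstarsSGM::apply() above:

```cpp
// Weighted execution (use_variance == true in FixstarsSGM::apply()):
ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data,
               (uint8_t*)weights_.data, (int)weights_.step1(), stream);

// Weighting disabled: a null pointer and zero pitch restore the previous behaviour.
ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream);
```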