diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index 176dc8d10bf23ba91ebc3cf1976b7e2e08a96766..5604f134c3918522f990a52e4aa5115e3df99099 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -62,7 +62,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 	uniqueness_ = cfg->value("uniqueness", 0.95f);
 	P1_ = cfg->value("P1", 10);
 	P2_ = cfg->value("P2", 120);
-	max_disp_ = cfg->value("max_disp", 256);
+	max_disp_ = cfg->value("num_disp", 256);
 
 	if (uniqueness_ < 0.0 || uniqueness_ > 1.0) {
 		uniqueness_ = 1.0;
@@ -79,7 +79,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 		LOG(WARNING) << "Invalid value for P2, using value of P1 (" << P1_ << ")";
 	}
 
-	if (!(max_disp_ == 256 || max_disp_ == 128)) {
+	if (!(max_disp_ == 256 || max_disp_ == 128 || max_disp_ == 192)) {
 		max_disp_ = 256;
 		LOG(WARNING) << "Invalid value for max_disp, using default value (256)";
 	}
@@ -216,9 +216,9 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		weightsF_.convertTo(weights_, CV_8UC1, 255.0f, cvstream);
 
 		//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
-		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream);
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), config()->value("min_disp", 60), stream);
 	} else {
-		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream);
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, config()->value("min_disp", 60), stream);
 	}
 
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
@@ -228,7 +228,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		ftl::cuda::merge_disparities(disp_int_, disp, stream);
 	}
 
-	cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
+	cv::cuda::threshold(disp_int_, disp, 16383.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
 
 	if (config()->value("check_reprojection", false)) {
 		ftl::cuda::check_reprojection(disp, in.getTexture<uchar4>(Channel::Colour),
diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h
index 279b3e5af2557fc2cbb93e660c96edfca6c1a006..bf2c58ea759181d3294d9cb717fd7e65b2b6d60a 100644
--- a/lib/libsgm/include/libsgm.h
+++ b/lib/libsgm/include/libsgm.h
@@ -114,13 +114,13 @@ namespace sgm {
 	 * The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
 	 * Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
 	 */
-	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
+	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream);
 
 	/**
 	 * Same as execute(left_pixels, right_pixels, dst) with image size parameters.
 	 * Dimensions must be smaller or equal to dimensions provided in constructor.
 	 */
-	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
+	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream);
 
 	/**
 	 * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu
index fb7282250f95a23d91a33ebf2e5bc38262d3f715..dde4f5f290e86db6c73a4ec03e3e44b4eee566d9 100644
--- a/lib/libsgm/src/check_consistency.cu
+++ b/lib/libsgm/src/check_consistency.cu
@@ -19,7 +19,7 @@ limitations under the License.
 namespace {
 
 	template<typename SRC_T, typename DST_T>
-	__global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel) {
+	__global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel, int min_disp) {
 
 		const int j = blockIdx.x * blockDim.x + threadIdx.x;
 		const int i = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,6 +28,7 @@ namespace {
 
 		uint8_t mask = d_mask[i * src_pitch + j];
 		int d = d_leftDisp[i * dst_pitch + j];
+		int dout = d + (min_disp << sgm::StereoSGM::SUBPIXEL_SHIFT);
 		if (subpixel) {
 			d >>= sgm::StereoSGM::SUBPIXEL_SHIFT;
 		}
@@ -36,38 +37,39 @@ namespace {
 			int diff = abs(d_rightDisp[i * dst_pitch + k] - d);
 			if (mask != 0 || diff > 1) {
 				// masked or left-right inconsistent pixel -> invalid
-				d_leftDisp[i * dst_pitch + j] = (256 << (sgm::StereoSGM::SUBPIXEL_SHIFT+1));
+				dout = (1024 << (sgm::StereoSGM::SUBPIXEL_SHIFT));
 			}
 		}
+		d_leftDisp[i * dst_pitch + j] = dout;
 	}
 }
 
 namespace sgm {
 	namespace details {
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			CudaKernelCheck();
 		}
 
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			CudaKernelCheck();
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index 5eba5372c3d67e9396aaf9569c8e7240c0c98759..8047354b4a5ac07d7a6a2087caa876bd420b5422 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -39,7 +39,8 @@ __global__ void aggregate_horizontal_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -87,7 +88,7 @@ __global__ void aggregate_horizontal_path_kernel(
 	}else{
 		for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
-				const int x = static_cast<int>(width - (j + dp_offset));
+				const int x = static_cast<int>(width - (j + dp_offset)) - min_disp;
 				if(0 <= x && x < static_cast<int>(width)){
 					right_buffer[i][j] = __ldg(&right[i * feature_step + x]);
 				}else{
@@ -120,9 +121,9 @@ __global__ void aggregate_horizontal_path_kernel(
 #else
 				right_buffer[j][0] = __shfl_up(t, 1, SUBGROUP_SIZE);
 #endif
-				if(lane_id == 0){
+				if(lane_id == 0 && x >= min_disp + dp_offset){
 					right_buffer[j][0] =
-						__ldg(&right[j * feature_step + x - dp_offset]);
+						__ldg(&right[j * feature_step + x - min_disp - dp_offset]);
 				}
 			}else{
 				const feature_type t = right_buffer[j][0];
@@ -136,9 +137,9 @@ __global__ void aggregate_horizontal_path_kernel(
 				right_buffer[j][DP_BLOCK_SIZE - 1] = __shfl_down(t, 1, SUBGROUP_SIZE);
 #endif
 				if(lane_id + 1 == SUBGROUP_SIZE){
-					if(x >= dp_offset + DP_BLOCK_SIZE - 1){
+					if(x >= min_disp + dp_offset + DP_BLOCK_SIZE - 1){
 						right_buffer[j][DP_BLOCK_SIZE - 1] =
-							__ldg(&right[j * feature_step + x - (dp_offset + DP_BLOCK_SIZE - 1)]);
+							__ldg(&right[j * feature_step + x - min_disp - (dp_offset + DP_BLOCK_SIZE - 1)]);
 					}else{
 						right_buffer[j][DP_BLOCK_SIZE - 1] = 0;
 					}
@@ -171,6 +172,7 @@ void enqueue_aggregate_left2right_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -180,7 +182,7 @@ void enqueue_aggregate_left2right_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -195,6 +197,7 @@ void enqueue_aggregate_right2left_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -204,7 +207,7 @@ void enqueue_aggregate_right2left_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -219,6 +222,7 @@ template void enqueue_aggregate_left2right_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<128u>(
@@ -232,6 +236,21 @@ template void enqueue_aggregate_left2right_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_left2right_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<256u>(
@@ -245,6 +264,7 @@ template void enqueue_aggregate_left2right_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<64u>(
@@ -258,6 +278,7 @@ template void enqueue_aggregate_right2left_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<128u>(
@@ -271,6 +292,21 @@ template void enqueue_aggregate_right2left_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_right2left_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<256u>(
@@ -284,6 +320,7 @@ template void enqueue_aggregate_right2left_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp
index 1b7a7e088f5d4dac793694ba7f4a0a876de2538b..4626dd68f91909b2aab893fea906df5586acef72 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.hpp
+++ b/lib/libsgm/src/horizontal_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_left2right_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_right2left_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h
index 7d836f5d8d5e60a0cb416ddb211b5abcfad0439e..c9c9ff259288c9b78a5b07595ce213ffa58bdada 100644
--- a/lib/libsgm/src/internal.h
+++ b/lib/libsgm/src/internal.h
@@ -36,8 +36,8 @@ namespace sgm {
 		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
 		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream);
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream);
 
 		void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements);
 		void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements);
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
index 97d9b1493c32e54a7bb1195f9a9261243b801444..3734ebae8922a370ea51165c431bc6396b30f51b 100644
--- a/lib/libsgm/src/oblique_path_aggregation.cu
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -35,7 +35,8 @@ __global__ void aggregate_oblique_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -77,7 +78,7 @@ __global__ void aggregate_oblique_path_kernel(
 	for(unsigned int iter = 0; iter < height; ++iter){
 		const int y = static_cast<int>(Y_DIRECTION > 0 ? iter : height - 1 - iter);
 		const int x = x0 + static_cast<int>(iter) * X_DIRECTION;
-		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION;
+		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION - min_disp;
 		// Load right to smem
 		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
 			const unsigned int i = i0 + threadIdx.x;
@@ -129,6 +130,7 @@ void enqueue_aggregate_upleft2downright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -137,7 +139,7 @@ void enqueue_aggregate_upleft2downright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -152,6 +154,7 @@ void enqueue_aggregate_upright2downleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -160,7 +163,7 @@ void enqueue_aggregate_upright2downleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -175,6 +178,7 @@ void enqueue_aggregate_downright2upleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -183,7 +187,7 @@ void enqueue_aggregate_downright2upleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -198,6 +202,7 @@ void enqueue_aggregate_downleft2upright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -206,7 +211,7 @@ void enqueue_aggregate_downleft2upright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -221,6 +226,7 @@ template void enqueue_aggregate_upleft2downright_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<128u>(
@@ -234,6 +240,21 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upleft2downright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<256u>(
@@ -247,6 +268,7 @@ template void enqueue_aggregate_upleft2downright_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<64u>(
@@ -260,6 +282,7 @@ template void enqueue_aggregate_upright2downleft_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<128u>(
@@ -273,6 +296,21 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upright2downleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<256u>(
@@ -286,6 +324,7 @@ template void enqueue_aggregate_upright2downleft_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<64u>(
@@ -299,6 +338,7 @@ template void enqueue_aggregate_downright2upleft_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<128u>(
@@ -312,6 +352,21 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downright2upleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<256u>(
@@ -325,6 +380,7 @@ template void enqueue_aggregate_downright2upleft_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<64u>(
@@ -338,6 +394,7 @@ template void enqueue_aggregate_downleft2upright_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<128u>(
@@ -351,6 +408,21 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downleft2upright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<256u>(
@@ -364,6 +436,7 @@ template void enqueue_aggregate_downleft2upright_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp
index 6504f37a2715ef64d7698b00afb5f6b0614b26af..882afed2affc344ea35bfda20c619e7521738ab3 100644
--- a/lib/libsgm/src/oblique_path_aggregation.hpp
+++ b/lib/libsgm/src/oblique_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_upleft2downright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_upright2downleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -62,6 +64,7 @@ void enqueue_aggregate_downright2upleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -76,6 +79,7 @@ void enqueue_aggregate_downleft2upright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index e5019ec3a67ddb1113ce2a7f1a873983359e5fb4..5661d7bee452eb6d28486152c8dfa7bd75e5ecd1 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -53,6 +53,7 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
@@ -69,28 +70,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 
 	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 0 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[0]);
 	path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 1 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[1]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[1]);
 	path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 2 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[2]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[2]);
 	path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 3 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[3]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[3]);
 	path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 4 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[4]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[4]);
 	path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 5 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[5]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[5]);
 	path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 6 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[6]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[6]);
 	path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 7 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[7]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[7]);
 	for(unsigned int i = 0; i < NUM_PATHS; ++i){
 		cudaEventRecord(m_events[i], m_streams[i]);
 		cudaStreamWaitEvent(stream, m_events[i], 0);
@@ -100,6 +101,7 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 
 template class PathAggregation< 64>;
 template class PathAggregation<128>;
+template class PathAggregation<192>;
 template class PathAggregation<256>;
 
 }
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
index 7df96996da46711f7ddcb76dcbc44132ca9f194c..c8abf740df09162ea65a01bc5bf0f041d2f2c6e7 100644
--- a/lib/libsgm/src/path_aggregation.hpp
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -51,6 +51,7 @@ public:
 		int p2_pitch,
 		const uint8_t* w,
 		int w_pitch,
+		int min_disp,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp
index c897684e7003512ccd0c9e17d7a084a39319766b..5211c98103c26a27af51fbd91111043e16858d23 100644
--- a/lib/libsgm/src/path_aggregation_common.hpp
+++ b/lib/libsgm/src/path_aggregation_common.hpp
@@ -31,9 +31,9 @@ struct DynamicProgramming {
 	static_assert(
 		DP_BLOCK_SIZE >= 2,
 		"DP_BLOCK_SIZE must be greater than or equal to 2");
-	static_assert(
+	/*static_assert(
 		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
-		"SUBGROUP_SIZE must be a power of 2");
+		"SUBGROUP_SIZE must be a power of 2");*/
 
 	uint32_t last_min;
 	uint32_t dp[DP_BLOCK_SIZE];
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
index 2b350da9562962aa9e10b4554bb6b6c4517e1023..eb5d4179e0a11b964e8238bbdcfac40598917396 100644
--- a/lib/libsgm/src/sgm.cu
+++ b/lib/libsgm/src/sgm.cu
@@ -57,6 +57,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream)
 	{
 		m_census_left.enqueue(
@@ -70,11 +71,12 @@ public:
 			penalty1, penalty2, src_pitch, // bug?
 			weights, weights_pitch,
+			min_disp,
 			stream);
 		m_winner_takes_all.enqueue(
 			dest_left, dest_right,
 			m_path_aggregation.get_output(),
-			width, height, dst_pitch, uniqueness, subpixel,
+			width, height, dst_pitch, uniqueness, subpixel, min_disp,
 			stream);
 	}
 
@@ -106,6 +108,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	m_impl->enqueue(
@@ -116,6 +119,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 		penalty1, penalty2,
 		weights, weights_pitch,
 		uniqueness, subpixel,
+		min_disp,
 		stream);
 	//cudaStreamSynchronize(0);
 }
@@ -136,6 +140,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	m_impl->enqueue(
@@ -146,12 +151,14 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 		penalty1, penalty2,
 		weights, weights_pitch,
 		uniqueness, subpixel,
+		min_disp,
 		stream);
 }
 
 
 template class SemiGlobalMatching<uint8_t, 64>;
 template class SemiGlobalMatching<uint8_t, 128>;
+template class SemiGlobalMatching<uint8_t, 192>;
 template class SemiGlobalMatching<uint8_t, 256>;
 template class SemiGlobalMatching<uint16_t, 64>;
 template class SemiGlobalMatching<uint16_t, 128>;
diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp
index f0c3c80ee107b95890d6dc8578c72768a70abdd4..9aa2cd387782465c3398aab5cda6a8b6a292018d 100644
--- a/lib/libsgm/src/sgm.hpp
+++ b/lib/libsgm/src/sgm.hpp
@@ -53,6 +53,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 	void enqueue(
@@ -70,6 +71,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
index a07f2daea087e38f4102c0cb92bf7d1664655c3a..70e1263147fbe3ab4eefb948d591e8506a3be40b 100644
--- a/lib/libsgm/src/stereo_sgm.cpp
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -29,7 +29,7 @@ namespace sgm {
 	public:
 		using output_type = sgm::output_type;
 		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, int min_disp, cudaStream_t stream) = 0;
 
 		virtual ~SemiGlobalMatchingBase() {}
 	};
@@ -38,9 +38,9 @@ namespace sgm {
 	class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
 	public:
 		void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) override
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, int min_disp, cudaStream_t stream) override
 		{
-			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, stream);
+			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, min_disp, stream);
 		}
 	private:
 		SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
@@ -63,6 +63,8 @@ namespace sgm {
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
 		else if (input_depth_bits_ == 8 && disparity_size_ == 128)
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
+		else if (input_depth_bits_ == 8 && disparity_size_ == 192)
+			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 192>();
 		else if (input_depth_bits_ == 8 && disparity_size_ == 256)
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 256>();
 		else if (input_depth_bits_ == 16 && disparity_size_ == 64)
@@ -133,9 +135,9 @@ namespace sgm {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
 			throw std::logic_error("depth bits must be 8 or 16");
 		}
-		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 256) {
+		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 192 && disparity_size_ != 256) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
-			throw std::logic_error("disparity size must be 64, 128 or 256");
+			throw std::logic_error("disparity size must be 64, 128, 192 or 256");
 		}
 		if (param.subpixel && output_depth_bits != 16) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
@@ -150,7 +152,7 @@ namespace sgm {
 	}
 
 	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch,
-		const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
+		const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream) {
 
 		const void *d_input_left, *d_input_right;
 
@@ -174,11 +176,11 @@ namespace sgm {
 			d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer
 
 		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
-			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, stream);
+			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, min_disp, stream);
 
 		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream);
 		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream);
-		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, stream);
+		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, min_disp, stream);
 
 		if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
 			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height);
@@ -198,8 +200,8 @@ namespace sgm {
 		}
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
-		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, stream);
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream) {
+		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, min_disp, stream);
 	}
 
 	bool StereoSGM::updateParameters(const Parameters &params) {
diff --git a/lib/libsgm/src/utility.hpp b/lib/libsgm/src/utility.hpp
index 17b9f45f5375e1756f581a2d7fd3e72e5f588d60..bbd5dc328a878c241c9ceb4b3a57ff3162239794 100644
--- a/lib/libsgm/src/utility.hpp
+++ b/lib/libsgm/src/utility.hpp
@@ -189,6 +189,14 @@ __device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *pt
 	dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w;
 }
 
+template <>
+__device__ inline void load_uint16_vector<6u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint32x3 = load_as<uint3>(ptr);
+	load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x3.x));
+	load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x3.y));
+	load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x3.z));
+}
+
 template <>
 __device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr){
 	const auto uint32x4 = load_as<uint4>(ptr);
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
index 6fee96892cf038a0583c576688e8456e0c3ec7d7..7f705646039d5a223ca967ecebdcdd70bb6b9667 100644
--- a/lib/libsgm/src/vertical_path_aggregation.cu
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -35,7 +35,8 @@ __global__ void aggregate_vertical_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -81,7 +82,7 @@ __global__ void aggregate_vertical_path_kernel(
 		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
 			const unsigned int i = i0 + threadIdx.x;
 			if(i < RIGHT_BUFFER_SIZE){
-				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i);
+				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i) - min_disp;
 				feature_type right_value = 0;
 				if(0 <= x && x < static_cast<int>(width)){
 					right_value = right[x + y * width];
@@ -126,6 +127,7 @@ void enqueue_aggregate_up2down_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -134,7 +136,7 @@ void enqueue_aggregate_up2down_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -149,6 +151,7 @@ void enqueue_aggregate_down2up_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -157,7 +160,7 @@ void enqueue_aggregate_down2up_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -172,6 +175,7 @@ template void enqueue_aggregate_up2down_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<128u>(
@@ -185,6 +189,7 @@ template void enqueue_aggregate_up2down_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<256u>(
@@ -198,6 +203,21 @@ template void enqueue_aggregate_up2down_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_up2down_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<64u>(
@@ -211,6 +231,7 @@ template void enqueue_aggregate_down2up_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<128u>(
@@ -224,6 +245,21 @@ template void enqueue_aggregate_down2up_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_down2up_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<256u>(
@@ -237,6 +273,7 @@ template void enqueue_aggregate_down2up_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp
index 434dd9231614b821ae1dfcbb439e648a9dd10b9a..18240cf264862e8d7417200c51d1e8584ec49d43 100644
--- a/lib/libsgm/src/vertical_path_aggregation.hpp
+++ b/lib/libsgm/src/vertical_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_up2down_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_down2up_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/winner_takes_all.cu b/lib/libsgm/src/winner_takes_all.cu
index f33d3005289a5041d03160af960032e221055e59..ea53af6c330d18bb203bfa94ce6a59e3a45c7159 100644
--- a/lib/libsgm/src/winner_takes_all.cu
+++ b/lib/libsgm/src/winner_takes_all.cu
@@ -47,17 +47,17 @@ __device__ int unpack_index(uint32_t packed){
 	return packed & 0xffffu;
 }
 
-using ComputeDisparity = uint32_t(*)(uint32_t, uint32_t, uint16_t*);
+using ComputeDisparity = uint32_t(*)(uint32_t, int, uint32_t, uint16_t*);
 
-__device__ inline uint32_t compute_disparity_normal(uint32_t disp, uint32_t cost = 0, uint16_t* smem = nullptr)
+__device__ inline uint32_t compute_disparity_normal(uint32_t disp, int min_disp, uint32_t cost = 0, uint16_t* smem = nullptr)
 {
-	return disp;
+	return disp; // + min_disp;
 }
 
 template <size_t MAX_DISPARITY>
-__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, uint32_t cost, uint16_t* smem)
+__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, int min_disp, uint32_t cost, uint16_t* smem)
 {
-	int subp = disp;
+	int subp = disp; // + min_disp;
 	subp <<= sgm::StereoSGM::SUBPIXEL_SHIFT;
 	if (disp > 0 && disp < MAX_DISPARITY - 1) {
 		const int left = smem[disp - 1];
@@ -78,7 +78,8 @@ __global__ void winner_takes_all_kernel(
 	int width,
 	int height,
 	int pitch,
-	float uniqueness)
+	float uniqueness,
+	int min_disp)
 {
 	static const unsigned int ACCUMULATION_PER_THREAD = 16u;
 	static const unsigned int REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE;
@@ -178,7 +179,7 @@ __global__ void winner_takes_all_kernel(
 			right_best[i] = min(right_best[i], recv);
 			if(d == MAX_DISPARITY - 1){
 				if(0 <= p){
-					right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+					right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]), min_disp);
 				}
 				right_best[i] = 0xffffffffu;
 			}
@@ -195,7 +196,7 @@ __global__ void winner_takes_all_kernel(
 		}
 		uniq = subgroup_and<WARP_SIZE>(uniq, 0xffffffffu);
 		if(lane_id == 0){
-			left_dest[x] = uniq ? compute_disparity(bestDisp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0;
+			left_dest[x] = uniq ? compute_disparity(bestDisp, min_disp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0;
 		}
 	}
 }
@@ -204,7 +205,7 @@ __global__ void winner_takes_all_kernel(
 		const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
 		const int p = static_cast<int>(((width - k) & ~(MAX_DISPARITY - 1)) + k);
 		if(p < width){
-			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]), min_disp);
 		}
 	}
 }
@@ -219,6 +220,7 @@ void enqueue_winner_takes_all(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	const int gdim =
@@ -226,10 +228,10 @@ void enqueue_winner_takes_all(
 	const int bdim = BLOCK_SIZE;
 	if (subpixel) {
 		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim, 0, stream>>>(
-			left_dest, right_dest, src, width, height, pitch, uniqueness);
+			left_dest, right_dest, src, width, height, pitch, uniqueness, min_disp);
 	} else {
 		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_normal><<<gdim, bdim, 0, stream>>>(
-			left_dest, right_dest, src, width, height, pitch, uniqueness);
+			left_dest, right_dest, src, width, height, pitch, uniqueness, min_disp);
 	}
 }
 
@@ -250,6 +252,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	if(m_left_buffer.size() < static_cast<size_t>(pitch * height)){
@@ -267,6 +270,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 		pitch,
 		uniqueness,
 		subpixel,
+		min_disp,
 		stream);
 }
 
@@ -280,6 +284,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	enqueue_winner_takes_all<MAX_DISPARITY>(
@@ -291,12 +296,14 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 		pitch,
 		uniqueness,
 		subpixel,
+		min_disp,
 		stream);
 }
 
 
 template class WinnerTakesAll< 64>;
 template class WinnerTakesAll<128>;
+template class WinnerTakesAll<192>;
 template class WinnerTakesAll<256>;
 
 }
diff --git a/lib/libsgm/src/winner_takes_all.hpp b/lib/libsgm/src/winner_takes_all.hpp
index 3dbae82735581c6b1051fd973efd4346e24c2543..f2bb59765f031322c9bdbcfa833dc170edea5653 100644
--- a/lib/libsgm/src/winner_takes_all.hpp
+++ b/lib/libsgm/src/winner_takes_all.hpp
@@ -47,6 +47,7 @@ public:
 		int pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 	void enqueue(
@@ -58,6 +59,7 @@ public:
 		int pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 };
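---
Usage note (not part of the patch): a minimal caller-side sketch of the updated libsgm API. The new min_disp argument is passed immediately before the CUDA stream in both StereoSGM::execute() overloads; the FTL operator reads it from the "min_disp" config key (default 60), while "num_disp" now selects the disparity range (128, 192 or 256 in FixstarsSGM). The buffer names below are illustrative only and assume the same setup as the existing FixstarsSGM::apply().

    // Sketch only: `ssgm`, `left_gray`, `right_gray`, `disp16`, `p2_map` and
    // `weights` stand in for whatever device buffers the caller already owns.
    const int min_disp = config()->value("min_disp", 60);  // config key introduced by this patch

    // Weighted variant, mirroring the first branch in FixstarsSGM::apply().
    ssgm->execute(left_gray.data, right_gray.data, disp16.data,
                  p2_map.data, (uint8_t*)weights.data, weights.step1(),
                  min_disp, stream);

    // Weight-less variant, mirroring the else branch.
    ssgm->execute(left_gray.data, right_gray.data, disp16.data,
                  p2_map.data, nullptr, 0, min_disp, stream);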
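Range note (not part of the patch), assuming sgm::StereoSGM::SUBPIXEL_SHIFT == 4 as in upstream libSGM: check_consistency now folds the minimum disparity into the left output and writes a larger invalid-pixel marker, which is why the FTL-side threshold moves from 4096 to 16383.

    // Illustrative arithmetic only; SUBPIXEL_SHIFT == 4 is an assumption here.
    constexpr int SUBPIXEL_SHIFT = 4;
    constexpr int old_sentinel = 256 << (SUBPIXEL_SHIFT + 1);  // 8192, zeroed by the old 4096 cut-off
    constexpr int new_sentinel = 1024 << SUBPIXEL_SHIFT;       // 16384, zeroed by the new 16383 cut-off
    // With dout = d + (min_disp << SUBPIXEL_SHIFT), a valid subpixel disparity can exceed 4096,
    // e.g. (255 + 60) << 4 == 5040, so the old THRESH_TOZERO_INV threshold would have discarded it.
    static_assert(new_sentinel > ((255 + 60) << 4) + 15, "sentinel stays above any valid value");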