From 06f6155b1f51f111ebac6922f44edb0f3df2d36d Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Sun, 30 Aug 2020 18:46:34 +0300
Subject: [PATCH] Attempt to allow 192 disparities

---
 .../operators/src/disparity/fixstars_sgm.cpp  |  4 +-
 lib/libsgm/src/horizontal_path_aggregation.cu | 28 ++++++++++
 lib/libsgm/src/oblique_path_aggregation.cu    | 56 +++++++++++++++++++
 lib/libsgm/src/path_aggregation.cu            |  1 +
 lib/libsgm/src/path_aggregation_common.hpp    |  4 +-
 lib/libsgm/src/sgm.cu                         |  1 +
 lib/libsgm/src/stereo_sgm.cpp                 |  6 +-
 lib/libsgm/src/utility.hpp                    |  8 +++
 lib/libsgm/src/vertical_path_aggregation.cu   | 28 ++++++++++
 lib/libsgm/src/winner_takes_all.cu            |  1 +
 10 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index 453b0b0fe..9a904f9c7 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -62,7 +62,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 	uniqueness_ = cfg->value("uniqueness", 0.95f);
 	P1_ = cfg->value("P1", 10);
 	P2_ = cfg->value("P2", 120);
-	max_disp_ = cfg->value("max_disp", 128);
+	max_disp_ = cfg->value("max_disp", 192);
 
 	if (uniqueness_ < 0.0 || uniqueness_ > 1.0) {
 		uniqueness_ = 1.0;
@@ -79,7 +79,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 		LOG(WARNING) << "Invalid value for P2, using value of P1 (" << P1_ << ")";
 	}
 
-	if (!(max_disp_ == 256 || max_disp_ == 128)) {
+	if (!(max_disp_ == 256 || max_disp_ == 128 || max_disp_ == 192)) {
 		max_disp_ = 256;
 		LOG(WARNING) << "Invalid value for max_disp, using default value (256)";
 	}
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index 11a94bc07..8047354b4 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -239,6 +239,20 @@ template void enqueue_aggregate_left2right_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_left2right_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_left2right_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -281,6 +295,20 @@ template void enqueue_aggregate_right2left_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_right2left_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_right2left_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
index 64dd9f947..3734ebae8 100644
--- a/lib/libsgm/src/oblique_path_aggregation.cu
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -243,6 +243,20 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_upleft2downright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_upleft2downright_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -285,6 +299,20 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_upright2downleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_upright2downleft_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -327,6 +355,20 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_downright2upleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_downright2upleft_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -369,6 +411,20 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_downleft2upright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_downleft2upright_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index ed7cdbee7..5661d7bee 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -101,6 +101,7 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 
 template class PathAggregation< 64>;
 template class PathAggregation<128>;
+template class PathAggregation<192>;
 template class PathAggregation<256>;
 
 }
diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp
index c897684e7..5211c9810 100644
--- a/lib/libsgm/src/path_aggregation_common.hpp
+++ b/lib/libsgm/src/path_aggregation_common.hpp
@@ -31,9 +31,9 @@ struct DynamicProgramming {
 	static_assert(
 		DP_BLOCK_SIZE >= 2,
 		"DP_BLOCK_SIZE must be greater than or equal to 2");
-	static_assert(
+	/*static_assert(
 		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
-		"SUBGROUP_SIZE must be a power of 2");
+		"SUBGROUP_SIZE must be a power of 2");*/
 
 	uint32_t last_min;
 	uint32_t dp[DP_BLOCK_SIZE];
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
index b13d13e07..eb5d4179e 100644
--- a/lib/libsgm/src/sgm.cu
+++ b/lib/libsgm/src/sgm.cu
@@ -158,6 +158,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 
 template class SemiGlobalMatching<uint8_t,   64>;
 template class SemiGlobalMatching<uint8_t,  128>;
+template class SemiGlobalMatching<uint8_t,  192>;
 template class SemiGlobalMatching<uint8_t,  256>;
 template class SemiGlobalMatching<uint16_t,  64>;
 template class SemiGlobalMatching<uint16_t, 128>;
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
index 58ba52284..70e126314 100644
--- a/lib/libsgm/src/stereo_sgm.cpp
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -63,6 +63,8 @@ namespace sgm {
 				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
 			else if (input_depth_bits_ == 8 && disparity_size_ == 128)
 				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
+			else if (input_depth_bits_ == 8 && disparity_size_ == 192)
+				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 192>();
 			else if (input_depth_bits_ == 8 && disparity_size_ == 256)
 				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 256>();
 			else if (input_depth_bits_ == 16 && disparity_size_ == 64)
@@ -133,9 +135,9 @@ namespace sgm {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
 			throw std::logic_error("depth bits must be 8 or 16");
 		}
-		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 256) {
+		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 192 && disparity_size_ != 256) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
-			throw std::logic_error("disparity size must be 64, 128 or 256");
+			throw std::logic_error("disparity size must be 64, 128, 192 or 256");
 		}
 		if (param.subpixel && output_depth_bits != 16) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
diff --git a/lib/libsgm/src/utility.hpp b/lib/libsgm/src/utility.hpp
index 17b9f45f5..bbd5dc328 100644
--- a/lib/libsgm/src/utility.hpp
+++ b/lib/libsgm/src/utility.hpp
@@ -189,6 +189,14 @@ __device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *pt
 	dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w;
 }
 
+template <>
+__device__ inline void load_uint16_vector<6u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint32x3 = load_as<uint3>(ptr);
+	load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x3.x));
+	load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x3.y));
+	load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x3.z));
+}
+
 template <>
 __device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr){
 	const auto uint32x4 = load_as<uint4>(ptr);
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
index 9bf529774..7f7056460 100644
--- a/lib/libsgm/src/vertical_path_aggregation.cu
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -206,6 +206,20 @@ template void enqueue_aggregate_up2down_path<256u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_up2down_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_down2up_path<64u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -234,6 +248,20 @@ template void enqueue_aggregate_down2up_path<128u>(
 	int min_disp,
 	cudaStream_t stream);
 
+template void enqueue_aggregate_down2up_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
 template void enqueue_aggregate_down2up_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
diff --git a/lib/libsgm/src/winner_takes_all.cu b/lib/libsgm/src/winner_takes_all.cu
index 1e82fcb65..ea53af6c3 100644
--- a/lib/libsgm/src/winner_takes_all.cu
+++ b/lib/libsgm/src/winner_takes_all.cu
@@ -303,6 +303,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 
 template class WinnerTakesAll< 64>;
 template class WinnerTakesAll<128>;
+template class WinnerTakesAll<192>;
 template class WinnerTakesAll<256>;
 
 }
-- 
GitLab